In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing # One-hot-Encoder y LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
#import opendatasets as od # Download of kaggle od.download(url)
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Notebook-wide plot styling: seaborn dark grid first, then matplotlib overrides
# for a small font, compact figures and a fully transparent figure background.
sns.set_style('darkgrid')
plt.rcParams.update({
    'font.size': 8,
    'figure.figsize': (4, 3),
    'figure.facecolor': '#00000000',
})

In [None]:
# Load the weather observations from a CSV path relative to the working directory
# and preview the first rows.
data = pd.read_csv("./dataset/weatherAUS.csv")
data.head()

In [None]:
data.info()

In [None]:
# Discard every row where RainToday or RainTomorrow is missing, then
# re-inspect the frame to confirm the new row count.
data = data.dropna(subset=["RainToday", "RainTomorrow"])
data.info()

In [None]:
# Row counts per Location, coloured by the RainToday value.
px.histogram(data, x="Location", title="Location vs Rainy Days", color="RainToday")

In [None]:
# Distribution of Temp3pm, coloured by the RainTomorrow value.
px.histogram(data, x="Temp3pm", title="Temperature at 3pm vs Rain Tomorrow", color="RainTomorrow")

In [None]:
# Counts of each RainTomorrow value, coloured by the RainToday value.
px.histogram(data, x="RainTomorrow", color="RainToday", title="Rain Tomorrow vs Rain Today")

In [None]:
# Scatter a 2,000-row random sample instead of every row. random_state pins the
# sample so the figure is identical after Restart Kernel -> Run All.
px.scatter(data.sample(2000, random_state=42), title="Min temp vs Max Temp", x="MinTemp", y="MaxTemp", color="RainToday")

In [None]:
# Scatter a 2,000-row random sample instead of every row. random_state pins the
# sample so the figure is identical after Restart Kernel -> Run All.
px.scatter(data.sample(2000, random_state=42), title="Temp 3pm vs Humidity", x="Temp3pm", y="Humidity3pm", color="RainTomorrow")

In [None]:
# Random split: 20% held out for test, then 25% of the remainder for validation
# (i.e. roughly 60/20/20 overall), both with a fixed seed.
# NOTE(review): train_df, val_df and test_df are reassigned by the year-based
# split in the following cell, so this random split does not feed the model.
train_val_df, test_df = train_test_split(data, random_state=42, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, random_state=42, test_size=0.25)

for split_name, split_df in (('train_df', train_df), ('val_df', val_df), ('test_df', test_df)):
    print(split_name + '.shape :', split_df.shape)

In [None]:
# Chronological split on the calendar year parsed from the Date column:
# before 2015 -> training, 2015 -> validation, after 2015 -> test.
year = pd.to_datetime(data["Date"]).dt.year

train_df = data.loc[year < 2015]
val_df = data.loc[year == 2015]
test_df = data.loc[year > 2015]

for split_label, split_frame in (("train", train_df), ("val", val_df), ("test", test_df)):
    print(split_label + " shape: ", split_frame.shape)

In [None]:
# Bar chart of row counts per year; `year` (computed in the split cell) is used
# for both the x axis and the hue, so each year gets its own colour.
rows_per_year_ax = sns.countplot(x=year, hue=year)
rows_per_year_ax.set_title("No of Rows per Year");

In [None]:
train_df.head()

In [None]:
# Choose the model inputs and the target.
# Inputs are all columns except the first and the last one
# (presumably Date and RainTomorrow respectively -- verify against the CSV header).
target_cols = "RainTomorrow"
input_cols = train_df.columns[1:-1].tolist()

In [None]:
# Sanity-check which columns were selected as inputs and which as the target.
print("input cols: ", input_cols)
print("target cols: ", target_cols)

In [None]:
# Separate features from the target for each split. Copies are taken so the
# preprocessing cells below can assign into these frames without touching `data`.
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_cols].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_cols].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_cols].copy()

In [None]:
train_targets

In [None]:
# Partition the input columns by dtype: object columns are treated as
# categorical, numpy-numeric columns as numeric.
cat_cols = list(train_inputs.select_dtypes(include="object").columns)
num_cols = list(train_inputs.select_dtypes(include=np.number).columns)

In [None]:
train_inputs[num_cols].describe()

In [None]:
train_inputs[cat_cols].nunique()

## Imputing Missing Numeric Data

In [None]:
# Mean imputation: each missing numeric value is replaced by its column mean.
imputer = SimpleImputer(strategy="mean")

# Missing-value count per numeric column in the full dataset.
data[num_cols].isna().sum()

In [None]:
train_inputs[num_cols].isna().sum()

In [None]:
# Learn the column means from the training split only, so that no statistic
# computed from validation/test rows leaks into the preprocessing.
imputer.fit(train_inputs[num_cols])

In [None]:
list(imputer.statistics_) # fill value learned for each numeric column (the mean, since strategy="mean")

In [None]:
# Fill missing numeric values in every split with the fitted imputer.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[num_cols] = imputer.transform(split_inputs[num_cols])

In [None]:
# Re-check the training inputs after imputation; every count is expected to be zero.
train_inputs[num_cols].isna().sum()

In [None]:
# Scaling Numeric Features
scaler = MinMaxScaler()
# Fit the min/max on the training split only to avoid leaking validation/test
# ranges into preprocessing. Validation/test values may therefore land slightly
# outside [0, 1] after scaling, which is expected.
scaler.fit(train_inputs[num_cols])

In [None]:
list(scaler.data_min_) # minimum value seen for each numeric column during fit

In [None]:
# Apply the fitted scaler to the numeric columns of the train, validation and test inputs.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[num_cols] = scaler.transform(split_inputs[num_cols])

# Summary statistics of the scaled training features.
train_inputs[num_cols].describe()

## Encoding Categorical Data

In [None]:
data[cat_cols].nunique()

In [None]:
# One-hot encoder for the categorical columns, producing a dense array (sparse_output=False).
# handle_unknown="ignore": a category not seen during fit is encoded as all zeros instead of raising.
# NOTE(review): the encoder is fitted on the full `data` frame rather than the training split -- confirm this is intended.
encoder = preprocessing.OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoder.fit(data[cat_cols])

In [None]:
encoder.categories_

In [None]:
# Names of the one-hot output columns generated by the encoder for the categorical inputs.
encoded_cols = list(encoder.get_feature_names_out(cat_cols))
print(encoded_cols)

In [None]:
# Append the one-hot columns to every split; the original categorical columns are kept.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[encoded_cols] = encoder.transform(split_inputs[cat_cols])
#pd.set_option('display.max_columns', None) # shows a warning message

In [None]:
test_inputs.head()

In [None]:
# Report the shape of each split's inputs and targets as a consistency check.
for shape_label, shaped_obj in (
    ('train_inputs', train_inputs),
    ('train_targets', train_targets),
    ('val_inputs', val_inputs),
    ('val_targets', val_targets),
    ('test_inputs', test_inputs),
    ('test_targets', test_targets),
):
    print(shape_label + ':', shaped_obj.shape)

## Training a Logistic Regression Model

In [None]:
# Train a logistic regression (liblinear solver) on the scaled numeric columns
# plus the one-hot columns; fit() returns the estimator itself, so it can be bound directly.
model = LogisticRegression(solver="liblinear").fit(train_inputs[num_cols + encoded_cols], train_targets)
model

In [None]:
# Build the model-ready feature matrices (numeric + one-hot columns) for each
# split, used below for prediction and evaluation.
feature_cols = num_cols + encoded_cols
x_train, x_val, x_test = (
    split_inputs[feature_cols] for split_inputs in (train_inputs, val_inputs, test_inputs)
)

In [None]:
train_preds = model.predict(x_train)
train_preds

In [None]:
# Per-row class probabilities on the training features; one column per entry of model.classes_.
train_probs = model.predict_proba(x_train)
train_probs

In [None]:
# Class labels, in the same order as the columns returned by predict_proba.
model.classes_

In [None]:
accuracy_score(train_targets, train_preds)

In [None]:
confusion_matrix(train_targets, train_preds)

In [None]:
# normalize="true" divides each row by the number of samples of that true class, so every row sums to 1.
confusion_matrix(train_targets, train_preds, normalize="true")

In [None]:
def predict_and_plot(inputs, targets, name='', estimator=None):
    """Predict on `inputs`, print the accuracy and plot a row-normalized confusion matrix.

    Parameters
    ----------
    inputs : feature matrix accepted by the estimator's ``predict``.
    targets : true labels aligned with the rows of ``inputs``.
    name : str, prefix used in the plot title (e.g. ``'Validation'``).
    estimator : optional fitted classifier. When omitted, the notebook-level
        ``model`` is used, which keeps existing three-argument calls working.

    Returns
    -------
    The array of predicted labels.
    """
    clf = model if estimator is None else estimator
    preds = clf.predict(inputs)

    accuracy = accuracy_score(targets, preds)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

    # Pin the row/column order to the classifier's classes so the tick labels
    # always match the matrix, even if a class is absent from this split.
    class_labels = list(clf.classes_)
    cf = confusion_matrix(targets, preds, labels=class_labels, normalize='true')
    plt.figure()
    # Show class names on the axes instead of bare 0/1 indices.
    sns.heatmap(cf, annot=True, xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    plt.title('{} Confusion Matrix'.format(name))

    return preds

In [None]:
predict_and_plot(x_train, train_targets, 'Training')

In [None]:
predict_and_plot(x_val, val_targets, 'Validation')

In [None]:
predict_and_plot(x_test, test_targets, 'Testing')

In [None]:
# making predictions on a single input

# One hand-written observation keyed by column name.
# 'Sunshine' is deliberately NaN: it is filled by imputer.transform() below.
new_input = {'Date': '2021-06-19',
             'Location': 'Katherine',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

# Run the single observation through the same preprocessing as the training
# data: impute, then scale the numeric columns, then one-hot encode the categoricals.
new_input_df = pd.DataFrame([new_input])
for numeric_step in (imputer, scaler):
    new_input_df[num_cols] = numeric_step.transform(new_input_df[num_cols])
new_input_df[encoded_cols] = encoder.transform(new_input_df[cat_cols])

# Predicted label and the full per-class probability vector for the one row.
x_new_input = new_input_df[num_cols + encoded_cols]
prob = model.predict_proba(x_new_input)[0]
prediction = model.predict(x_new_input)[0]

print("Prediction is: ", prediction)
print("Probability is: ", prob)

In [None]:
def predict_input(single_input):
    """Predict the label for one observation and return it with its probability.

    `single_input` is a dict keyed by column name. It is wrapped in a one-row
    DataFrame, passed through the notebook's fitted imputer, scaler and
    encoder, and then scored with the notebook-level `model`.

    Returns a `(pred, prob)` tuple: the predicted class label and the
    probability the model assigns to that same class.
    """
    row = pd.DataFrame([single_input])
    # Numeric columns: impute first, then scale.
    for numeric_step in (imputer, scaler):
        row[num_cols] = numeric_step.transform(row[num_cols])
    row[encoded_cols] = encoder.transform(row[cat_cols])

    features = row[num_cols + encoded_cols]
    pred = model.predict(features)[0]
    # predict_proba columns follow model.classes_; pick the predicted class's column.
    class_position = list(model.classes_).index(pred)
    prob = model.predict_proba(features)[0][class_position]
    return pred, prob

predict_input(new_input)