In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn import preprocessing # One-hot-Encoder y LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
#import opendatasets as od # Download of kaggle od.download(url)
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Global plot styling: dark seaborn grid, small font, transparent figure background.
sns.set_style('darkgrid')
plt.rcParams.update({
    'font.size': 8,
    'figure.facecolor': '#00000000',  # 8-digit hex: the last two digits are the alpha channel
})

In [None]:
# Load the weather observations from the local dataset folder and preview them.
csv_path = './dataset/weatherAUS.csv'
data_df = pd.read_csv(csv_path)
data_df.head()


In [None]:
# Drop rows that have no RainTomorrow value (the prediction target).
# Rebinding instead of inplace=True avoids mutating the frame in place and
# gives the same result every time the cell is re-run.
data_df = data_df.dropna(subset=['RainTomorrow'])

In [None]:
# Preparing the Data for Training
# Count how many rows fall into each calendar year of the Date column.
obs_year = pd.to_datetime(data_df['Date']).dt.year
plt.title("N of Rows per year")
sns.countplot(x=obs_year);

In [None]:
# Chronological split: years before 2015 train, 2015 validates, later years test.
year = pd.to_datetime(data_df['Date']).dt.year

train_df = data_df.loc[year < 2015]
val_df = data_df.loc[year == 2015]
test_df = data_df.loc[year > 2015]

# Report the size of each split.
for split_name, split_df in (('train_df', train_df), ('val_df', val_df), ('test_df', test_df)):
    print(f'{split_name}.shape :', split_df.shape)

In [None]:
# Inputs are every column except the first and the last one; the target is RainTomorrow.
input_cols = train_df.columns[1:-1].tolist()
target_col = 'RainTomorrow'

# Copies keep later preprocessing from writing back into the split frames.
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_col].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_col].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_col].copy()

In [None]:
# Partition the input columns by dtype: numeric vs. object (categorical).
num_cols = list(train_inputs.select_dtypes(include=np.number).columns)
cat_cols = list(train_inputs.select_dtypes(include='object').columns)

In [None]:
# Imputing missing numeric values
# Fit the imputer on the training split only: learning the column means from the
# full dataset would leak validation/test information into preprocessing.
imputer = SimpleImputer(strategy="mean").fit(train_inputs[num_cols])

# Apply the training-set means to every split.
train_inputs[num_cols] = imputer.transform(train_inputs[num_cols])
val_inputs[num_cols] = imputer.transform(val_inputs[num_cols])
test_inputs[num_cols] = imputer.transform(test_inputs[num_cols])

# Sanity check: count of remaining missing values per numeric column in the test split.
test_inputs[num_cols].isna().sum()

In [None]:
# Scaling Numeric Features
# Fit the scaler on the training split only so the min/max used for scaling are
# not influenced by validation or test rows (avoids data leakage). Validation and
# test values may therefore fall slightly outside the fitted range.
scaler = MinMaxScaler().fit(train_inputs[num_cols])

train_inputs[num_cols] = scaler.transform(train_inputs[num_cols])
val_inputs[num_cols] = scaler.transform(val_inputs[num_cols])
test_inputs[num_cols] = scaler.transform(test_inputs[num_cols])

# Show the resulting min/max of the training columns.
train_inputs.describe().loc[['min', 'max']]

In [None]:
# Encoding Categorical Data
# Fit the one-hot encoder on the training split only (no leakage from the
# validation/test rows); handle_unknown="ignore" makes categories that were not
# seen during fitting encode as all zeros instead of raising.
encoder = preprocessing.OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(train_inputs[cat_cols])
encoded_cols = list(encoder.get_feature_names_out(cat_cols))

# Add the one-hot columns next to the original categorical columns in each split.
train_inputs[encoded_cols] = encoder.transform(train_inputs[cat_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[cat_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[cat_cols])

train_inputs.head()

In [None]:
# Model features: the numeric columns followed by the one-hot encoded columns.
feature_cols = num_cols + encoded_cols
x_train, x_val, x_test = (
    split[feature_cols] for split in (train_inputs, val_inputs, test_inputs)
)

x_train.head()

## Decision Trees

In [None]:
# Fit an unconstrained decision tree and predict on the training rows.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, train_targets)
train_preds = model.predict(x_train)
# The top-level pd.value_counts is deprecated in recent pandas; wrap the
# prediction array in a Series and use the method form to count each class.
pd.Series(train_preds).value_counts()

In [None]:
# Predicted class probabilities for every training row.
train_probs = model.predict_proba(x_train)
train_probs

In [None]:
# Accuracy of the training-set predictions against the training targets.
accuracy_score(train_targets, train_preds)

In [None]:
# Share of each class among the validation targets.
val_targets.value_counts()/len(val_targets) # original note: only 78.8% correct (presumably the majority-class share; TODO confirm)

In [None]:
# visualization
# Draw only the top levels of the fitted tree (max_depth=2) on a wide figure.
plt.figure(figsize=(80,20))
# Pass a plain list of names: some scikit-learn releases reject a pandas Index
# for feature_names, and this matches the export_text call used in this notebook.
plot_tree(model, feature_names=list(x_train.columns), max_depth=2, filled=True);

In [None]:
# Depth reached by the fitted tree.
model.tree_.max_depth

In [None]:
# Text rendering of the tree rules (export limited to max_depth=10);
# only the first 1000 characters are printed to keep the output short.
tree_text = export_text(model, max_depth=10, feature_names=list(x_train.columns))
print(tree_text[:1000])

In [None]:
# Importance score per feature, in the column order of x_train.
model.feature_importances_

In [None]:
# Pair each feature name with its importance and rank from most to least important.
importance_df = pd.DataFrame({
    'feature': x_train.columns,
    'importance': model.feature_importances_,
})
importance_df = importance_df.sort_values('importance', ascending=False)
importance_df.head(10)

In [None]:
# Bar chart of the ten highest-ranked features.
top_features = importance_df.head(10)
plt.title('Feature Importance')
sns.barplot(x='importance', y='feature', data=top_features);

In [None]:
# Hyperparameter Tuning and Overfitting
# Refit with the depth capped at 3 and report the score on the training data.
model = DecisionTreeClassifier(max_depth=3, random_state=42).fit(x_train, train_targets)
model.score(x_train, train_targets)  # score() is the accuracy

In [None]:
# Score of the current model on the validation split.
model.score(x_val, val_targets)

In [None]:
# Class labels known to the fitted model.
model.classes_

In [None]:
# Draw the full depth-limited tree with class labels on each node.
plt.figure(figsize=(80,20))
# feature_names / class_names are converted to plain lists: some scikit-learn
# releases reject a pandas Index or a NumPy array for these parameters.
plot_tree(model, feature_names=list(x_train.columns), filled=True, rounded=True, class_names=list(model.classes_));

In [None]:
# Print the rules of the current tree as text, labelled with the x_train column names.
print(export_text(model, feature_names=list(x_train.columns)))

In [None]:
# Let's experiment with different depths
def max_depth_error(md):
    """Fit a decision tree limited to depth ``md`` and report its error rates.

    Uses the notebook-level ``x_train``/``train_targets`` for fitting and
    ``x_val``/``val_targets`` for validation.

    Args:
        md: Value passed as ``max_depth`` to ``DecisionTreeClassifier``.

    Returns:
        dict with keys 'Max Depth', 'Training Error' and 'Validation Error',
        where each error is ``1 - score`` on the corresponding split.
    """
    tree = DecisionTreeClassifier(max_depth=md, random_state=42)
    tree.fit(x_train, train_targets)
    train_err = 1 - tree.score(x_train, train_targets)
    val_err = 1 - tree.score(x_val, val_targets)
    return {'Max Depth': md, 'Training Error': train_err, 'Validation Error': val_err}

# One row per depth from 1 to 20.
errors_df = pd.DataFrame([max_depth_error(md) for md in range(1, 21)])
errors_df

In [None]:
# Plot training and validation error against tree depth on the same axes.
for error_col in ('Training Error', 'Validation Error'):
    plt.plot(errors_df['Max Depth'], errors_df[error_col])
plt.title('Training vs. Validation Error')
plt.xticks(range(0, 21, 2))
plt.xlabel('Max. Depth')
plt.ylabel('Prediction Error (1 - Accuracy)')
plt.legend(['Training', 'Validation']);

In [None]:
# According to the plot, the best max depth is 7
model = DecisionTreeClassifier(max_depth=7, random_state=42)
model.fit(x_train, train_targets)
model.score(x_val, val_targets)

In [None]:
# Pruning: cap the number of leaves with max_leaf_nodes, then score on the training data.
model = DecisionTreeClassifier(max_leaf_nodes=128, random_state=42).fit(x_train, train_targets)
model.score(x_train, train_targets)

In [None]:
# Score of the current model on the validation split.
model.score(x_val, val_targets)

In [None]:
# Depth reached by the current tree.
model.tree_.max_depth

In [None]:
# Text rendering of the current tree; only the first 1000 characters are printed.
model_text = export_text(model, feature_names=list(x_train.columns))
print(model_text[:1000])

## Random Forest

In [None]:
# Random forest with default hyperparameters; n_jobs=-1 uses all available cores.
model = RandomForestClassifier(n_jobs=-1, random_state=42).fit(x_train, train_targets)
model.score(x_train, train_targets)

In [None]:
# Score of the forest on the validation split.
model.score(x_val, val_targets)

In [None]:
# Predicted class probabilities for every training row.
train_probs = model.predict_proba(x_train)
train_probs

In [None]:
# First individual estimator of the fitted forest.
model.estimators_[0]

In [None]:
# Top levels (max_depth=2) of the first tree in the forest.
plt.figure(figsize=(80,20))
# feature_names / class_names are converted to plain lists: some scikit-learn
# releases reject a pandas Index or a NumPy array for these parameters.
plot_tree(model.estimators_[0], max_depth=2, feature_names=list(x_train.columns), filled=True, rounded=True, class_names=list(model.classes_));

In [None]:
# Top levels (max_depth=2) of the tree at index 20 in the forest, for comparison.
plt.figure(figsize=(80,20))
# feature_names / class_names are converted to plain lists: some scikit-learn
# releases reject a pandas Index or a NumPy array for these parameters.
plot_tree(model.estimators_[20], max_depth=2, feature_names=list(x_train.columns), filled=True, rounded=True, class_names=list(model.classes_));

In [None]:
# Number of individual estimators in the fitted forest.
len(model.estimators_)

In [None]:
# Rank features by the importance reported by the current model (highest first).
importance_df = pd.DataFrame({
    'feature': x_train.columns,
    'importance': model.feature_importances_,
})
importance_df = importance_df.sort_values('importance', ascending=False)
importance_df.head(10)

In [None]:
# Bar chart of the ten highest-ranked features.
top_features = importance_df.head(10)
plt.title('Feature Importance')
sns.barplot(x='importance', y='feature', data=top_features);

In [None]:
# Hyperparameter Tuning with Random Forests
# Baseline forest with default hyperparameters, kept as the reference point.
base_model = RandomForestClassifier(n_jobs=-1, random_state=42)
base_model.fit(x_train, train_targets)

base_train_acc, base_val_acc = (
    base_model.score(x_train, train_targets),
    base_model.score(x_val, val_targets),
)

base_train_acc, base_val_acc

In [None]:
# hyperparameter n_estimators = 10
# Returns the (training, validation) scores for a forest of 10 trees.
model = RandomForestClassifier(n_jobs=-1, n_estimators=10, random_state=42).fit(x_train, train_targets)
model.score(x_train, train_targets), model.score(x_val, val_targets)

In [None]:
# hyperparameter n_estimators = 200
# Returns the (training, validation) scores for a forest of 200 trees.
model = RandomForestClassifier(n_jobs=-1, n_estimators=200, random_state=42).fit(x_train, train_targets)
model.score(x_train, train_targets), model.score(x_val, val_targets)

In [None]:
# make it easy to test hyperparameters
def test_params(**params):
    """Fit a random forest with the given hyperparameters and score it.

    Args:
        **params: Extra keyword arguments forwarded to ``RandomForestClassifier``
            (``random_state=42`` and ``n_jobs=-1`` are always set).

    Returns:
        Tuple ``(train_score, val_score)`` computed on the notebook-level
        ``x_train``/``train_targets`` and ``x_val``/``val_targets``.
    """
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **params)
    forest.fit(x_train, train_targets)
    train_score = forest.score(x_train, train_targets)
    val_score = forest.score(x_val, val_targets)
    return train_score, val_score

test_params(max_depth=5)

In [None]:
# Each call below returns the (training, validation) scores for one hyperparameter setting.
# Deeper trees.
test_params(max_depth=26)

In [None]:
# Cap the number of leaves per tree at 2**5 = 32.
test_params(max_leaf_nodes=2**5)

In [None]:
# min samples split & min samples leaf
test_params(min_samples_split=3, min_samples_leaf=2)

In [None]:
# Same two parameters with much larger thresholds.
test_params(min_samples_split=50, min_samples_leaf=30)

In [None]:
# min impurity decrease
test_params(min_impurity_decrease=1e-7)

In [None]:
# bootstrap, max samples
test_params(bootstrap=False)

In [None]:
# max_samples given as a fraction (0.9).
test_params(max_samples=0.9)

In [None]:
# for imbalanced data
test_params(class_weight="balanced")

In [None]:
# Putting it together
# Combine the tuned hyperparameters in one forest and report (training, validation) scores.
model = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=300,
    max_features=7,
    max_depth=30,
    class_weight={'No': 1, 'Yes': 1.5},  # imbalanced classes: weight 'Yes' higher
    random_state=42,
).fit(x_train, train_targets)

model.score(x_train, train_targets), model.score(x_val, val_targets)

In [None]:
# Score of the current model on the held-out test split.
model.score(x_test, test_targets)

In [None]:
# Making Predictions on New Inputs
def predict_input(model, single_input):
    """Predict the class, and its probability, for one raw observation.

    The observation is passed through the notebook-level ``imputer``,
    ``scaler`` and ``encoder`` before being handed to the model.

    Args:
        model: Fitted classifier providing ``predict``, ``predict_proba``
            and ``classes_``.
        single_input: Mapping of column name to value for a single row; it is
            indexed by every name in ``num_cols`` and ``cat_cols``.

    Returns:
        Tuple ``(pred, prob)``: the predicted class label and the probability
        the model assigns to that label.
    """
    row_df = pd.DataFrame([single_input])
    # Numeric columns: impute first, then scale.
    row_df[num_cols] = imputer.transform(row_df[num_cols])
    row_df[num_cols] = scaler.transform(row_df[num_cols])
    # Categorical columns: one-hot encode into the encoded column names.
    row_df[encoded_cols] = encoder.transform(row_df[cat_cols])
    features = row_df[num_cols + encoded_cols]
    pred = model.predict(features)[0]
    # Look up the probability column that belongs to the predicted label.
    class_position = list(model.classes_).index(pred)
    prob = model.predict_proba(features)[0][class_position]
    return pred, prob

# Example observation; Sunshine is left missing on purpose (np.nan).
new_input = {
    'Date': '2021-06-19',
    'Location': 'Launceston',
    'MinTemp': 23.2,
    'MaxTemp': 33.2,
    'Rainfall': 10.2,
    'Evaporation': 4.2,
    'Sunshine': np.nan,
    'WindGustDir': 'NNW',
    'WindGustSpeed': 52.0,
    'WindDir9am': 'NW',
    'WindDir3pm': 'NNE',
    'WindSpeed9am': 13.0,
    'WindSpeed3pm': 20.0,
    'Humidity9am': 89.0,
    'Humidity3pm': 58.0,
    'Pressure9am': 1004.8,
    'Pressure3pm': 1001.5,
    'Cloud9am': 8.0,
    'Cloud3pm': 5.0,
    'Temp9am': 25.7,
    'Temp3pm': 33.0,
    'RainToday': 'Yes',
}

predict_input(model, new_input)