# Decision tree

for analysis of weather data

In [1]:
import opendatasets as od 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib 
import os

# Notebook-wide display defaults for pandas tables and matplotlib/seaborn figures.
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', None)
sns.set_style("darkgrid")
matplotlib.rcParams.update({
  'font.size': 14,
  'figure.figsize': (10, 6),
  'figure.facecolor': '#00000000',  # RGBA hex with zero alpha
})

Downloading datasets

In [None]:
# Download the Kaggle "weather-dataset-rattle-package" dataset with opendatasets.
# NOTE(review): presumably needs Kaggle API credentials on first run and writes into
# ./weather-dataset-rattle-package (the folder the next cell reads from) — confirm.
od.download('http://www.kaggle.com/jsphyg/weather-dataset-rattle-package')

In [2]:
# Load the raw weather observations from the downloaded CSV into a DataFrame.
raw_df = pd.read_csv('weather-dataset-rattle-package/weatherAUS.csv')

Drop rows where the target column `RainTomorrow` is missing

In [3]:
# Keep only the rows that have a value in the target column.
raw_df = raw_df.dropna(subset=['RainTomorrow'])

### Preparing Data

In [4]:
# Chronological split keyed on the calendar year of each observation.
year = pd.to_datetime(raw_df['Date']).dt.year

is_train = year < 2015
is_val = year == 2015
is_test = year > 2015

train_df = raw_df.loc[is_train]  # everything before 2015
val_df = raw_df.loc[is_val]      # 2015 only
test_df = raw_df.loc[is_test]    # everything after 2015

1. Separate the input columns from the target column

In [5]:
# Features: every column except the first and the last one.
input_cols = train_df.columns[1:-1].tolist()
# Name of the column the model will predict.
target_cols = 'RainTomorrow'

In [8]:
# Independent copies, so later preprocessing does not modify train_df.
train_inputs = train_df.loc[:, input_cols].copy()
train_targets = train_df.loc[:, target_cols].copy()

In [9]:
# Independent copies, so later preprocessing does not modify val_df.
val_inputs = val_df.loc[:, input_cols].copy()
val_targets = val_df.loc[:, target_cols].copy()

In [10]:
# Independent copies, so later preprocessing does not modify test_df.
test_inputs = test_df.loc[:, input_cols].copy()
test_targets = test_df.loc[:, target_cols].copy()

In [11]:
# Group the feature columns by dtype: numeric columns and object (string) columns.
numeric_cols = list(train_inputs.select_dtypes(include='number').columns)
catagorical_cols = list(train_inputs.select_dtypes(include='object').columns)

2. Impute missing numeric values

In [25]:
from sklearn.impute import SimpleImputer

# Learn the per-column means from the training rows only: fitting on all of raw_df
# would let validation/test values leak into the preprocessing.
# train_df (not train_inputs) is used so that re-running this cell after the inputs
# have been transformed still yields the same means.
imputer = SimpleImputer(strategy='mean').fit(train_df[numeric_cols])

In [13]:
# Fill the missing numeric values of every split with the fitted imputer.
for split_inputs in (train_inputs, val_inputs, test_inputs):
  split_inputs[numeric_cols] = imputer.transform(split_inputs[numeric_cols])

3. Scaling numeric features

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from the training rows only, so the scaling does not
# depend on validation/test values. As a result, validation/test values may fall
# slightly outside [0, 1].
# MinMaxScaler disregards NaNs while fitting, so the un-imputed train_df can be used;
# this also keeps the cell idempotent on re-runs.
scaler = MinMaxScaler().fit(train_df[numeric_cols])

In [15]:
# Rescale the numeric columns of every split with the fitted scaler.
for split_inputs in (train_inputs, val_inputs, test_inputs):
  split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])

4. Encoding categorical variables

In [16]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Replace missing categorical values with an explicit 'Unknown' category.
# fillna(..., inplace=True) returns None, so assigning its result back would overwrite
# every categorical column with None; assign the returned filled copy instead.
train_inputs[catagorical_cols] = train_inputs[catagorical_cols].fillna('Unknown')
val_inputs[catagorical_cols] = val_inputs[catagorical_cols].fillna('Unknown')
test_inputs[catagorical_cols] = test_inputs[catagorical_cols].fillna('Unknown')

In [None]:
# Fit the one-hot encoder on the training rows only, with missing values mapped to
# 'Unknown' — the same placeholder used for the inputs it will transform.
# Fitting on raw_df would learn NaN categories that never match the filled inputs and
# would leak validation/test categories. Categories unseen during fitting are encoded
# as all zeros because handle_unknown='ignore'.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(
  train_df[catagorical_cols].fillna('Unknown')
)

In [35]:
# Names of the one-hot columns generated by the fitted encoder.
encoded_cols = encoder.get_feature_names_out(catagorical_cols).tolist()

In [None]:
# Add the one-hot encoded columns to every split.
for split_inputs in (train_inputs, val_inputs, test_inputs):
  split_inputs[encoded_cols] = encoder.transform(split_inputs[catagorical_cols])

In [39]:
# Final model inputs: the numeric columns followed by the one-hot encoded columns.
feature_cols = numeric_cols + encoded_cols
X_train = train_inputs[feature_cols]
X_val = val_inputs[feature_cols]
# Build the test matrix from test_inputs (it was previously a copy of the validation set).
X_test = test_inputs[feature_cols]

## Decision tree

1. Training

In [41]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
# Fix random_state so the fitted tree is reproducible between runs
# (it does not tune performance).
model = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the classifier on the training features and targets.
model.fit(X_train, train_targets)

2. Evaluation

In [45]:
# Predicted labels for the same rows the model was trained on.
train_preds = model.predict(X_train)

In [None]:
# Display the predicted training labels.
train_preds

In [None]:
# Number of training rows predicted for each class.
# The top-level pd.value_counts is deprecated; wrap the prediction array in a Series
# and use the Series method instead.
pd.Series(train_preds).value_counts()

In [48]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Training accuracy: fraction of training rows whose prediction matches the true label.
# accuracy_score expects (y_true, y_pred); the value is the same either way, but the
# documented order keeps this call consistent with metrics where the order matters.
accuracy_score(train_targets, train_preds)

In [None]:
# Predicted class probabilities for the training rows; the last line displays them.
train_probs = model.predict_proba(X_train)
train_probs

In [None]:
# Accuracy of the fitted model on the validation split.
model.score(X_val, val_targets)

Here the model scores much lower on the validation data than on the training data. This is known as overfitting, and we'll try to reduce it.

3. Visualize the data

In [53]:
from sklearn.tree import plot_tree, export_text

In [None]:
# Draw the top levels of the fitted tree on a wide canvas.
plt.figure(figsize=(80,20))
# Pass feature names as a plain list, as the export_text call does: some scikit-learn
# releases reject a pandas Index here. The trailing ';' hides the list of annotation
# objects that plot_tree returns.
plot_tree(model, feature_names=list(X_train.columns), max_depth=2, filled=True);

In [None]:
# Depth of the fitted tree.
model.tree_.max_depth

In [None]:
# Text rendering of the fitted tree's decision rules, truncated at depth 10.
tree_text = export_text(model, max_depth=10, feature_names=list(X_train.columns))
print(tree_text)

### Importance Values

In [None]:
# Importance score of each input feature in the fitted tree.
model.feature_importances_

In [59]:
# Pair each feature name with its importance score, most important first.
feature_scores = {
  'feature': X_train.columns,
  'importance': model.feature_importances_,
}
importance_df = pd.DataFrame(feature_scores).sort_values(by='importance', ascending=False)

In [None]:
# Ten features with the highest importance.
importance_df.head(10)

In [None]:
# Horizontal bar chart of the ten most important features.
top_features = importance_df.head(10)
plt.title("Feature Importance")
sns.barplot(data=top_features, x='importance', y='feature')

### Better validation scores
We want the model to learn general trends rather than memorize every training example.

In [68]:
# Replace the model with a tree limited to depth 3; random_state keeps it reproducible.
model = DecisionTreeClassifier(max_depth=3, random_state=42)

In [None]:
# Fit the depth-limited tree on the training data.
model.fit(X_train, train_targets)

In [None]:
# Accuracy of the depth-limited tree on the training split.
model.score(X_train, train_targets)

In [None]:
# Accuracy of the depth-limited tree on the validation split.
model.score(X_val, val_targets)

Although the model scores lower on the training data, it does much better on new data: the model is no longer overfitting.

In [None]:
# Draw the top levels of the depth-limited tree on a wide canvas.
plt.figure(figsize=(80,20))
# Pass feature names as a plain list, as the export_text call does: some scikit-learn
# releases reject a pandas Index here. The trailing ';' hides the list of annotation
# objects that plot_tree returns.
plot_tree(model, feature_names=list(X_train.columns), max_depth=2, filled=True);

In [73]:
def max_depth_error(md):
  """Train a depth-limited decision tree and report its error on both splits.

  Reads the notebook globals X_train, train_targets, X_val and val_targets.

  Args:
    md: value passed as ``max_depth`` to ``DecisionTreeClassifier``.

  Returns:
    dict with the keys 'Max Depth', 'Training Error' and 'Validation Error';
    each error is ``1 - score`` on the corresponding split.
  """
  tree = DecisionTreeClassifier(max_depth=md, random_state=42)
  tree.fit(X_train, train_targets)
  train_accuracy = tree.score(X_train, train_targets)
  val_accuracy = tree.score(X_val, val_targets)
  return {
    'Max Depth': md,
    'Training Error': 1-train_accuracy,
    'Validation Error': 1-val_accuracy,
  }

In [74]:
# One row of training/validation error for each max_depth from 1 to 20.
depth_errors = list(map(max_depth_error, range(1, 21)))
error_df = pd.DataFrame(depth_errors)

In [None]:
# Training and validation error as a function of tree depth.
plt.figure()
depths = error_df['Max Depth']
for curve in ('Training Error', 'Validation Error'):
  plt.plot(depths, error_df[curve], label=curve)
plt.title('Training Error vs Validation Error')
plt.xlabel('Max Depth')
plt.ylabel('Prediction Error')
plt.legend()

<- underfitting | overfitting ->

## Random Forest Regression
It is a much more effective strategy that combines the results of several decision trees trained with slightly different parameters.