# CSI4142 Project Phase 4: Data Mining
## Group 26
- Colin McFarlane 300074534 
- Will Lennox 300071951

Classification of Development Index using 20 attributes from world bank dataset 
- 1 = Developed
- 2 = Developing
- 3 = Underdeveloped


In [226]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

from collections import Counter

global_seed = 26

np.random.seed(global_seed)

In [227]:
data = pd.read_csv('raw_data/PHASE_4_DATA.csv')
print("Loaded data successfully")
print("Dimensionality: \n", data.shape)


Loaded data successfully
Dimensionality: 
 (144, 22)


In [228]:
training_data = data[data['split'] == 'train']

data_features = training_data.drop(['Development Index'], axis=1)
data_features = data_features.drop("split", axis=1)

data_labels = training_data['Development Index'].copy()

feature_pipeline = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', MinMaxScaler()),
])

full_pipeline = ColumnTransformer([
    ("data_features", feature_pipeline, data_features.columns),
])

data_prepared = full_pipeline.fit_transform(data_features)

print("Before oversample")
print(data_prepared.shape)
print(Counter(data_labels))


oversampler = SMOTE(random_state=global_seed)
oversampled_features, oversampled_labels = oversampler.fit_resample(data_prepared, data_labels)

print("After oversample")

print(oversampled_features.shape)
print(Counter(oversampled_labels))


Before oversample
(114, 20)
Counter({2: 52, 3: 38, 1: 24})
After oversample
(156, 20)
Counter({3: 52, 2: 52, 1: 52})


In [229]:
model_decision_tree = DecisionTreeClassifier()

model_gradient_boosting = GradientBoostingClassifier()

model_random_forest = RandomForestClassifier()

In [230]:
decision_tree_parameters = {'min_samples_split': (2, 3, 4), "max_depth": (None, 1 , 2), 'criterion':('gini', 'entropy')}

gradient_boosting_parameters = {'n_estimators': (90, 100, 110), 'max_depth': (None, 1 , 2), 'criterion':('friedman_mse', 'squared_error')}

random_forest_parameters = {'n_estimators': (90, 100, 110), 'max_depth': (None, 1 , 2), 'criterion':('gini', 'entropy')}


grid_search_decision_tree = GridSearchCV(estimator=model_decision_tree, param_grid=decision_tree_parameters)
grid_search_gradient_boosting = GridSearchCV(estimator=model_gradient_boosting, param_grid=gradient_boosting_parameters)
grid_search_random_forest = GridSearchCV(estimator=model_random_forest, param_grid=random_forest_parameters)

start_time = time.time()
grid_search_decision_tree.fit(oversampled_features, oversampled_labels)
end_time = time.time()
print("Decision Tree Fit Time: " + str(end_time-start_time))


start_time = time.time()
grid_search_gradient_boosting.fit(oversampled_features, oversampled_labels)
end_time = time.time()
print("Gradient Boosting Fit Time: " + str(end_time-start_time))

start_time = time.time()
grid_search_random_forest.fit(oversampled_features, oversampled_labels)
end_time = time.time()
print("Random Forest Fit Time: " + str(end_time-start_time))


print("\nBest Params: ")
print(grid_search_decision_tree.best_params_)
print(grid_search_gradient_boosting.best_params_)
print(grid_search_random_forest.best_params_)


print("\nBest Estimator: ")
print(grid_search_decision_tree.best_estimator_)
print(grid_search_gradient_boosting.best_estimator_)
print(grid_search_random_forest.best_estimator_)

print("\nBest Score: ")
print(grid_search_decision_tree.best_score_)
print(grid_search_gradient_boosting.best_score_)
print(grid_search_random_forest.best_score_)


Decision Tree Fit Time: 0.10899853706359863
Gradient Boosting Fit Time: 11.097131729125977
Random Forest Fit Time: 7.0807671546936035

Best Params: 
{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}
{'criterion': 'friedman_mse', 'max_depth': 1, 'n_estimators': 100}
{'criterion': 'gini', 'max_depth': None, 'n_estimators': 110}

Best Estimator: 
DecisionTreeClassifier()
GradientBoostingClassifier(max_depth=1)
RandomForestClassifier(n_estimators=110)

Best Score: 
0.9806451612903226
1.0
0.9743951612903226


In [231]:
testing_data = data[data['split'] == 'test']

test_data_features = testing_data.drop(['Development Index'], axis=1)
test_data_features = test_data_features.drop("split", axis=1)

test_data_labels = testing_data['Development Index'].copy()

test_data_prepared = full_pipeline.fit_transform(test_data_features)

dt_pred = grid_search_decision_tree.predict(test_data_prepared)
gb_pred = grid_search_gradient_boosting.predict(test_data_prepared)
rf_pred = grid_search_random_forest.predict(test_data_prepared)

print("\nDECISION TREE MODEL: ")

print(grid_search_decision_tree.best_params_)

c_matrix = confusion_matrix(test_data_labels, dt_pred)
print(c_matrix)

report = classification_report(test_data_labels, dt_pred)
print(report)

print("\nGRADIENT BOOSTING MODEL: ")

print(grid_search_gradient_boosting.best_params_)

c_matrix = confusion_matrix(test_data_labels, gb_pred)
print(c_matrix)

report = classification_report(test_data_labels, gb_pred)
print(report)

print("\nRANDOM FOREST MODEL: ")

print(grid_search_random_forest.best_params_)

c_matrix = confusion_matrix(test_data_labels, rf_pred)
print(c_matrix)

report = classification_report(test_data_labels, rf_pred)
print(report)



DECISION TREE MODEL: 
{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}
[[ 8  0  0]
 [ 0 12  0]
 [ 0  2  8]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         8
           2       0.86      1.00      0.92        12
           3       1.00      0.80      0.89        10

    accuracy                           0.93        30
   macro avg       0.95      0.93      0.94        30
weighted avg       0.94      0.93      0.93        30


GRADIENT BOOSTING MODEL: 
{'criterion': 'friedman_mse', 'max_depth': 1, 'n_estimators': 100}
[[ 8  0  0]
 [ 0 12  0]
 [ 0  1  9]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         8
           2       0.92      1.00      0.96        12
           3       1.00      0.90      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97    