In [None]:
%load_ext autoreload
%autoreload 2 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from collections import OrderedDict
plt.rcParams["font.family"] = "Times New Roman"
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer 
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

In [None]:
# Load the raw CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='poverty_prediction_data.csv')

In [None]:
# Training labels: the non-null 'Target' values, cast to uint8 and collected into an array
train_labels = np.array(
    data.loc[data['Target'].notnull(), 'Target'].astype(np.uint8).tolist()
)
# Training features: the same rows, minus the two id columns and the label itself
train_set = data.loc[data['Target'].notnull()].drop(
    columns=['Id', 'idhogar', 'Target']
)

In [None]:
# Print a concise summary of the training frame before any preprocessing
train_set.info()

In [None]:
# Remember the column names: fit_transform below replaces the frame with a plain array
features = train_set.columns.tolist()

# Preprocessing: fill missing values with the column median, then rescale with MinMaxScaler
pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler()),
    ]
)

# NOTE(review): the imputer and scaler are fit on every row here, before any
# cross-validation split, so later folds share preprocessing statistics.
train_set = pipeline.fit_transform(train_set)


In [None]:
# Scorers passed to cross_validate; each key becomes a 'train_<key>' / 'test_<key>' result entry
scoring = {}
scoring['accuracy'] = make_scorer(accuracy_score)
scoring['f1_score'] = make_scorer(
    f1_score, greater_is_better=True, average='weighted'
)
# zero_division=1 is set for precision only
scoring['precision_score'] = make_scorer(
    precision_score, greater_is_better=True, average='weighted', zero_division=1
)
scoring['recall_score'] = make_scorer(
    recall_score, greater_is_better=True, average='weighted'
)

In [None]:
# Run one classifier with 10-fold cross-validation and print its mean scores
def run_classifier(X, y, clf_name, random_state=42):
    """Cross-validate a named classifier and print mean train/test metrics.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``cross_validate``.
    y : array-like
        Class labels aligned with ``X``.
    clf_name : str
        One of ``'log_reg'``, ``'svm'``, ``'lda'``, ``'decision_tree'`` or
        ``'random_forest'``.
    random_state : int, optional
        Seed given to the tree-based classifiers so repeated runs print the
        same scores. Defaults to 42.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported names.

    Notes
    -----
    The function only prints; it returns ``None``. The metrics are taken from
    the notebook-level ``scoring`` dictionary.
    """
    print(f'\n --- {clf_name} classifier')
    if clf_name == 'log_reg':
        clf = LogisticRegression()
    elif clf_name == 'svm':
        clf = SVC()
    elif clf_name == 'lda':
        clf = LinearDiscriminantAnalysis()
    elif clf_name == 'decision_tree':
        clf = DecisionTreeClassifier(random_state=random_state)
    elif clf_name == 'random_forest':
        clf = RandomForestClassifier(random_state=random_state)
    else:
        # Without this branch `clf` would be unbound and the failure would
        # surface later as an unrelated-looking UnboundLocalError.
        raise ValueError(
            f"Unknown classifier name {clf_name!r}; expected one of "
            "'log_reg', 'svm', 'lda', 'decision_tree', 'random_forest'"
        )

    cv_results = cross_validate(clf, X, y, cv=10, return_train_score=True, scoring=scoring)

    # cross_validate prefixes every scorer key with 'train_' / 'test_'
    for header, split in (('Training:', 'train'), ('Testing :', 'test')):
        mean_of = {
            metric: np.mean(cv_results[f'{split}_{metric}'])
            for metric in ('accuracy', 'precision_score', 'recall_score', 'f1_score')
        }
        print(f'{header}\n\taccuracy {mean_of["accuracy"]:.2f}')
        print(f'\t precision score {mean_of["precision_score"]:.2f}')
        print(f'\t recall score {mean_of["recall_score"]:.2f}')
        print(f'\t f1 score {mean_of["f1_score"]:.2f}')

In [None]:
## 10-fold cross validation scores for imbalanced training
for clf_name in ('lda', 'log_reg', 'svm', 'decision_tree', 'random_forest'):
    run_classifier(train_set, train_labels, clf_name)

In [None]:
# # random forest confusion matrix
# cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm,
#                               display_labels=clf.classes_)
# disp.plot()
# plt.show()

### upsampling the data

In [None]:
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE

In [None]:
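# Wrapping SMOTE in an imblearn pipeline means oversampling is applied only while fitting on each training fold,
# so every validation fold is scored on the original, non-upsampled data.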

In [None]:
# Run one classifier behind SMOTE with 10-fold cross-validation and print its mean scores
def run_classifier_with_upsampling(X, y, clf_name, random_state=42):
    """Cross-validate a SMOTE + classifier pipeline and print mean metrics.

    The classifier is placed after ``SMOTE`` in a pipeline built with
    ``make_pipeline``, and that pipeline is what ``cross_validate`` fits.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``cross_validate``.
    y : array-like
        Class labels aligned with ``X``.
    clf_name : str
        One of ``'log_reg'``, ``'svm'``, ``'lda'``, ``'decision_tree'`` or
        ``'random_forest'``.
    random_state : int, optional
        Seed given to ``SMOTE`` and to the tree-based classifiers so repeated
        runs print the same scores. Defaults to 42.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported names.

    Notes
    -----
    The function only prints; it returns ``None``. The metrics are taken from
    the notebook-level ``scoring`` dictionary.
    """
    print(f'\n --- {clf_name} classifier')
    if clf_name == 'log_reg':
        clf = LogisticRegression()
    elif clf_name == 'svm':
        clf = SVC()
    elif clf_name == 'lda':
        clf = LinearDiscriminantAnalysis()
    elif clf_name == 'decision_tree':
        clf = DecisionTreeClassifier(random_state=random_state)
    elif clf_name == 'random_forest':
        clf = RandomForestClassifier(random_state=random_state)
    else:
        # Without this branch `clf` would be unbound and the failure would
        # surface later as an unrelated-looking UnboundLocalError.
        raise ValueError(
            f"Unknown classifier name {clf_name!r}; expected one of "
            "'log_reg', 'svm', 'lda', 'decision_tree', 'random_forest'"
        )

    imba_pipeline = make_pipeline(SMOTE(random_state=random_state), clf)
    cv_results = cross_validate(imba_pipeline, X, y, cv=10, return_train_score=True, scoring=scoring)

    # cross_validate prefixes every scorer key with 'train_' / 'test_'
    for header, split in (('Training:', 'train'), ('Testing :', 'test')):
        mean_of = {
            metric: np.mean(cv_results[f'{split}_{metric}'])
            for metric in ('accuracy', 'precision_score', 'recall_score', 'f1_score')
        }
        print(f'{header}\n\taccuracy {mean_of["accuracy"]:.2f}')
        print(f'\t precision score {mean_of["precision_score"]:.2f}')
        print(f'\t recall score {mean_of["recall_score"]:.2f}')
        print(f'\t f1 score {mean_of["f1_score"]:.2f}')

In [None]:
## 10-fold cross validation scores
for clf_name in ('lda', 'log_reg', 'svm', 'decision_tree', 'random_forest'):
    run_classifier_with_upsampling(train_set, train_labels, clf_name)

### confusion matrix display

In [None]:
# Stratified 80/20 hold-out split used for the confusion-matrix plots
X_train, X_test, y_train, y_test = train_test_split(
    train_set,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=42,
)

In [None]:
# Confusion matrix on the hold-out split, without any imbalance correction.
# The forest is seeded so the figure is reproducible, like the seeded split it is evaluated on.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=clf.classes_)
disp.plot()
# Title the figure so it can be told apart from the SMOTE variant
disp.ax_.set_title('Random forest without imbalance correction')
plt.show()

In [None]:
# Random forest behind SMOTE; both steps are seeded so the fitted pipeline is reproducible
clf = RandomForestClassifier(random_state=42)
imba_pipeline = make_pipeline(SMOTE(random_state=42), clf)
imba_pipeline.fit(X_train, y_train)


In [None]:
# Confusion matrix on the hold-out split for the SMOTE + random forest pipeline
predictions = imba_pipeline.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=imba_pipeline.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=imba_pipeline.classes_)
disp.plot()
# Title the figure so it can be told apart from the uncorrected baseline
disp.ax_.set_title('Random forest with SMOTE upsampling')
plt.show()

### PCA

In [None]:
# Project the preprocessed features onto 50 principal components.
# random_state keeps the projection reproducible if scikit-learn selects a
# randomized SVD solver for this input size.
pca = PCA(n_components=50, random_state=42)
X_train_pca = pca.fit_transform(train_set)


In [None]:
## 10-fold cross validation scores
for clf_name in ('lda', 'log_reg', 'svm', 'decision_tree', 'random_forest'):
    run_classifier_with_upsampling(X_train_pca, train_labels, clf_name)