In [88]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import umap
import datetime
import optuna
import pprint

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, label_binarize, PolynomialFeatures, RobustScaler
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.base import clone
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score, accuracy_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.tree import plot_tree
from sklearn.multiclass import OneVsRestClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer

import statsmodels.api as sm

from scipy import stats

import xgboost

import gc

# Run a full garbage-collection pass. Because this is the last expression in
# the cell, the value returned by gc.collect() (the number of unreachable
# objects it found) is what the cell displays.
gc.collect()

8975

In [89]:
# The train and test splits use their first CSV column as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)
# The original dataset is semicolon-delimited and keeps the default index.
original_df = pd.read_csv("data/original.csv", sep=";")

# Feature names are taken from the test split's columns.
train_features = test_df.columns

# Columns that are converted to a pandas categorical dtype further down.
cat_features = [
    "Marital status",
    "Application mode",
    "Course",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

# Every remaining feature is treated as continuous. The list is built in the
# column order of `train_features` rather than via a set difference: iterating
# a set of strings yields a hash-dependent order that can change between
# kernel restarts, which would make the transformed column layout
# non-reproducible.
cont_features = [col for col in train_features if col not in cat_features]

In [90]:
# Give every categorical column one dtype shared by all three frames, whose
# categories are the union of the values observed in any of them.
all_frames = (train_df, test_df, original_df)
for feat in cat_features:
    observed_levels = set(train_df[feat]) | set(test_df[feat]) | set(original_df[feat])
    shared_dtype = pd.CategoricalDtype(categories=list(observed_levels), ordered=False)
    for frame in all_frames:
        frame[feat] = frame[feat].astype(shared_dtype)

In [91]:
# Engineered feature: "Misery Index" = unemployment rate + inflation rate,
# added in place to all three frames.
for df in [train_df, test_df, original_df]:
    df["Misery Index"] = df["Unemployment rate"] + df["Inflation rate"]
    # Disabled experiment, kept for reference:
    # df["Economic Discomfort"] = df["Misery Index"] - df["GDP"]

# Register the new column as continuous exactly once. Without the guard,
# re-running this cell appends a duplicate entry, and the column would then be
# selected twice wherever `cont_features` is used to pick columns.
if "Misery Index" not in cont_features:
    cont_features.append("Misery Index")

In [92]:
# Per-feature importance scores, one row per column name (the index) with a
# single "val" column. Entries are listed in descending order of score.
# NOTE(review): the scores are hard-coded; presumably exported from a
# previously fitted model -- confirm provenance.
# Consumed by prune_low_importance() to pick the columns below a threshold.
feature_importances_order = pd.DataFrame.from_dict(
    {
        "Curricular units 2nd sem (approved)": 0.5262597,
        "Tuition fees up to date": 0.12040115,
        "Scholarship holder": 0.04374109,
        "Curricular units 1st sem (approved)": 0.043124773,
        "Curricular units 2nd sem (enrolled)": 0.034864154,
        "Curricular units 2nd sem (evaluations)": 0.033007413,
        "Curricular units 1st sem (evaluations)": 0.026952056,
        "Debtor": 0.02125777,
        "Curricular units 2nd sem (grade)": 0.017488716,
        "Gender": 0.015914941,
        "Age at enrollment": 0.009499201,
        "Daytime/evening attendance": 0.009322386,
        "Curricular units 1st sem (enrolled)": 0.008687382,
        "Curricular units 2nd sem (credited)": 0.008607062,
        "Course": 0.008552427,
        "Mother's occupation": 0.0074276286,
        "Application mode": 0.007248743,
        "GDP": 0.0058370973,
        "Curricular units 1st sem (grade)": 0.00579759,
        "Unemployment rate": 0.0038729862,
        "Mother's qualification": 0.0035413294,
        "Displaced": 0.0034240438,
        "Previous qualification": 0.0033880775,
        "Father's occupation": 0.0033660706,
        "Curricular units 1st sem (credited)": 0.0032424054,
        "Admission grade": 0.0031858524,
        "Curricular units 1st sem (without evaluations)": 0.0031376902,
        "Marital status": 0.0027686018,
        "Father's qualification": 0.002645827,
        "Nacionality": 0.0026041167,
        "Application order": 0.0025012011,
        "Previous qualification (grade)": 0.0024294835,
        "Inflation rate": 0.0023359724,
        "International": 0.0020073373,
        "Curricular units 2nd sem (without evaluations)": 0.0015576994,
    },
    orient="index",
    columns=["val"],
)

# Columns removed by prune_low_freq().
low_freq_cols = [
    "Nacionality",
    "Educational special needs",
    "International",
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (without evaluations)",
]


def prune_low_freq(inp_df):
    """Return a copy of ``inp_df`` without the columns in ``low_freq_cols``.

    Listed columns that are absent from ``inp_df`` are skipped silently, and
    ``inp_df`` itself is not modified.
    """
    return inp_df.drop(low_freq_cols, axis="columns", errors="ignore")

def prune_low_importance(inp_df, thresh):
    """Return a copy of ``inp_df`` without low-importance columns.

    A column is removed when its "val" score in ``feature_importances_order``
    is strictly below ``thresh``. Scored columns that are absent from
    ``inp_df`` are skipped silently; ``inp_df`` itself is not modified.
    """
    low_rows = feature_importances_order[feature_importances_order["val"] < thresh]
    pruned_cols = low_rows.index.tolist()
    return inp_df.drop(columns=pruned_cols, errors="ignore")

In [93]:
# Load the three finished hyperparameter studies from the shared SQLite store.
xgb_study, rf_study, logreg_study = (
    optuna.load_study(study_name=study_name, storage="sqlite:///optuna.sqlite3")
    for study_name in ("aug_xgb_v1", "aug_rf_v1", "aug_logreg_v2")
)

# Fixed seed so that the stochastic estimators (bootstrap / feature sampling)
# give the same predictions on every Restart & Run All.
RANDOM_SEED = 42

# Default-hyperparameter baselines, fitted later on the train split only.
untuned_xgb = xgboost.XGBClassifier(
    enable_categorical=True, n_jobs=-1, random_state=RANDOM_SEED
)
untuned_rf = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_SEED)
untuned_logreg = LogisticRegression(n_jobs=-1, random_state=RANDOM_SEED)

# Identical baselines, fitted later on train + original data.
untuned_with_orig_xgb = xgboost.XGBClassifier(
    enable_categorical=True, n_jobs=-1, random_state=RANDOM_SEED
)
untuned_with_orig_rf = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_SEED)
untuned_with_orig_logreg = LogisticRegression(n_jobs=-1, random_state=RANDOM_SEED)

In [94]:

# Fixed seed so that the tuned estimators give the same predictions on every
# Restart & Run All.
RANDOM_SEED = 42

# `Study.best_params` is read once per study and reused, instead of being
# looked up again for every single hyperparameter. The studies also store
# pipeline-level flags (e.g. 'include_orig', 'prune_low_freq'); only the
# estimator hyperparameters named below are forwarded to the constructors.
xgb_best = xgb_study.best_params
pprint.pp(xgb_best)
xgb_param_names = (
    "n_estimators",
    "eta",
    "gamma",
    "max_depth",
    "max_leaves",
    "colsample_bytree",
    "colsample_bylevel",
    "colsample_bynode",
    "reg_lambda",
    "reg_alpha",
    "grow_policy",
    "min_child_weight",
    "max_delta_step",
)
tuned_xgb = xgboost.XGBClassifier(
    enable_categorical=True,
    n_jobs=-1,
    random_state=RANDOM_SEED,
    **{param: xgb_best[param] for param in xgb_param_names},
)
print()

rf_best = rf_study.best_params
pprint.pp(rf_best)
rf_param_names = (
    "n_estimators",
    "max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "min_weight_fraction_leaf",
    "max_features",
)
tuned_rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=RANDOM_SEED,
    **{param: rf_best[param] for param in rf_param_names},
)
print()

logreg_best = logreg_study.best_params
pprint.pp(logreg_best)
tuned_logreg = LogisticRegression(
    n_jobs=-1,
    random_state=RANDOM_SEED,
    penalty=logreg_best["penalty"],
    # The study stores the regularisation strength under the key "logreg_c".
    C=logreg_best["logreg_c"],
    solver="saga",
    max_iter=int(1e5),
)
print()

{'include_orig': False,
 'prune_low_freq': True,
 'prune_low_importance': False,
 'poly_feats': False,
 'use_standardscaler': False,
 'use_robustscaler': False,
 'classifier': 'xgboost',
 'n_estimators': 1883,
 'eta': 0.11299665616020274,
 'gamma': 0.9634286209401426,
 'max_depth': 3,
 'max_leaves': 1346,
 'colsample_bytree': 0.8385856672697529,
 'colsample_bylevel': 0.8074747607390849,
 'colsample_bynode': 0.7064701846488642,
 'reg_lambda': 3,
 'reg_alpha': 8,
 'grow_policy': 'lossguide',
 'min_child_weight': 70.27952002620962,
 'max_delta_step': 94.63438008721305}

{'include_orig': True,
 'prune_low_freq': False,
 'prune_low_importance': False,
 'classifier': 'random_forest',
 'n_estimators': 3373,
 'max_depth': 6,
 'min_samples_split': 0.022853786556543416,
 'min_samples_leaf': 0.00020242539544159874,
 'min_weight_fraction_leaf': 5.455836606089006e-05,
 'max_features': 0.8642355014257856}

{'include_orig': False,
 'prune_low_freq': False,
 'prune_low_importance': False,
 'poly_feats

In [95]:
# Encoder between the string "Target" labels and integer class ids, fitted on
# the train split.
label_encoder = LabelEncoder().fit(train_df["Target"])

# Train-only design matrix and encoded labels.
x = train_df.drop(columns=["Target"])
y = label_encoder.transform(train_df["Target"])

# Train + original rows stacked vertically, encoded with the same encoder.
train_df_with_orig = pd.concat([train_df, original_df], axis=0)
x_combined = train_df_with_orig.drop(columns=["Target"])
y_combined = label_encoder.transform(train_df_with_orig["Target"])

In [96]:
# (progress message, estimator, features, labels), in fitting order: the three
# baselines on the train split first, then the three on train + original data.
fit_plan = [
    ("Training Untuned XGB", untuned_xgb, x, y),
    ("Training Untuned RF", untuned_rf, x, y),
    ("Training Untuned LogReg", untuned_logreg, x, y),
    ("Training Untuned XGB with Original Dataset", untuned_with_orig_xgb, x_combined, y_combined),
    ("Training Untuned RF with Original Dataset", untuned_with_orig_rf, x_combined, y_combined),
    ("Training Untuned LogReg with Original Dataset", untuned_with_orig_logreg, x_combined, y_combined),
]

for banner, estimator, features, labels in fit_plan:
    print(banner)
    estimator.fit(features, labels)

Training Untuned XGB
Training Untuned RF
Training Untuned LogReg
Training Untuned XGB with Original Dataset
Training Untuned RF with Original Dataset
Training Untuned LogReg with Original Dataset


In [97]:
# Fitted baselines keyed by the stem of the submission file each one writes.
# Insertion order matters: the first three entries are the train-only models.
models = dict(
    untuned_xgb=untuned_xgb,
    untuned_rf=untuned_rf,
    untuned_logreg=untuned_logreg,
    untuned_with_orig_xgb=untuned_with_orig_xgb,
    untuned_with_orig_rf=untuned_with_orig_rf,
    untuned_with_orig_logreg=untuned_with_orig_logreg,
)

# One submission CSV per model: predict class ids on the test split, decode
# them back to the original labels, and write them under the test index.
for name, model in models.items():
    decoded_labels = label_encoder.inverse_transform(model.predict(test_df))
    out_pd = pd.DataFrame({"Target": decoded_labels}, index=test_df.index)
    out_pd.to_csv(f"{name}.csv", columns=["Target"], index=True)


In [98]:
# Soft-voting ensemble over all untuned models: average the per-class
# probabilities, then take the most probable class for each test row.
# The probabilities are collected explicitly rather than accumulated through
# `None += array` / `except TypeError`, which would also swallow a genuine
# TypeError raised inside predict_proba and silently restart the sum.
member_probas = [model.predict_proba(test_df) for model in models.values()]
ensemble_preds = sum(member_probas) / len(member_probas)

ensemble_preds = np.argmax(ensemble_preds, axis=1)
ensemble_preds = label_encoder.inverse_transform(ensemble_preds)
out_pd = pd.DataFrame(index=test_df.index)
out_pd["Target"] = ensemble_preds
out_pd.to_csv("untuned_ensemble.csv", columns=["Target"], index=True)

In [99]:
# Soft-voting ensemble over the first three entries of `models` only, i.e. the
# baselines fitted without the original dataset.
no_orig_models = list(models.values())[:3]
member_probas = [model.predict_proba(test_df) for model in no_orig_models]

# Average over the models actually summed. Dividing by len(models) (all six)
# scaled every probability by one half; argmax hid the error, but the averaged
# values themselves were wrong.
ensemble_preds = sum(member_probas) / len(member_probas)

ensemble_preds = np.argmax(ensemble_preds, axis=1)
ensemble_preds = label_encoder.inverse_transform(ensemble_preds)
out_pd = pd.DataFrame(index=test_df.index)
out_pd["Target"] = ensemble_preds
out_pd.to_csv("untuned_ensemble_no_orig.csv", columns=["Target"], index=True)

In [104]:
# Tuned XGBoost is trained on the train split with low-frequency columns
# removed; the test split gets the same pruning.
xgb_inp_df = prune_low_freq(train_df.copy())
xgb_test_df = prune_low_freq(test_df.copy())

x = xgb_inp_df.drop(columns=["Target"])
y = label_encoder.fit_transform(xgb_inp_df["Target"])

tuned_xgb.fit(x, y)

# Hard predictions, decoded back to the original labels, for the submission.
tuned_xgb_preds = label_encoder.inverse_transform(tuned_xgb.predict(xgb_test_df))
out_pd = pd.DataFrame({"Target": tuned_xgb_preds}, index=xgb_test_df.index)
out_pd.to_csv("tuned_xgb.csv", columns=["Target"], index=True)

# Class probabilities, kept for the tuned ensemble.
tuned_xgb_predproba = tuned_xgb.predict_proba(xgb_test_df)

In [101]:
# Tuned random forest is trained on train + original rows, with the raw
# (un-encoded) "Target" labels as the response.
rf_inp_df = pd.concat([train_df.copy(), original_df], axis=0)

x = rf_inp_df.drop(columns=["Target"])
y = rf_inp_df["Target"]

tuned_rf.fit(x, y)

# Predictions are already label values, so no decoding step is needed.
tuned_rf_preds = tuned_rf.predict(test_df)
out_pd = pd.DataFrame({"Target": tuned_rf_preds}, index=test_df.index)
out_pd.to_csv("tuned_rf.csv", columns=["Target"], index=True)

# Class probabilities, kept for the tuned ensemble.
tuned_rf_predproba = tuned_rf.predict_proba(test_df)

In [102]:
# Degree-2 polynomial expansion of the continuous columns; every other column
# is passed through unchanged.
poly_feats = Pipeline(steps=[("poly_feats", PolynomialFeatures(degree=2))])
col_transformer = ColumnTransformer(
    transformers=[("numerical_pipeline", poly_feats, cont_features)],
    remainder="passthrough",
)

# Tuned logistic regression is trained on the train split only, with the raw
# (un-encoded) "Target" labels as the response.
logreg_inp_df = train_df.copy()
y = logreg_inp_df["Target"]

# Fit the transformer on the training features, then reuse it on the test set.
x = col_transformer.fit_transform(logreg_inp_df.drop(columns=["Target"]))
test_x = col_transformer.transform(test_df)

tuned_logreg.fit(x, y)

# Predictions are already label values, so no decoding step is needed.
tuned_logreg_preds = tuned_logreg.predict(test_x)
out_pd = pd.DataFrame({"Target": tuned_logreg_preds}, index=test_df.index)
out_pd.to_csv("tuned_logreg.csv", columns=["Target"], index=True)

# Class probabilities, kept for the tuned ensemble.
tuned_logreg_predproba = tuned_logreg.predict_proba(test_x)

In [103]:
# Soft-voting ensemble of the three tuned models.
# tuned_rf and tuned_logreg were fitted on the raw string labels, while
# tuned_xgb was fitted on label_encoder's integer ids. Adding the probability
# matrices and decoding the argmax with label_encoder is only valid if column
# k means the same class in all three, so check that explicitly instead of
# relying on it silently.
for fitted_model in (tuned_rf, tuned_logreg):
    assert list(fitted_model.classes_) == list(label_encoder.classes_), (
        "predict_proba column order does not match label_encoder.classes_"
    )
assert (
    tuned_xgb_predproba.shape
    == tuned_rf_predproba.shape
    == tuned_logreg_predproba.shape
), "probability matrices must share the same (rows, classes) shape"

ensemble_preds = tuned_xgb_predproba + tuned_rf_predproba + tuned_logreg_predproba
ensemble_preds /= 3

ensemble_preds = np.argmax(ensemble_preds, axis=1)
ensemble_preds = label_encoder.inverse_transform(ensemble_preds)
out_pd = pd.DataFrame(index=test_df.index)
out_pd["Target"] = ensemble_preds
out_pd.to_csv("tuned_ensemble.csv", columns=["Target"], index=True)