In [45]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import umap
import datetime
import optuna
import pprint
import joblib

from autogluon.tabular import TabularDataset, TabularPredictor

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, label_binarize, PolynomialFeatures, RobustScaler
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold
from sklearn.base import clone
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score, accuracy_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, RFECV
from sklearn import metrics
from sklearn.tree import plot_tree
from sklearn.multiclass import OneVsRestClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer

import statsmodels.api as sm

from scipy import stats

import xgboost

import gc

# Notebook-wide pandas display settings: show every column, cap the number of rows.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 20),
):
    pd.set_option(option_name, option_value)

# Kept as the cell's last expression so its return value is displayed.
gc.collect()

2161

In [46]:
# train/test use their first CSV column as the index; the original file is ';'-separated.
train_df = pd.read_csv("../data/train.csv", index_col=0)
test_df = pd.read_csv("../data/test.csv", index_col=0)
original_df = pd.read_csv("../data/original.csv", sep=";")

# The feature set is defined by the columns of the test frame.
train_features = test_df.columns

# Hand-picked categorical columns; every other feature is treated as continuous.
cat_features = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]
cont_features = train_features[~train_features.isin(cat_features)].tolist()

In [47]:
# Distinct-value count of every feature column (columns of test_df), measured on train_df.
for col, unique_vals in train_df[test_df.columns].nunique().items():
    print(f"{col:>40} nunique {unique_vals}")

                          Marital status nunique 6
                        Application mode nunique 22
                       Application order nunique 8
                                  Course nunique 19
              Daytime/evening attendance nunique 2
                  Previous qualification nunique 21
          Previous qualification (grade) nunique 110
                             Nacionality nunique 18
                  Mother's qualification nunique 35
                  Father's qualification nunique 39
                     Mother's occupation nunique 40
                     Father's occupation nunique 56
                         Admission grade nunique 668
                               Displaced nunique 2
               Educational special needs nunique 2
                                  Debtor nunique 2
                 Tuition fees up to date nunique 2
                                  Gender nunique 2
                      Scholarship holder nunique 2
                   

In [50]:
# Sweep a cardinality threshold: every feature whose number of distinct values in
# train_df is <= thresh is cast to a pandas categorical, and an XGBoost classifier
# is scored with stratified 10-fold cross-validation. `scores` maps thresh -> fold scores.
seed = 0
label_enc = LabelEncoder()
label_enc.fit(train_df["Target"])

# No shuffling: the folds are identical for every threshold.
kfold = StratifiedKFold(n_splits=10)
# The seed is passed explicitly so the run does not rely on the library default.
clf = xgboost.XGBClassifier(enable_categorical=True, n_jobs=-1, random_state=seed)

unique_val_thresh = np.arange(10, 31)

# Loop-invariant work, done once instead of once per threshold.
y = label_enc.transform(train_df["Target"])
n_unique = train_df[test_df.columns].nunique()

cat_dtypes = {}  # column -> CategoricalDtype, built lazily on first use
cv_cache = {}    # tuple of categorical columns -> CV scores for that selection

scores = {}

for thresh in unique_val_thresh:
    cat_features = [col for col in test_df.columns if n_unique[col] <= thresh]
    cache_key = tuple(cat_features)

    # Consecutive thresholds often select exactly the same columns; only re-run
    # the (expensive) cross-validation when the selection actually changes.
    if cache_key not in cv_cache:
        inp_df = train_df.copy()

        for feat in cat_features:
            if feat not in cat_dtypes:
                # Categories span train, test and the original data so all three
                # frames share one category set for this column.
                cat_dtypes[feat] = pd.CategoricalDtype(
                    categories=list(set(train_df[feat]) | set(test_df[feat]) | set(original_df[feat])),
                    ordered=False,
                )
            inp_df[feat] = inp_df[feat].astype(cat_dtypes[feat])

        x = inp_df.drop(columns=["Target"])
        cv_cache[cache_key] = cross_val_score(clf, x, y, cv=kfold)

    cv_scores = cv_cache[cache_key]

    print(f"thresh: {thresh} - {np.mean(cv_scores)}")
    scores[thresh] = cv_scores
        

thresh: 10 - 0.830575727726895
thresh: 11 - 0.8311768675045842
thresh: 12 - 0.8306279708968682
thresh: 13 - 0.8309285561583846
thresh: 14 - 0.8309285561583846
thresh: 15 - 0.8309285561583846
thresh: 16 - 0.8309285561583846
thresh: 17 - 0.8309285561583846
thresh: 18 - 0.831176908498375
thresh: 19 - 0.8303665774072424
thresh: 20 - 0.8299876000615726
thresh: 21 - 0.8301966974309123
thresh: 22 - 0.8310723025932056
thresh: 23 - 0.8294649172748721
thresh: 24 - 0.8304580277217776
thresh: 25 - 0.8304580277217776
thresh: 26 - 0.8304580277217776
thresh: 27 - 0.8304580277217776
thresh: 28 - 0.8304580277217776
thresh: 29 - 0.8304580277217776
thresh: 30 - 0.8304580277217776


In [51]:
# Rank thresholds by mean cross-validation score, best first.
sorted(
    ((thresh, np.mean(cv_scores)) for thresh, cv_scores in scores.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

[(18, 0.831176908498375),
 (11, 0.8311768675045842),
 (22, 0.8310723025932056),
 (13, 0.8309285561583846),
 (14, 0.8309285561583846),
 (15, 0.8309285561583846),
 (16, 0.8309285561583846),
 (17, 0.8309285561583846),
 (12, 0.8306279708968682),
 (10, 0.830575727726895),
 (24, 0.8304580277217776),
 (25, 0.8304580277217776),
 (26, 0.8304580277217776),
 (27, 0.8304580277217776),
 (28, 0.8304580277217776),
 (29, 0.8304580277217776),
 (30, 0.8304580277217776),
 (19, 0.8303665774072424),
 (21, 0.8301966974309123),
 (20, 0.8299876000615726),
 (23, 0.8294649172748721)]