In [1]:
import datetime
import gc
import pprint
from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import umap
import optuna
import joblib

from autogluon.tabular import TabularDataset, TabularPredictor

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, label_binarize, PolynomialFeatures, RobustScaler
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.base import clone
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score, accuracy_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.tree import plot_tree
from sklearn.multiclass import OneVsRestClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer

import statsmodels.api as sm

from scipy import stats

import xgboost

# Run a full garbage-collection pass; the number shown under this cell is the
# value returned by gc.collect().
gc.collect()

3000

In [2]:
# Load the three CSV inputs. train.csv and test.csv use their first column as
# the index; original.csv is semicolon-separated and keeps the default index.
train_df = pd.read_csv("data/train.csv", index_col=0)
test_df = pd.read_csv("data/test.csv", index_col=0)
original_df = pd.read_csv("data/original.csv", sep=";")

# The feature set is defined by the columns present in test.csv.
train_features = test_df.columns

# Columns that are cast to a categorical dtype later in the notebook.
cat_features = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]

# Every remaining feature is treated as continuous (order follows test.csv).
cont_features = [name for name in train_features if name not in cat_features]

In [3]:
# Give every categorical feature one shared CategoricalDtype across the three
# frames, so that a value maps to the same category in train, test and original.
for feat in cat_features:
    # Pool the column from all three frames. Missing values are dropped because
    # CategoricalDtype rejects null categories (such cells simply stay missing
    # after the cast), and the levels are sorted so the category order does not
    # depend on set iteration order.
    observed = pd.concat(
        [frame[feat] for frame in (train_df, test_df, original_df)],
        ignore_index=True,
    ).dropna()
    dtype = pd.CategoricalDtype(categories=sorted(observed.unique()), ordered=False)
    for df in [train_df, test_df, original_df]:
        df[feat] = df[feat].astype(dtype)

# Stack the original rows under the training rows; the index is rebuilt from 0
# because the two frames were loaded with different indexes.
combined_df = pd.concat([train_df, original_df], axis=0, ignore_index=True)

In [16]:
# Baseline run: AutoGluon with no presets, fitted on train_df only
# (original_df is not part of this training set).
train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

predictor = TabularPredictor(label="Target")
predictor = predictor.fit(train_data=train_data)

# Predicted "Target" label for every test row.
predictions = predictor.predict(test_data)

No path specified. Models will be saved in: "AutogluonModels\ag-20240618_230351"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240618_230351"
AutoGluon Version:  1.1.0
Python Version:     3.11.1
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       13.70 GB / 31.93 GB (42.9%)
Disk Space Avail:   584

In [17]:
# Write the predictions to CSV: the test index plus a single "Target" column.
out_pd = pd.DataFrame({"Target": list(predictions)}, index=test_df.index)
out_pd.to_csv(
    "autogluon_no_orig.csv",
    columns=["Target"],
    index=True,
)

In [18]:
# Same AutoGluon run with no presets, but fitted on combined_df
# (train_df with the original_df rows appended).
train_data = TabularDataset(combined_df)
test_data = TabularDataset(test_df)

predictor = TabularPredictor(label="Target")
predictor = predictor.fit(train_data=train_data)

# Predicted "Target" label for every test row.
predictions = predictor.predict(test_data)

No path specified. Models will be saved in: "AutogluonModels\ag-20240618_231136"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240618_231136"
AutoGluon Version:  1.1.0
Python Version:     3.11.1
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       13.77 GB / 31.93 GB (43.1%)
Disk Space Avail:   581

In [19]:
# Write the predictions to CSV: the test index plus a single "Target" column.
out_pd = pd.DataFrame({"Target": list(predictions)}, index=test_df.index)
out_pd.to_csv(
    "autogluon.csv",
    columns=["Target"],
    index=True,
)

In [5]:
# Expand the continuous features into polynomial terms up to degree 3, train
# AutoGluon on categorical + polynomial features, and persist the fitted
# transformer so the same expansion can be re-applied later.

# Where the fitted PolynomialFeatures object is pickled. The predictor itself is
# saved under a different directory ("AutogluonModels/ag-misc/"), so nothing
# else creates this folder: make sure it exists *before* the long training run
# instead of failing in joblib.dump once training has finished.
# NOTE(review): the recorded output of this cell shows the models being saved
# to "AutogluonModels/misc/"; confirm whether both artifacts are meant to share
# one directory.
poly_feats_path = "AutogluonModels/misc/poly_feats.pkl"
Path(poly_feats_path).parent.mkdir(parents=True, exist_ok=True)

poly_feats = PolynomialFeatures(3)

train_cont = combined_df[cont_features]
test_cont = test_df[cont_features]

# Fit the expansion on the training rows, then reuse it unchanged on the test rows.
poly_train = poly_feats.fit_transform(train_cont)
poly_feature_names = poly_feats.get_feature_names_out()
poly_train = pd.DataFrame(poly_train, index=train_cont.index, columns=poly_feature_names)

poly_test = poly_feats.transform(test_cont)
poly_test = pd.DataFrame(poly_test, index=test_cont.index, columns=poly_feature_names)

# Swap the raw continuous columns for their polynomial expansion; the remaining
# columns (categoricals, and "Target" on the training side) are kept as they are.
train_data = combined_df.drop(columns=cont_features)
train_data = pd.concat([train_data, poly_train], axis=1)
test_data = test_df.drop(columns=cont_features)
test_data = pd.concat([test_data, poly_test], axis=1)

train_data = TabularDataset(train_data)
test_data = TabularDataset(test_data)

predictor = TabularPredictor(label="Target", path="AutogluonModels/ag-misc/").fit(train_data=train_data, presets="medium_quality")
predictions = predictor.predict(test_data)

# Kept as the last expression so the cell still displays the written file list.
joblib.dump(poly_feats, poly_feats_path)

Presets specified: ['medium_quality']
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/misc/"
AutoGluon Version:  1.1.0
Python Version:     3.11.1
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       17.48 GB / 31.93 GB (54.7%)
Disk Space Avail:   561.55 GB / 1863.00 GB (30.1%)
Train Data Rows:    80942
Train Data Columns: 1348
Label Column:       Target
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	3 unique label values:  ['Graduate', 'Dropout', 'Enrolled']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['bin

['AutogluonModels/misc/poly_feats.pkl']

In [6]:
# Permutation feature importance for the fitted predictor, saved to CSV.
# NOTE(review): the importance is scored on train_data, i.e. the rows the
# predictor was fitted on; AutoGluon's documentation appears to recommend
# held-out data for this call - confirm before relying on the ranking.
feature_importances = predictor.feature_importance(data=train_data)
feature_importances.to_csv("ag_feature_importances.csv")

These features in provided data are not utilized by the predictor and will be ignored: ['1']
Computing feature importance via permutation shuffling for 1347 features using 5000 rows with 5 shuffle sets...
	8749.23s	= Expected runtime (1749.85s per shuffle set)
	4314.18s	= Actual runtime (Completed 5 of 5 shuffle sets)


In [16]:
# Write the predictions to CSV: the test index plus a single "Target" column.
out_pd = pd.DataFrame({"Target": list(predictions)}, index=test_df.index)
out_pd.to_csv(
    "autogluon_poly_orig.csv",
    columns=["Target"],
    index=True,
)