## Imports

In [156]:
import string
letters = list(string.ascii_lowercase)

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

pd.options.display.max_columns = None
sns.set_theme()

## Ler dados

In [157]:
# Load the pre-processed student performance table.
# NOTE(review): the one-hot output further down contains a `ParentalSupport_nan`
# column, so some ParentalSupport values arrive here as missing. Possibly a
# literal "None" category is being parsed as NaN by read_csv's default NA
# handling -- confirm against the CSV (keep_default_na=False would preserve it).
df = pd.read_csv("../data/Student_performance_processed.csv")
# Last expression: display (rows, columns) as a quick sanity check.
df.shape

(2392, 15)

In [158]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,Some college,19.833723,7,Y,Moderate,N,N,Y,N,2.929196,C
1,1002,18,0,0,High School,15.408756,0,N,Low,N,N,N,N,3.042915,B
2,1003,15,0,2,Bachelor's,4.21057,26,N,Moderate,N,N,N,N,0.112602,E
3,1004,17,1,0,Bachelor's,10.028829,14,N,High,Y,N,N,N,2.054218,D
4,1005,17,1,0,Some college,4.672495,17,Y,High,N,N,N,N,1.288061,E


In [159]:
# Name of the label column; it is appended to `cols_model` and popped off the
# feature frame before training.
TARGET = "GradeClass"

### Remover colunas

In [160]:
# Remove the identifier and the Gender/Ethnicity columns; none of them is used
# in the rest of the notebook.
# In-place drop: re-running this cell on the same frame raises KeyError because
# the columns are already gone.
df.drop(["StudentID", "Gender", "Ethnicity"], axis=1, inplace=True)

## Corrigir tipos

Os tipos estão corretos!

In [161]:
# Inspect the dtype pandas inferred for each remaining column.
df.dtypes

Age                    int64
ParentalEducation     object
StudyTimeWeekly      float64
Absences               int64
Tutoring              object
ParentalSupport       object
Extracurricular       object
Sports                object
Music                 object
Volunteering          object
GPA                  float64
GradeClass            object
dtype: object

## Categorização

In [162]:
def create_labels(bins):
    """Build one sortable text label per interval defined by ``bins``.

    Each label has the form ``"<prefix>_(<left>_<right>]"``: the two edges are
    rounded to two decimals and ``<prefix>`` is a lowercase-letter code that
    grows with the interval position, so sorting the labels alphabetically
    reproduces the numeric order of the intervals.

    Up to 26 intervals get a single letter (``a`` .. ``z``). With more
    intervals every prefix is widened to the same number of letters
    (``aa``, ``ab``, ...) instead of failing with an IndexError, which keeps
    the alphabetical order consistent.

    Parameters
    ----------
    bins : sequence of numbers
        Bin edges; ``len(bins) - 1`` labels are produced (none when fewer than
        two edges are given).

    Returns
    -------
    list of str
        Labels ordered like the intervals.
    """
    # Use the alphabet directly rather than a mutable module-level list.
    alphabet = string.ascii_lowercase
    n_labels = max(len(bins) - 1, 0)

    # Smallest fixed prefix width able to number every interval.
    width = 1
    while len(alphabet) ** width < n_labels:
        width += 1

    labels = []
    for i in range(n_labels):
        # Write i in base 26 with `width` letters, most significant first.
        prefix = ""
        remainder = i
        for _ in range(width):
            remainder, digit = divmod(remainder, len(alphabet))
            prefix = alphabet[digit] + prefix
        labels.append(f"{prefix}_({round(bins[i], 2)}_{round(bins[i+1], 2)}]")

    return labels

def categorize_column(data, column, q=5, bins=None, suffix="cat"):
    """Add a string-labelled, binned version of ``column`` to ``data``.

    The new column is named ``f"{column}_{suffix}"`` and holds the labels
    produced by ``create_labels`` for the bin each value falls into.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place.
    column : str
        Name of the numeric column to bin.
    q : int, default 5
        Number of quantiles requested when ``bins`` is not given.
    bins : sequence of numbers, optional
        Explicit bin edges. When omitted, quantile edges are computed from the
        data and the outermost edges are replaced by -inf / +inf.
    suffix : str, default "cat"
        Suffix appended to ``column`` to name the new column.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.
    """
    if bins is None:
        # duplicates="drop" keeps qcut from raising when heavy ties make
        # several quantile edges coincide; the result then simply has fewer
        # bins than `q`.
        _, bins = pd.qcut(data[column], q=q, retbins=True, duplicates="drop")
        # Open both ends so values outside the observed range still get a bin.
        bins[0] = -np.inf
        bins[-1] = np.inf

    labels = create_labels(bins)

    binned = pd.cut(data[column], bins=bins, labels=labels)
    # Store plain strings rather than a Categorical.
    data[f"{column}_{suffix}"] = binned.astype(str)

    return data

In [163]:
# Numeric columns that will be binned into quantiles.
numeric_cols = list(df.select_dtypes(include='number').columns)
# Age is left out of the binning; it enters the model as a raw number.
# list.remove raises ValueError if 'Age' is not among the numeric columns.
numeric_cols.remove('Age')

In [164]:
# Add a `<col>_cat` column (default q=5 quantile bins) for every numeric column.
# categorize_column mutates `df` in place, so its return value is ignored.
for col in numeric_cols:
    categorize_column(df, col)

In [165]:
# Preview the frame with the new `_cat` columns appended.
df.head()

Unnamed: 0,Age,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass,StudyTimeWeekly_cat,Absences_cat,GPA_cat
0,17,Some college,19.833723,7,Y,Moderate,N,N,Y,N,2.929196,C,e_(15.61_inf],b_(6.0_12.0],e_(2.77_inf]
1,18,High School,15.408756,0,N,Low,N,N,N,N,3.042915,B,d_(11.48_15.61],a_(-inf_6.0],e_(2.77_inf]
2,15,Bachelor's,4.21057,26,N,Moderate,N,N,N,N,0.112602,E,b_(3.99_7.94],e_(23.0_inf],a_(-inf_1.01]
3,17,Bachelor's,10.028829,14,N,High,Y,N,N,N,2.054218,D,c_(7.94_11.48],c_(12.0_18.0],c_(1.61_2.2]
4,17,Some college,4.672495,17,Y,High,N,N,N,N,1.288061,E,b_(3.99_7.94],c_(12.0_18.0],b_(1.01_1.61]


In [166]:
# Confirm the new `_cat` columns are stored as plain strings (object dtype).
df.dtypes

Age                      int64
ParentalEducation       object
StudyTimeWeekly        float64
Absences                 int64
Tutoring                object
ParentalSupport         object
Extracurricular         object
Sports                  object
Music                   object
Volunteering            object
GPA                    float64
GradeClass              object
StudyTimeWeekly_cat     object
Absences_cat            object
GPA_cat                 object
dtype: object

## Feature engineering

In [167]:
# List the distinct quantile labels generated for GPA.
df['GPA_cat'].unique()

array(['e_(2.77_inf]', 'a_(-inf_1.01]', 'c_(1.61_2.2]', 'b_(1.01_1.61]',
       'd_(2.2_2.77]'], dtype=object)

In [168]:
# Hand-picked StudyTimeWeekly edges, open at both ends.
bins = [-np.inf, 5, 10, 15, np.inf]
# Adds `StudyTimeWeekly_mcat` to df in place; `_ =` suppresses the cell output.
_ = categorize_column(df, "StudyTimeWeekly", bins=bins, suffix="mcat")

In [169]:
# Hand-picked Absences edges in steps of 5, open at both ends.
bins = [-np.inf, 5, 10, 15, 20, 25, np.inf]
# Adds `Absences_mcat` to df in place; `_ =` suppresses the cell output.
_ = categorize_column(df, "Absences", bins=bins, suffix="mcat")

In [170]:
# Hand-picked GPA edges, open at both ends.
bins = [-np.inf, 2.0, 2.5, 3.0, 3.5, np.inf]
# Adds `GPA_mcat` to df in place; `_ =` suppresses the cell output.
# Note: `GPA_mcat` is not part of `cols_model` in the feature-selection cell.
_ = categorize_column(df, "GPA", bins=bins, suffix="mcat")

In [171]:
# Preview the frame with both the quantile (`_cat`) and manual (`_mcat`) bins.
df.head()

Unnamed: 0,Age,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass,StudyTimeWeekly_cat,Absences_cat,GPA_cat,StudyTimeWeekly_mcat,Absences_mcat,GPA_mcat
0,17,Some college,19.833723,7,Y,Moderate,N,N,Y,N,2.929196,C,e_(15.61_inf],b_(6.0_12.0],e_(2.77_inf],d_(15_inf],b_(5_10],c_(2.5_3.0]
1,18,High School,15.408756,0,N,Low,N,N,N,N,3.042915,B,d_(11.48_15.61],a_(-inf_6.0],e_(2.77_inf],d_(15_inf],a_(-inf_5],d_(3.0_3.5]
2,15,Bachelor's,4.21057,26,N,Moderate,N,N,N,N,0.112602,E,b_(3.99_7.94],e_(23.0_inf],a_(-inf_1.01],a_(-inf_5],f_(25_inf],a_(-inf_2.0]
3,17,Bachelor's,10.028829,14,N,High,Y,N,N,N,2.054218,D,c_(7.94_11.48],c_(12.0_18.0],c_(1.61_2.2],c_(10_15],c_(10_15],b_(2.0_2.5]
4,17,Some college,4.672495,17,Y,High,N,N,N,N,1.288061,E,b_(3.99_7.94],c_(12.0_18.0],b_(1.01_1.61],a_(-inf_5],d_(15_20],a_(-inf_2.0]


### Feature selection

In [172]:
# Columns used for modelling: the selected features plus the target.
# GPA and its binned variants are left out -- presumably because GradeClass is
# derived from GPA; confirm.
cols_model = [
              'Age',
              'Tutoring',
              'ParentalSupport',
              'StudyTimeWeekly_mcat',
              'Absences_mcat',
              TARGET
]

In [173]:
# Take an explicit copy so the encoding cells below modify an independent frame
# instead of a selection of `df`. Mutating the bare selection triggers pandas'
# SettingWithCopyWarning, which is only hidden here by the global
# warnings.filterwarnings("ignore").
x = df[cols_model].copy()
# Split off the label; after the pop `x` holds only the feature columns.
y = x.pop(TARGET)

### Encode

In [174]:
# Binned columns that get an ordinal (integer-rank) encoding.
oe_cat = ["StudyTimeWeekly_mcat", "Absences_mcat"]

In [175]:
# Fit an ordinal encoder on the binned label columns. The letter prefix on each
# label (a_, b_, ...) makes alphabetical category order match bin order.
# Note: the name `enc` is rebound to a OneHotEncoder a few cells below, so this
# fitted ordinal encoder is not kept or saved anywhere.
enc = OrdinalEncoder()
enc.fit(x[oe_cat])

In [176]:
# Replace the label strings with their ordinal codes (floats 0.0, 1.0, ...).
x[oe_cat] = enc.transform(x[oe_cat])

In [177]:
# Nominal columns that get a one-hot encoding.
ohe_cat = ["Tutoring", "ParentalSupport"]

In [178]:
# One-hot encoder for the nominal columns:
#   - handle_unknown="ignore": categories unseen at fit time do not raise on
#     transform.
#   - drop="if_binary": two-category features keep a single indicator column.
# This rebinds `enc`, replacing the OrdinalEncoder fitted above.
enc = OneHotEncoder(handle_unknown="ignore", drop="if_binary")
enc.fit(x[ohe_cat])

In [179]:
def create_ohe_cols(ohe_cat, enc):
    """Build the output column names of a fitted one-hot encoder.

    Names follow ``"<feature>_<category>"`` and are listed feature by feature
    in the order of ``enc.categories_``, skipping any category the encoder
    drops so the list lines up with the columns of ``enc.transform(...)``.

    Parameters
    ----------
    ohe_cat : sequence of str
        Input feature names, in the order the encoder was fitted on.
    enc : fitted encoder
        Object exposing ``categories_`` (one sequence of categories per
        feature). If it also exposes ``drop_idx_`` (as scikit-learn's
        OneHotEncoder does), that attribute decides which category, if any,
        is omitted per feature, so every ``drop`` setting is handled. Without
        ``drop_idx_`` the ``drop="if_binary"`` rule is assumed: features with
        exactly two categories keep only the second one.

    Returns
    -------
    list of str
        One name per encoded output column.
    """
    unset = object()
    drop_idx = getattr(enc, "drop_idx_", unset)

    ohe_cols = []
    for i, col in enumerate(ohe_cat):
        categories = list(enc.categories_[i])

        if drop_idx is unset:
            # No drop information available: assume drop="if_binary".
            dropped = 0 if len(categories) == 2 else None
        elif drop_idx is None:
            # Encoder keeps every category of every feature.
            dropped = None
        else:
            # Per-feature index of the dropped category (None = keep all).
            dropped = drop_idx[i]

        ohe_cols.extend(
            f"{col}_{category}"
            for j, category in enumerate(categories)
            if dropped is None or j != dropped
        )

    return ohe_cols

In [180]:
# Names for the encoder's output columns, in transform order.
ohe_cols = create_ohe_cols(ohe_cat, enc)

In [181]:
# Append the indicator columns; .toarray() densifies the encoder's output
# so it can be assigned as DataFrame columns.
x[ohe_cols] = enc.transform(x[ohe_cat]).toarray()

In [182]:
# The raw nominal columns are now redundant with their indicator columns.
# In-place drop: re-running this cell raises KeyError.
x.drop(ohe_cat, axis=1, inplace=True)

In [183]:
# Preview the fully encoded feature matrix.
x.head()

Unnamed: 0,Age,StudyTimeWeekly_mcat,Absences_mcat,Tutoring_Y,ParentalSupport_High,ParentalSupport_Low,ParentalSupport_Moderate,ParentalSupport_Very High,ParentalSupport_nan
0,17,3.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,18,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,15,0.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0
3,17,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0
4,17,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0


### Split

In [184]:
# Hold out 33% of the rows for evaluation; fixed seed for reproducibility.
# NOTE(review): the split is not stratified on `y` -- consider stratify=y if the
# grade classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

## Train

In [185]:
# Random forest with default hyper-parameters and a fixed seed.
clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)

### Importância

In [186]:
# Raw importances, one value per column of `x` in column order.
clf.feature_importances_

array([0.16021012, 0.13596814, 0.55113099, 0.0501709 , 0.02345555,
       0.02131982, 0.01945893, 0.02193525, 0.01635031])

In [187]:
# Pair every feature name with its importance, most important first.
imp = pd.DataFrame(
    {
        "column": x.columns,
        "importances": clf.feature_importances_,
    }
).sort_values(by="importances", ascending=False)

In [188]:
# Show the five most important encoded columns.
imp.head()

Unnamed: 0,column,importances
2,Absences_mcat,0.551131
0,Age,0.16021
1,StudyTimeWeekly_mcat,0.135968
3,Tutoring_Y,0.050171
4,ParentalSupport_High,0.023456


In [189]:
def group_imps(imp, ohe_cat, ohe_cols):
    """Collapse one-hot indicator importances back onto their source feature.

    Parameters
    ----------
    imp : pandas.DataFrame
        Frame with a ``"column"`` column (encoded feature names) and an
        ``"importances"`` column.
    ohe_cat : sequence of str
        Names of the features that were one-hot encoded.
    ohe_cols : sequence of str
        Names of the indicator columns generated for ``ohe_cat``
        (``"<feature>_<category>"``).

    Returns
    -------
    pandas.DataFrame
        Columns ``"column"`` and ``"importances"``: first one row per entry of
        ``ohe_cat`` holding the summed importance of its indicator columns,
        then one row per remaining column of ``imp`` in the order it appears
        there.
    """
    ohe_cols = list(ohe_cols)

    imp_grouped = {"column": [], "importances": []}

    for col in ohe_cat:
        # Match only this feature's own indicator columns. A substring/regex
        # match on the feature name would also pick up unrelated columns that
        # merely contain it.
        prefix = f"{col}_"
        members = [name for name in ohe_cols if name.startswith(prefix)]
        total = imp.loc[imp["column"].isin(members), "importances"].sum()
        imp_grouped["column"].append(col)
        imp_grouped["importances"].append(total)

    # Columns that were not one-hot encoded pass through unchanged. Walking
    # `imp` (first occurrence per name) keeps the row order deterministic.
    passthrough = imp[~imp["column"].isin(ohe_cols)].drop_duplicates(subset="column")
    for name, value in zip(passthrough["column"], passthrough["importances"]):
        imp_grouped["column"].append(name)
        imp_grouped["importances"].append(value)

    return pd.DataFrame(imp_grouped)

In [190]:
# Importance per original feature, most important first.
imp_grouped = group_imps(imp, ohe_cat, ohe_cols).sort_values(
    by="importances",
    ascending=False,
)

In [191]:
# Display the grouped importances.
imp_grouped

Unnamed: 0,column,importances
3,Absences_mcat,0.551131
4,Age,0.16021
2,StudyTimeWeekly_mcat,0.135968
1,ParentalSupport,0.10252
0,Tutoring,0.050171


### Performances

In [192]:
# Display the fitted estimator.
clf

In [193]:
# Predict grade classes for the held-out rows.
y_pred = clf.predict(x_test)

In [194]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(y_test, y_pred)

0.6468354430379747

In [195]:
# NOTE(review): leftover scratch arithmetic. Neither operand appears anywhere
# else in this notebook (the accuracy computed above is ~0.647); presumably a
# manual comparison of two scores from another run -- confirm or remove.
86.46864686468647-82.83828382838284

3.6303630363036348

## Salvar

In [196]:
# Reload the processed table from disk, discarding the columns added during the
# exploration above, before building the dataset that gets saved.
df = pd.read_csv("../data/Student_performance_processed.csv")
# Last expression: display (rows, columns) as a quick sanity check.
df.shape

(2392, 15)

In [197]:
# Label column (same value as defined near the top of the notebook).
TARGET = "GradeClass"

In [198]:
# Preview the reloaded frame.
df.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,Some college,19.833723,7,Y,Moderate,N,N,Y,N,2.929196,C
1,1002,18,0,0,High School,15.408756,0,N,Low,N,N,N,N,3.042915,B
2,1003,15,0,2,Bachelor's,4.21057,26,N,Moderate,N,N,N,N,0.112602,E
3,1004,17,1,0,Bachelor's,10.028829,14,N,High,Y,N,N,N,2.054218,D
4,1005,17,1,0,Some college,4.672495,17,Y,High,N,N,N,N,1.288061,E


In [199]:
# Columns written to the feature-engineered file. Unlike the modelling section
# above, StudyTimeWeekly and Absences are kept as raw numbers here rather than
# as their `_mcat` binned versions.
cols_model = [
              'Age',
              'StudyTimeWeekly', 
              'Absences', 
              'Tutoring', 
              'ParentalSupport',
              TARGET
]

In [200]:
# Keep only the columns to export. The explicit copy makes the column
# assignments below act on an independent frame; assigning into the bare
# selection triggers pandas' SettingWithCopyWarning (hidden here only by the
# global warnings.filterwarnings("ignore")).
df = df[cols_model].copy()

In [201]:
# Nominal columns to one-hot encode in the exported dataset.
ohe_cat = [
        "Tutoring", 
        "ParentalSupport"]

In [202]:
# Fresh one-hot encoder fitted on the full reloaded frame; this is the object
# that gets pickled at the end of the notebook.
#   - handle_unknown="ignore": categories unseen at fit time do not raise on
#     transform.
#   - drop="if_binary": two-category features keep a single indicator column.
enc = OneHotEncoder(handle_unknown="ignore", drop="if_binary")
enc.fit(df[ohe_cat])

In [203]:
def create_ohe_cols(ohe_cat, enc):
    """Build the output column names of a fitted one-hot encoder.

    Note: this redefines the helper of the same name declared earlier in the
    notebook; the later definition is the one in effect from here on.

    Names follow ``"<feature>_<category>"`` and are listed feature by feature
    in the order of ``enc.categories_``, skipping any category the encoder
    drops so the list lines up with the columns of ``enc.transform(...)``.

    Parameters
    ----------
    ohe_cat : sequence of str
        Input feature names, in the order the encoder was fitted on.
    enc : fitted encoder
        Object exposing ``categories_`` (one sequence of categories per
        feature). If it also exposes ``drop_idx_`` (as scikit-learn's
        OneHotEncoder does), that attribute decides which category, if any,
        is omitted per feature, so every ``drop`` setting is handled. Without
        ``drop_idx_`` the ``drop="if_binary"`` rule is assumed: features with
        exactly two categories keep only the second one.

    Returns
    -------
    list of str
        One name per encoded output column.
    """
    unset = object()
    drop_idx = getattr(enc, "drop_idx_", unset)

    ohe_cols = []
    for i, col in enumerate(ohe_cat):
        categories = list(enc.categories_[i])

        if drop_idx is unset:
            # No drop information available: assume drop="if_binary".
            dropped = 0 if len(categories) == 2 else None
        elif drop_idx is None:
            # Encoder keeps every category of every feature.
            dropped = None
        else:
            # Per-feature index of the dropped category (None = keep all).
            dropped = drop_idx[i]

        ohe_cols.extend(
            f"{col}_{category}"
            for j, category in enumerate(categories)
            if dropped is None or j != dropped
        )

    return ohe_cols

In [204]:
# Names for the encoder's output columns, in transform order.
ohe_cols = create_ohe_cols(ohe_cat, enc)

In [205]:
# Append the indicator columns; .toarray() densifies the encoder's output
# so it can be assigned as DataFrame columns.
df[ohe_cols] = enc.transform(df[ohe_cat]).toarray()

In [206]:
# The raw nominal columns are now redundant with their indicator columns.
# In-place drop: re-running this cell raises KeyError.
df.drop(ohe_cat, axis=1, inplace=True)

In [207]:
# Preview the frame exactly as it will be written to disk.
df.head()

Unnamed: 0,Age,StudyTimeWeekly,Absences,GradeClass,Tutoring_Y,ParentalSupport_High,ParentalSupport_Low,ParentalSupport_Moderate,ParentalSupport_Very High,ParentalSupport_nan
0,17,19.833723,7,C,1.0,0.0,0.0,1.0,0.0,0.0
1,18,15.408756,0,B,0.0,0.0,1.0,0.0,0.0,0.0
2,15,4.21057,26,E,0.0,0.0,0.0,1.0,0.0,0.0
3,17,10.028829,14,D,0.0,1.0,0.0,0.0,0.0,0.0
4,17,4.672495,17,E,1.0,1.0,0.0,0.0,0.0,0.0


In [208]:
# Write the feature-engineered dataset; index=False leaves the row index out of
# the file.
df.to_csv("../data/Student_performance_fe.csv", index=False)

In [210]:
# NOTE(review): this import sits here rather than in the imports cell at the
# top of the notebook.
import pickle

# Persist the fitted OneHotEncoder so the same encoding can be re-applied
# elsewhere. Only this encoder is saved; the OrdinalEncoder from the modelling
# section is not.
with open('../pkls/ohe.pickle', 'wb') as handle:
    pickle.dump(enc, handle, protocol=pickle.HIGHEST_PROTOCOL)