Okay heres the rundown.<br>
Features and what to do with them:<br>
- gender (target, binarize)
- race/ethnicity -> turn into a category dtype OR dummy
- lunch -> dummy (free_lunch = 1 or 0)
- test preparation course -> dummy (test_prep = 1 or 0)
- parental level of education -> explore a bit more
- when looking at scores and genders we start to see some noticeable separation...
    - reading with math, writing with math have noticeable separation per gender
    - i.e. low reading scores AND high math scores -> tend to be male
    - low writing scores AND high math scores -> tend to be male
    - the question is how we will capture this with the model..
- also lets scale the test scores. They seem normal so maybe just standard scale.
- well we got a couple of individuals with very low scores. lets see about them first. theres a very slight skew if theyre included.<br><br><br>
Scores:<br>
- First run with no param grid:<br>
.839 accuracy
- after optimizing:
0.851128

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import openml
from openml.tasks import TaskType
import numpy as np
from scipy import stats

# model imports
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

plt.rcParams['axes.labelcolor'] = 'white'
plt.rcParams['xtick.labelcolor'] = 'white'
plt.rcParams['ytick.labelcolor'] = 'white'

In [2]:
df = pd.read_csv("student-performance.csv")

In [3]:
df = pd.get_dummies(df, columns=['gender'], drop_first=True)

In [4]:
df = pd.get_dummies(df, columns=['race/ethnicity'], drop_first=True)

In [5]:
df = pd.get_dummies(df, columns=['lunch'], drop_first=True)

In [6]:
df = pd.get_dummies(df, columns=['test preparation course'], drop_first=True)

In [7]:
pd.get_dummies(df, columns=['parental level of education'], drop_first=True)

Unnamed: 0,math score,reading score,writing score,gender_male,race/ethnicity_'group B',race/ethnicity_'group C',race/ethnicity_'group D',race/ethnicity_'group E',lunch_standard,test preparation course_none,parental level of education_'bachelor\'s degree',parental level of education_'high school',parental level of education_'master\'s degree',parental level of education_'some college',parental level of education_'some high school'
0,72,72,74,0,1,0,0,0,1,1,1,0,0,0,0
1,69,90,88,0,0,1,0,0,1,0,0,0,0,1,0
2,90,95,93,0,1,0,0,0,1,1,0,0,1,0,0
3,47,57,44,1,0,0,0,0,0,1,0,0,0,0,0
4,76,78,75,1,0,1,0,0,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,88,99,95,0,0,0,0,1,1,0,0,0,1,0,0
996,62,55,55,1,0,1,0,0,0,1,0,1,0,0,0
997,59,71,65,0,0,1,0,0,0,0,0,1,0,0,0
998,68,78,77,0,0,0,1,0,1,0,0,0,0,1,0


In [8]:
# gets rid of lower outliers by nixing all values less than 3 standard deviations
# away from the mean.
for col in ['math score', 'reading score', 'writing score']:
    df = df.drop(
        index=df[(np.abs(stats.zscore(df[col])) >= 3)].index
        )

In [9]:
df.head(5).T

Unnamed: 0,0,1,2,3,4
parental level of education,'bachelor\'s degree','some college','master\'s degree','associate\'s degree','some college'
math score,72,69,90,47,76
reading score,72,90,95,57,78
writing score,74,88,93,44,75
gender_male,0,0,0,1,1
race/ethnicity_'group B',1,0,1,0,0
race/ethnicity_'group C',0,1,0,0,1
race/ethnicity_'group D',0,0,0,0,0
race/ethnicity_'group E',0,0,0,0,0
lunch_standard,1,1,1,0,1


In [11]:
xtrain, xtest, ytrain, ytest = train_test_split(df.drop(columns=['gender_male']), 
                                               df['gender_male'],
                                               test_size=.33,
                                               random_state=24)

In [12]:
# Standard Scaling since, after the outliers are removed, our continuous
# variables look pretty normal.
standard_scaler = StandardScaler()
column_trans = ColumnTransformer(transformers=[
    ('standard', standard_scaler, ['math score', 'reading score', 'writing score'])
])

In [13]:
tree = GradientBoostingClassifier()

In [14]:
# definition of our pipeline
steps = [
    ('ct', column_trans),
    ('model', tree)
]
pipe = Pipeline(steps)

In [15]:
# grid for the grid search
param_grid = dict(
    model__n_estimators=[100,500,1000],
    model__max_depth=[1,3,5,7], 
    model__warm_start=[True], 
    model__learning_rate=np.arange(.02, .2, .02),
    model__max_leaf_nodes=[2,5,None]
)

In [16]:
grid_search = GridSearchCV(pipe, param_grid=param_grid,
                          scoring='accuracy')
grid_search.fit(xtrain, ytrain)
res=pd.DataFrame(grid_search.cv_results_)

In [17]:
res.sort_values('mean_test_score', ascending=False).head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__learning_rate,param_model__max_depth,param_model__max_leaf_nodes,param_model__n_estimators,param_model__warm_start,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
159,0.065082,0.000511,0.00254,0.000114,0.1,3,,100,True,"{'model__learning_rate': 0.1, 'model__max_dept...",0.834586,0.81203,0.864662,0.864662,0.879699,0.851128,0.024433,1
52,0.30534,0.001156,0.003347,8.3e-05,0.04,3,,500,True,"{'model__learning_rate': 0.04, 'model__max_dep...",0.819549,0.834586,0.849624,0.864662,0.87218,0.84812,0.019258,2
123,0.064245,0.000623,0.00241,0.000104,0.08,3,,100,True,"{'model__learning_rate': 0.08, 'model__max_dep...",0.827068,0.827068,0.842105,0.864662,0.879699,0.84812,0.020945,2


In [18]:
print(res[res['mean_test_score'] == res['mean_test_score'].max()]['params'].values)

[{'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__max_leaf_nodes': None, 'model__n_estimators': 100, 'model__warm_start': True}]
