In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.metrics import mutual_info_score, roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
%matplotlib inline

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,in_college
0,Academic,A,Male,Less Interested,Urban,56,6950000,83.0,84.09,False,True
1,Academic,A,Male,Less Interested,Urban,57,4410000,76.8,86.91,False,True
2,Academic,B,Female,Very Interested,Urban,50,6500000,80.6,87.43,False,True
3,Vocational,B,Male,Very Interested,Rural,49,6600000,78.2,82.12,True,True
4,Academic,A,Female,Very Interested,Urban,57,5250000,75.1,86.79,False,False


In [3]:
from tqdm.auto import tqdm

In [4]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state=1)

In [21]:
def train(df_train, y_train, max_depth=5, min_samples_leaf=1, n_estimators=100):
    dicts = df_train[['type_school', 'school_accreditation', 'gender', 'interest',
       'residence', 'parent_age', 'parent_salary', 'house_area',
       'average_grades', 'parent_was_in_college']].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = RandomForestClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train, y_train)
    
    return dv, model

In [22]:
def predict(df, dv, model):
    dicts = df[['type_school', 'school_accreditation', 'gender', 'interest',
       'residence', 'parent_age', 'parent_salary', 'house_area',
       'average_grades', 'parent_was_in_college']].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

In [26]:
n_splits = 5


rf_parameters = {'max_depth' : [None,5,10,15,25],
                'min_samples_leaf' : [1,3,5,10,50],
                'n_estimators': np.arange(50,501,50)
                }
scores = []

for max_depth in rf_parameters['max_depth']:
    for min_samples_leaf in rf_parameters['min_samples_leaf']:
        for n_estimators in rf_parameters['n_estimators']:

            kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

            

            for train_idx, val_idx in kfold.split(df_full_train):
                df_train = df_full_train.iloc[train_idx]
                df_val = df_full_train.iloc[val_idx]

                y_train = df_train.in_college.values
                y_val = df_val.in_college.values

                dv, model = train(df_train, y_train, max_depth=max_depth, min_samples_leaf=min_samples_leaf, n_estimators=n_estimators)
                y_pred = predict(df_val, dv, model)

                auc = roc_auc_score(y_val, y_pred)
                scores.append([max_depth, min_samples_leaf, n_estimators, auc])

print(scores)

[[None, 1, 50, 0.968358395989975], [None, 1, 50, 0.9716228893058162], [None, 1, 50, 0.9489056841442293], [None, 1, 50, 0.9651162790697674], [None, 1, 50, 0.962171052631579], [None, 1, 100, 0.96921992481203], [None, 1, 100, 0.9727954971857411], [None, 1, 100, 0.9537868052275232], [None, 1, 100, 0.9706945317410434], [None, 1, 100, 0.9627192982456141], [None, 1, 150, 0.9761904761904762], [None, 1, 150, 0.9734990619136962], [None, 1, 150, 0.9548102660998268], [None, 1, 150, 0.968730358265242], [None, 1, 150, 0.9634241854636592], [None, 1, 200, 0.974624060150376], [None, 1, 200, 0.9752188868042527], [None, 1, 200, 0.9538655329869312], [None, 1, 200, 0.9682589566310497], [None, 1, 200, 0.9634241854636592], [None, 1, 250, 0.9730576441102756], [None, 1, 250, 0.9712320200125079], [None, 1, 250, 0.9535506219492993], [None, 1, 250, 0.9689660590823381], [None, 1, 250, 0.9674185463659147], [None, 1, 300, 0.9731359649122807], [None, 1, 300, 0.9738899312070043], [None, 1, 300, 0.956542276806802], [No

In [27]:
df_tun_param = pd.DataFrame(scores, columns=['max_depth', 'min_samples_leaf', 'n_estimators', 'auc'])
df_tun_param

Unnamed: 0,max_depth,min_samples_leaf,n_estimators,auc
0,,1,50,0.968358
1,,1,50,0.971623
2,,1,50,0.948906
3,,1,50,0.965116
4,,1,50,0.962171
...,...,...,...,...
1245,25.0,50,500,0.911184
1246,25.0,50,500,0.911664
1247,25.0,50,500,0.896394
1248,25.0,50,500,0.943589


In [29]:
df_tun_param.sort_values('auc', ascending=False)

Unnamed: 0,max_depth,min_samples_leaf,n_estimators,auc
795,15.0,1,500,0.976739
10,,1,150,0.976190
790,15.0,1,450,0.975721
540,10.0,1,450,0.975564
1015,25.0,1,200,0.975564
...,...,...,...,...
714,10.0,50,150,0.889411
734,10.0,50,350,0.889254
457,5.0,50,100,0.888521
222,,50,250,0.888364


In [45]:
df_tun_param.iloc[0:5]

Unnamed: 0,max_depth,min_samples_leaf,n_estimators,auc
0,,1,50,0.968358
1,,1,50,0.971623
2,,1,50,0.948906
3,,1,50,0.965116
4,,1,50,0.962171


In [46]:
df_tun_param.iloc[5:10]

Unnamed: 0,max_depth,min_samples_leaf,n_estimators,auc
5,,1,100,0.96922
6,,1,100,0.972795
7,,1,100,0.953787
8,,1,100,0.970695
9,,1,100,0.962719


In [59]:
k=[]
for i in range(0, len(df_tun_param), 5):
    k.append(df_tun_param.iloc[i:i+5].mean())

In [60]:
k = pd.DataFrame(k).sort_values('auc', ascending=False)

In [62]:
k.head(15)

Unnamed: 0,max_depth,min_samples_leaf,n_estimators,auc
7,,1.0,400.0,0.969168
158,15.0,1.0,450.0,0.968917
159,15.0,1.0,500.0,0.968556
208,25.0,1.0,450.0,0.96826
9,,1.0,500.0,0.968255
106,10.0,1.0,350.0,0.968214
157,15.0,1.0,400.0,0.968134
107,10.0,1.0,400.0,0.96812
206,25.0,1.0,350.0,0.968115
204,25.0,1.0,250.0,0.967947


In [71]:
dicts = df_train[['type_school', 'school_accreditation', 'gender', 'interest',
       'residence', 'parent_age', 'parent_salary', 'house_area',
       'average_grades', 'parent_was_in_college']].to_dict(orient='records')

dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(dicts)
y_train = df_train.in_college.values

model = RandomForestClassifier(max_depth=10, min_samples_leaf=1, n_estimators=150, n_jobs=-1)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, n_estimators=150, n_jobs=-1)

In [72]:
test_dict = df_test[['type_school', 'school_accreditation', 'gender', 'interest',
       'residence', 'parent_age', 'parent_salary', 'house_area',
       'average_grades', 'parent_was_in_college']].to_dict(orient='records')
X_test = dv.fit_transform(test_dict)
y_train = df_test.in_college.values

y_pred = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_train, y_pred)

0.9707268170426064