# Classification with an Academic Success Dataset

## Exploring datasets

In [1]:
import os
from zipfile import ZipFile
import pandas as pd

In [2]:
os.environ['KAGGLE_CONFIG_DIR'] = '.'
!kaggle competitions download -c playground-series-s4e6

playground-series-s4e6.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
with ZipFile('playground-series-s4e6.zip') as f:
    f.extractall('data')

In [4]:
raw_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
sub_df = pd.read_csv('data/sample_submission.csv')

In [5]:
raw_df

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.000000,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.000000,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.820000,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,76513,1,17,1,9254,1,1,121.0,1,19,...,0,6,8,5,10.600000,0,13.9,-0.3,0.79,Graduate
76514,76514,1,1,6,9254,1,1,125.0,1,1,...,0,6,9,6,13.875000,0,9.4,-0.8,-3.12,Graduate
76515,76515,5,17,1,9085,1,1,138.0,1,37,...,0,5,8,5,11.400000,1,9.4,-0.8,-3.12,Enrolled
76516,76516,1,1,3,9070,1,1,136.0,1,38,...,0,6,0,0,0.000000,0,7.6,2.6,0.32,Dropout


In [6]:
raw_df['Target'].value_counts(normalize=True)

Target
Graduate    0.474163
Dropout     0.330589
Enrolled    0.195248
Name: proportion, dtype: float64

In [7]:
numeric_features = []
categorical_features = []

In [8]:
def view_categoricals_feature(cat_col: str):
    d = dict()    
    total = raw_df[cat_col].value_counts()
    idx = raw_df['Target'].value_counts().index.tolist()
    for t in idx:
        x = raw_df[raw_df['Target']==t][cat_col].value_counts()
        d[t] = x/total*100
    
    return pd.DataFrame(data=d, columns=idx)

In [9]:
view_categoricals_feature('Marital status')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Marital status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,49.048996,30.953568,19.997435
2,30.079305,56.042296,13.878399
3,43.75,43.75,12.5
4,24.249423,58.775982,16.974596
5,34.482759,54.310345,11.206897
6,14.285714,65.714286,20.0


In [10]:
categorical_features.append('Marital status')

In [11]:
view_categoricals_feature('Application mode')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Application mode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,61.061118,18.77821,20.160672
2,75.0,12.5,12.5
3,100.0,,
4,100.0,,
5,47.126437,20.689655,32.183908
7,21.428571,72.162884,6.408545
9,,100.0,
10,55.813953,23.255814,20.930233
12,100.0,,
15,45.901639,22.404372,31.693989


In [12]:
categorical_features.append('Application mode')

In [13]:
view_categoricals_feature('Application order')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Application order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,33.333333,66.666667,
1,42.935952,36.463103,20.600945
2,53.825352,26.726761,19.447887
3,61.577181,21.290082,17.132737
4,65.070644,22.63213,12.297227
5,45.929412,35.952941,18.117647
6,71.340524,16.538264,12.121212
9,,,100.0


In [14]:
categorical_features.append('Application order')

In [15]:
view_categoricals_feature('Course')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33,11.111111,75.0,13.888889
39,,100.0,
171,21.476041,71.248688,7.275271
979,,100.0,
8014,47.744053,41.71452,10.541427
9003,27.939995,52.558264,19.501741
9070,64.096312,21.060652,14.843036
9085,42.173832,32.402755,25.423413
9119,4.960053,59.82024,35.219707
9130,22.665006,54.607721,22.727273


In [16]:
def split_courser(x):
    if x in [33,39,171,979,9003,9119,9130,9991]:
        return 2 ## Hard
    if x in [9773,9500,9238,9070]:
        return 0 ## Easy
    return 1 ## Medium
for df in [raw_df, test_df]:
    df['Course Difficulity'] = df['Course'].apply(lambda x: split_courser(x))

In [17]:
view_categoricals_feature('Course Difficulity')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Course Difficulity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,70.756094,18.150076,11.093829
1,36.878811,35.096526,28.024662
2,19.859127,59.746951,20.393922


In [18]:
categorical_features.append('Course Difficulity')
categorical_features.append('Course')

In [19]:
view_categoricals_feature('Daytime/evening attendance')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Daytime/evening attendance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,48.912019,31.201633,19.886347
0,31.25,53.132716,15.617284


In [20]:
categorical_features.append('Daytime/evening attendance')

In [21]:
view_categoricals_feature('Previous qualification')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Previous qualification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,50.246342,29.468764,20.284893
2,30.612245,61.22449,8.163265
3,21.341899,74.375446,4.282655
4,34.782609,43.478261,21.73913
5,,33.333333,66.666667
6,42.857143,45.918367,11.22449
9,1.948052,96.103896,1.948052
10,9.302326,88.372093,2.325581
11,50.0,50.0,
12,10.901001,84.315907,4.783092


In [22]:
categorical_features.append('Previous qualification')

In [23]:
raw_df['Previous qualification (grade)'].describe()

count    76518.000000
mean       132.378766
std         10.995328
min         95.000000
25%        125.000000
50%        133.100000
75%        140.000000
max        190.000000
Name: Previous qualification (grade), dtype: float64

In [24]:
numeric_features.append('Previous qualification (grade)')

In [25]:
view_categoricals_feature('Nacionality')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Nacionality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,47.412943,33.053557,19.533501
2,100.0,,
6,35.714286,39.285714,25.0
11,93.333333,6.666667,
17,50.0,50.0,
21,,66.666667,33.333333
22,62.5,26.785714,10.714286
24,73.333333,6.666667,20.0
25,16.666667,66.666667,16.666667
26,53.731343,17.910448,28.358209


In [26]:
categorical_features.append('Nacionality')

In [27]:
import numpy as np

In [28]:
sort_parent_qualification = ['Lower Parent Qualification', 'Higher Parent Qualification']
for df in [raw_df, test_df]:
    df[sort_parent_qualification] = np.sort(
        df[["Mother's qualification", "Father's qualification"]],
        axis=1
    )

In [29]:
categorical_features.append('Lower Parent Qualification')
categorical_features.append('Higher Parent Qualification')

In [30]:
sort_parent_occupation = ['Lower Parent Occupation', 'Higher Parent Occupation']
for df in [raw_df, test_df]:
    df[sort_parent_occupation] = np.sort(
        df[["Mother's occupation", "Father's occupation"]],
        axis=1
    )

In [31]:
categorical_features.append('Lower Parent Occupation')
categorical_features.append('Higher Parent Occupation')

In [32]:
raw_df['Admission grade'].describe()

count    76518.000000
mean       125.363971
std         12.562328
min         95.000000
25%        118.000000
50%        124.600000
75%        132.000000
max        190.000000
Name: Admission grade, dtype: float64

In [33]:
numeric_features.append('Admission grade')

In [34]:
view_categoricals_feature('Displaced')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Displaced,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,39.831306,40.75063,19.418065
1,53.15549,27.238917,19.605592


In [35]:
categorical_features.append('Displaced')

In [36]:
view_categoricals_feature('Educational special needs')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Educational special needs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,47.419719,33.059608,19.520674
1,46.503497,32.867133,20.629371


In [37]:
view_categoricals_feature('Debtor')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Debtor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,50.17451,29.718813,20.106676
1,11.534237,76.510436,11.955328


In [38]:
categorical_features.append('Debtor')

In [39]:
view_categoricals_feature('Tuition fees up to date')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Tuition fees up to date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,52.86926,25.813103,21.317637
0,1.597444,93.942,4.460555


In [40]:
categorical_features.append('Tuition fees up to date')

In [41]:
view_categoricals_feature('Gender')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,57.747555,23.525367,18.727078
1,25.035173,53.711827,21.253


In [42]:
categorical_features.append('Gender')

In [43]:
view_categoricals_feature('Scholarship holder')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Scholarship holder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,35.931097,41.583316,22.485587
1,82.356049,7.126255,10.517697


In [44]:
categorical_features.append('Scholarship holder')

In [45]:
raw_df['Age at enrollment'].describe()

count    76518.000000
mean        22.278653
std          6.889241
min         17.000000
25%         18.000000
50%         19.000000
75%         23.000000
max         70.000000
Name: Age at enrollment, dtype: float64

In [46]:
numeric_features.append('Age at enrollment')

In [47]:
def get_age_group(x):
    if x<=18:
        return 1
    if x==19:
        return 2
    if x>19 and x<24:
        return 3
    if x>=24 and x<=30:
        return 4
    return 5

for df in [raw_df, test_df]:
    df['Age Group'] = df['Age at enrollment'].apply(lambda x: get_age_group(x))

In [48]:
raw_df['Age Group'].value_counts()

Age Group
1    22417
2    18078
3    17832
4     9256
5     8935
Name: count, dtype: int64

In [49]:
view_categoricals_feature('Age Group')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,64.781193,17.352902,17.865905
2,56.14006,23.34329,20.51665
3,44.150965,29.475101,26.373934
4,18.117978,66.443388,15.438634
5,23.066592,64.689424,12.243984


In [50]:
categorical_features.append('Age Group')

In [51]:
view_categoricals_feature('International')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
International,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,47.412874,33.053111,19.534015
1,47.928994,33.925049,18.145957


In [52]:
first_sem_features = [f'Curricular units 1st sem ({x})' 
                      for x in ['credited','enrolled','evaluations','approved','grade','without evaluations']]
raw_df[first_sem_features]

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations)
0,0,6,6,6,14.500000,0
1,0,6,8,4,11.600000,0
2,0,6,0,0,0.000000,0
3,0,7,9,7,12.591250,0
4,0,7,12,6,12.933333,0
...,...,...,...,...,...,...
76513,0,6,9,6,10.666667,0
76514,0,6,22,4,13.000000,0
76515,0,5,13,4,12.500000,2
76516,0,6,0,0,0.000000,0


In [53]:
view_categoricals_feature('Curricular units 1st sem (without evaluations)')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
Curricular units 1st sem (without evaluations),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,47.974186,32.913905,19.111909
1,33.84391,31.655726,34.500365
2,32.975871,30.831099,36.193029
3,18.699187,60.162602,21.138211
4,9.090909,77.922078,12.987013
5,7.692308,80.769231,11.538462
6,3.225806,93.548387,3.225806
7,5.0,92.5,2.5
8,3.571429,96.428571,
9,,100.0,


In [54]:
raw_df[first_sem_features].describe()

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations)
count,76518.0,76518.0,76518.0,76518.0,76518.0,76518.0
mean,0.188871,5.891516,7.352362,4.17852,9.995862,0.05796
std,1.175296,1.671776,3.508292,2.687995,5.264224,0.40849
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5.0,6.0,2.0,10.666667,0.0
50%,0.0,6.0,7.0,5.0,12.166667,0.0
75%,0.0,6.0,9.0,6.0,13.314286,0.0
max,20.0,26.0,45.0,26.0,18.875,12.0


In [55]:
for f in first_sem_features:
    numeric_features.append(f)

In [56]:
def get_cut(x):
    if x<=6:
        return 1
    if x<=11:
        return 2
    if x<=12.5:
        return 3
    if x<=13.5:
        return 4
    return 5
for df in [raw_df, test_df]:
    df['1st sem grade cut'] = df['Curricular units 1st sem (grade)'].apply(lambda x: get_cut(x))

In [57]:
raw_df['1st sem grade cut'].value_counts()

1st sem grade cut
3    20945
4    17405
1    16036
5    14839
2     7293
Name: count, dtype: int64

In [58]:
view_categoricals_feature('1st sem grade cut')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
1st sem grade cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.417311,92.635321,3.947368
2,15.508021,41.670095,42.821884
3,43.213177,23.041299,33.745524
4,73.013502,9.893709,17.092789
5,86.555698,5.755105,7.689197


In [59]:
categorical_features.append('1st sem grade cut')

In [60]:
second_sem_features = [f'Curricular units 2nd sem ({x})' 
                      for x in ['credited','enrolled','evaluations','approved','grade','without evaluations']]
raw_df[second_sem_features]

Unnamed: 0,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations)
0,0,6,7,6,12.428571,0
1,0,6,9,0,0.000000,0
2,0,6,0,0,0.000000,0
3,0,8,11,7,12.820000,0
4,0,7,12,6,12.933333,0
...,...,...,...,...,...,...
76513,0,6,8,5,10.600000,0
76514,0,6,9,6,13.875000,0
76515,0,5,8,5,11.400000,1
76516,0,6,0,0,0.000000,0


In [61]:
for f in second_sem_features:
    numeric_features.append(f)

In [62]:
for df in [raw_df, test_df]:
    df['2nd sem grade cut'] = df['Curricular units 2nd sem (grade)'].apply(lambda x: get_cut(x))

In [63]:
raw_df['2nd sem grade cut'].value_counts()

2nd sem grade cut
3    19430
1    18482
4    16270
5    14985
2     7351
Name: count, dtype: int64

In [64]:
view_categoricals_feature('2nd sem grade cut')

Unnamed: 0_level_0,Graduate,Dropout,Enrolled
2nd sem grade cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.002922,93.37734,3.619738
2,12.569718,38.062849,49.367433
3,48.908904,16.304683,34.786413
4,74.228642,8.340504,17.430854
5,88.241575,4.771438,6.986987


In [65]:
categorical_features.append('2nd sem grade cut')

In [66]:
for df in [raw_df, test_df]:
    df['Avg Grade'] = (df['Curricular units 1st sem (grade)'] + df['Curricular units 2nd sem (grade)'])/2

In [67]:
for df in [raw_df, test_df]:
    df['Avg Previous Grade'] = (df['Previous qualification (grade)'] + df['Admission grade'])/2

In [68]:
numeric_features.append('Avg Grade')
numeric_features.append('Avg Previous Grade')

In [69]:
raw_df[['Unemployment rate','Inflation rate','GDP']].describe()

Unnamed: 0,Unemployment rate,Inflation rate,GDP
count,76518.0,76518.0,76518.0
mean,11.52034,1.228218,-0.080921
std,2.653375,1.398816,2.251382
min,7.6,-0.8,-4.06
25%,9.4,0.3,-1.7
50%,11.1,1.4,0.32
75%,12.7,2.6,1.79
max,16.2,3.7,3.51


In [70]:
numeric_features.append('Unemployment rate')
numeric_features.append('Inflation rate')
numeric_features.append('GDP')

## Preprocessing

In [71]:
raw_df = pd.concat([raw_df, test_df]).copy()
raw_df.index = raw_df['id']
raw_df

Unnamed: 0_level_0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Course Difficulity,Lower Parent Qualification,Higher Parent Qualification,Lower Parent Occupation,Higher Parent Occupation,Age Group,1st sem grade cut,2nd sem grade cut,Avg Grade,Avg Previous Grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,1,19,5,5,1,5,3,13.464286,124.30
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,19,19,9,9,1,3,1,5.800000,122.40
2,2,1,17,2,9254,1,1,137.0,1,3,...,1,3,19,2,3,1,1,1,0.000000,140.85
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,3,19,2,3,1,4,4,12.705625,128.55
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,19,37,4,9,1,4,4,12.933333,126.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127525,127525,1,1,2,171,1,1,128.0,1,38,...,2,37,38,7,10,2,1,1,0.000000,126.35
127526,127526,2,39,1,9119,1,19,133.1,1,19,...,2,19,37,9,9,5,1,1,0.000000,136.55
127527,127527,1,1,1,171,1,1,127.0,1,1,...,2,1,1,4,10,3,1,1,0.000000,123.70
127528,127528,1,1,3,9773,1,1,132.0,1,19,...,0,19,19,5,5,1,4,4,12.800000,129.15


In [72]:
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

In [73]:
need_encode_features = ['Application mode', 'Previous qualification',
                        'Nacionality', 'Lower Parent Qualification', 
                        'Higher Parent Qualification', 'Lower Parent Occupation', 
                        'Higher Parent Occupation', 'Course']

In [74]:
encoder = OrdinalEncoder()
encoder.fit(raw_df[need_encode_features])
raw_df[need_encode_features] = encoder.transform(raw_df[need_encode_features])

In [75]:
input_cols = numeric_features+categorical_features
target_col = 'Target'

In [76]:
scaler = MinMaxScaler()
scaler.fit(raw_df[numeric_features])
raw_df[numeric_features] = scaler.transform(raw_df[numeric_features])

In [77]:
from sklearn.model_selection import train_test_split

In [78]:
test_df = raw_df[raw_df['id']>=test_df['id'].min()]

In [79]:
raw_df = raw_df[raw_df['id']<test_df['id'].min()]

In [80]:
train_df, val_df = train_test_split(raw_df, test_size=0.2, random_state=42)

In [81]:
train_inputs = train_df[input_cols]
val_inputs = val_df[input_cols]
test_inputs = test_df[input_cols]

In [82]:
target_map = {
    'Dropout': 0,
    'Enrolled': 1,
    'Graduate': 2
}
train_targets = train_df[target_col].map(target_map)
val_targets = val_df[target_col].map(target_map)

In [83]:
train_targets

id
12065    2
17210    0
60954    2
2322     1
374      0
        ..
37194    2
6265     2
54886    2
860      2
15795    2
Name: Target, Length: 61214, dtype: int64

## Model

In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [85]:
classifier = {
    'LogisticRegression': LogisticRegression(n_jobs=-1),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(n_jobs=-1),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'XGBClassifier': XGBClassifier(n_jobs=-1),
    'LGBMClassifier': LGBMClassifier(n_jobs=-1)
}
result = {'CLF': [], 'ACC': []}

In [86]:
from sklearn.metrics import accuracy_score

In [87]:
for clf_name, clf in classifier.items():
    clf.fit(train_inputs, train_targets)
    val_preds = clf.predict(val_inputs)
    acc = accuracy_score(val_targets, val_preds)
    result['CLF'].append(clf_name)
    result['ACC'].append(acc)
result_df = pd.DataFrame(result)
result_df

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287


Unnamed: 0,CLF,ACC
0,LogisticRegression,0.786788
1,DecisionTreeClassifier,0.742878
2,RandomForestClassifier,0.824425
3,GradientBoostingClassifier,0.829783
4,XGBClassifier,0.830633
5,LGBMClassifier,0.833116


In [88]:
def test_params(**params):
    model = XGBClassifier(**params)
    model.fit(train_inputs, train_targets)
    train_acc = model.score(train_inputs, train_targets)
    val_acc = model.score(val_inputs, val_targets)
    return train_acc, val_acc

In [89]:
test_params(n_jobs=-1)

(0.8840297971052373, 0.8306325143753267)

In [90]:
for i in [70,90, 120, 150, 250, 350]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 70: train_acc = 0.869654, val_acc = 0.831939
Test 90: train_acc = 0.879113, val_acc = 0.830175
Test 120: train_acc = 0.892704, val_acc = 0.830044
Test 150: train_acc = 0.905087, val_acc = 0.830502
Test 250: train_acc = 0.934753, val_acc = 0.829326
Test 350: train_acc = 0.956840, val_acc = 0.827365


In [91]:
for i in range(4,12):
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=70,
                                     max_depth=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 4: train_acc = 0.841180, val_acc = 0.833116
Test 5: train_acc = 0.852485, val_acc = 0.832397
Test 6: train_acc = 0.869654, val_acc = 0.831939
Test 7: train_acc = 0.893962, val_acc = 0.831221
Test 8: train_acc = 0.922452, val_acc = 0.829783
Test 9: train_acc = 0.953213, val_acc = 0.826451
Test 10: train_acc = 0.975414, val_acc = 0.825536
Test 11: train_acc = 0.992681, val_acc = 0.826451


In [92]:
for i in [0.005,0.01,0.05,0.1,0.2,0.3,0.4]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=70,
                                     max_depth=5, learning_rate=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0.005: train_acc = 0.812608, val_acc = 0.811226
Test 0.01: train_acc = 0.814814, val_acc = 0.813970
Test 0.05: train_acc = 0.826625, val_acc = 0.826385
Test 0.1: train_acc = 0.834515, val_acc = 0.830175
Test 0.2: train_acc = 0.845280, val_acc = 0.831678
Test 0.3: train_acc = 0.852485, val_acc = 0.832397
Test 0.4: train_acc = 0.858529, val_acc = 0.831025


In [93]:
for i in [0, 0.1, 0.2, 0.4, 0.7, 1, 1.2, 1.5, 2]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=70,
                                     max_depth=5, reg_alpha=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0: train_acc = 0.852485, val_acc = 0.832397
Test 0.1: train_acc = 0.853105, val_acc = 0.832201
Test 0.2: train_acc = 0.853530, val_acc = 0.832462
Test 0.4: train_acc = 0.853922, val_acc = 0.832135
Test 0.7: train_acc = 0.853105, val_acc = 0.833116
Test 1: train_acc = 0.851782, val_acc = 0.832397
Test 1.2: train_acc = 0.852681, val_acc = 0.832527
Test 1.5: train_acc = 0.852142, val_acc = 0.832593
Test 2: train_acc = 0.851488, val_acc = 0.832005


In [94]:
xgb_model = XGBClassifier(n_jobs=-1, n_estimators=70,
                          max_depth=5, reg_alpha=0.7)

In [95]:
xgb_model.fit(train_inputs, train_targets)

In [96]:
xgb_model.score(val_inputs, val_targets)

0.8331155253528489

In [97]:
test_preds = xgb_model.predict(test_inputs)
test_preds = pd.Series(test_preds).map({
    0: 'Dropout',
    1: 'Enrolled',
    2: 'Graduate'
})
test_preds

0         Dropout
1        Graduate
2        Graduate
3        Graduate
4        Enrolled
           ...   
51007     Dropout
51008     Dropout
51009     Dropout
51010     Dropout
51011     Dropout
Length: 51012, dtype: object

In [98]:
sub_df['Target'] = test_preds
sub_df.to_csv('data/sub1.csv',index=None)

In [99]:
def test_params(**params):
    model = LGBMClassifier(**params)
    model.fit(train_inputs, train_targets)
    train_acc = model.score(train_inputs, train_targets)
    val_acc = model.score(val_inputs, val_targets)
    return train_acc, val_acc

In [100]:
test_params(n_jobs=-1)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004504 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287


(0.8491194824713301, 0.8331155253528489)

In [101]:
train_accs = []
val_accs = []
idx = []
for i in ['gbdt','dart']:
    train_acc, val_acc = test_params(n_jobs=-1, boosting_type=i)
    idx.append(i)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
for i in range(len(train_accs)):
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(idx[i], train_accs[i], val_accs[i]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start 

In [103]:
train_accs = []
val_accs = []
idx = []
for i in range(31,65,7):
    train_acc, val_acc = test_params(n_jobs=-1, num_leaves=i)
    idx.append(i)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
for i in range(len(train_accs)):
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(idx[i], train_accs[i], val_accs[i]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003627 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start 

In [105]:
train_accs = []
val_accs = []
idx = []
for i in [0.05,0.1,0.15,0.2,0.3]:
    train_acc, val_acc = test_params(n_jobs=-1, learning_rate=i)
    idx.append(i)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
for i in range(len(train_accs)):
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(idx[i], train_accs[i], val_accs[i]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004565 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start 

In [106]:
train_accs = []
val_accs = []
idx = []
for i in [60,70,90,100,150,200]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=i)
    idx.append(i)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
for i in range(len(train_accs)):
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(idx[i], train_accs[i], val_accs[i]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003676 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start 

In [107]:
train_accs = []
val_accs = []
idx = []
for i in [0, 0.25,0.5,0.75,1,1.25]:
    train_acc, val_acc = test_params(n_jobs=-1, reg_alpha=i)
    idx.append(i)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
for i in range(len(train_accs)):
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(idx[i], train_accs[i], val_accs[i]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003246 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start 

In [108]:
train_accs = []
val_accs = []
idx = []
for i in [0, 0.25,0.5,0.75,1,1.25]:
    train_acc, val_acc = test_params(n_jobs=-1, reg_alpha=1.25,
                                     reg_lambda=i)
    idx.append(i)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
for i in range(len(train_accs)):
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(idx[i], train_accs[i], val_accs[i]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006220 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003861 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start 

In [110]:
lgbm_model = LGBMClassifier(n_jobs=-1, reg_alpha=1.25, reg_lambda=0.5)
lgbm_model.fit(train_inputs, train_targets)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1894
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 40
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
[LightGBM] [Info] Start training from score -0.746287


In [111]:
lgbm_model.score(val_inputs, val_targets)

0.8337036069001568

In [113]:
test_preds = lgbm_model.predict(test_inputs)
test_preds = pd.Series(test_preds).map({
    0: 'Dropout',
    1: 'Enrolled',
    2: 'Graduate'
})
test_preds
sub_df['Target'] = test_preds
sub_df.to_csv('data/sub2.csv',index=None)