In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
df = pd.read_csv('students.csv')

In [6]:
pd.set_option("display.max_columns", 40)

In [7]:
df.sample(15)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,final_score,grade
481,MS,F,17,R,GT3,T,2,1,at_home,other,course,mother,3,1,0,no,yes,no,yes,yes,no,no,yes,5,5,3,1,1,3,2,11,average
396,GP,M,18,U,LE3,T,3,4,services,other,home,mother,1,2,0,no,no,no,yes,yes,yes,yes,yes,4,3,3,1,3,5,6,17,high
77,GP,F,16,U,GT3,T,2,2,other,other,reputation,mother,1,4,0,no,no,no,no,yes,yes,yes,yes,5,2,3,1,3,3,1,13,average
346,GP,M,17,U,LE3,T,4,4,other,teacher,home,father,2,1,0,no,no,no,no,yes,yes,yes,no,4,1,1,2,2,5,0,13,average
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,high
186,GP,M,17,U,LE3,T,4,4,teacher,other,reputation,mother,1,2,0,no,yes,no,yes,yes,yes,yes,no,4,4,4,1,3,5,0,10,average
493,MS,F,17,U,GT3,T,0,1,other,at_home,course,father,2,1,0,no,no,no,yes,no,yes,no,no,2,4,4,3,5,5,5,10,average
178,GP,M,17,R,LE3,T,1,1,other,services,course,mother,4,2,0,no,no,no,yes,yes,no,no,yes,5,3,5,1,5,5,0,8,average
24,GP,F,15,R,GT3,T,2,4,services,health,course,mother,1,3,0,yes,yes,no,yes,yes,yes,yes,no,4,3,2,1,1,5,2,10,average
515,MS,F,18,U,LE3,T,1,1,other,at_home,reputation,mother,2,2,0,yes,no,no,no,yes,yes,no,no,2,3,5,1,4,3,8,10,average


In [8]:
df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'final_score', 'grade'],
      dtype='object')

After doing some data eda i decided to choose these features:

In [9]:
features_chosen = ['school', 'address', 'schoolsup', 'paid', 'activities', 'higher', 'internet', 'Mjob', 'Fjob', 'Medu',
                   'Fedu', 'studytime', 'failures', 'freetime', 'Dalc', 'final_score']

features_cat = df[features_chosen].select_dtypes(include=['object']).columns
features_num = df[features_chosen].select_dtypes(include=['int64']).columns

In [10]:
df_dum = pd.get_dummies(df[features_cat], drop_first=True)

In [11]:
df_fin = pd.concat([df_dum, df[features_num]], axis=1)

In [12]:
df_fin

Unnamed: 0,school_MS,address_U,schoolsup_yes,paid_yes,activities_yes,higher_yes,internet_yes,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,Medu,Fedu,studytime,failures,freetime,Dalc,final_score
0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,4,4,2,0,3,1,11
1,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,2,0,3,1,11
2,0,1,1,0,0,1,1,0,0,0,0,0,1,0,0,1,1,2,0,3,2,12
3,0,1,0,0,1,1,1,1,0,0,0,0,0,1,0,4,2,3,0,2,1,14
4,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,3,3,2,0,3,1,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,1,0,0,0,1,1,1,0,0,1,0,0,1,0,0,2,3,3,1,4,1,10
645,1,1,0,0,0,1,1,0,0,0,1,0,0,1,0,3,1,2,0,3,1,16
646,1,1,0,0,1,1,0,0,1,0,0,0,1,0,0,1,1,2,0,1,1,9
647,1,1,0,0,0,1,1,0,0,1,0,0,0,1,0,3,1,1,0,4,3,10


In [21]:
X = df_fin.drop('final_score', axis=1)
y = df_fin.final_score.values

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)

## Choosing the best model

In [23]:
from sklearn.model_selection import KFold
from sklearn import metrics

In [25]:
kf = KFold(n_splits=5)

In [26]:
def cross_val_score(model, X=X_train, y=y_train):
    kf = KFold(n_splits=5)
    R2 = []
    MSE = []
    MAE = []
    MQE = []
    for train_index, test_index in kf.split(X, y):
        X_train_f, X_test_f = np.array(X)[train_index], np.array(X)[test_index]
        y_train_f, y_test_f = np.array(y)[train_index], np.array(y)[test_index]
        
    model.fit(X_train_f, y_train_f)
    y_pred = model.predict(X_test_f)
        
    R2.append(metrics.r2_score(y_test_f, y_pred))
    MSE.append(metrics.mean_squared_error(y_test_f, y_pred))
    MAE.append(metrics.mean_absolute_error(y_test_f, y_pred))
    MQE.append(metrics.mean_absolute_error(y_test_f, np.round(y_pred)))

    return {'R2':np.mean(R2), 'MSE':np.mean(MSE), 'MAE':np.mean(MAE), 'MQE':np.mean(MQE)}

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model

models=[linear_model.LinearRegression(),
        linear_model.LogisticRegression(),
        linear_model.Ridge(),
        linear_model.Lasso(),
        linear_model.ElasticNet(),
        DecisionTreeRegressor(),
        RandomForestRegressor()]

In [28]:
for model in models: 
    print(type(model).__name__)
    print(cross_val_score(model))

LinearRegression
{'R2': 0.21989195381349758, 'MSE': 8.68670252737818, 'MAE': 2.096739005813339, 'MQE': 2.0775862068965516}
LogisticRegression
{'R2': -0.0304332737125923, 'MSE': 11.474137931034482, 'MAE': 2.3706896551724137, 'MQE': 2.3706896551724137}
Ridge
{'R2': 0.22200973598004603, 'MSE': 8.663120481546807, 'MAE': 2.091169632535304, 'MQE': 2.0689655172413794}
Lasso
{'R2': -0.0017972751238040097, 'MSE': 11.155268766011469, 'MAE': 2.495137046861185, 'MQE': 2.4827586206896552}
ElasticNet
{'R2': 0.058472551399491146, 'MSE': 10.484148839811672, 'MAE': 2.411857219060959, 'MQE': 2.4224137931034484}
DecisionTreeRegressor
{'R2': -0.43068421474145047, 'MSE': 15.931034482758621, 'MAE': 2.896551724137931, 'MQE': 2.896551724137931}
RandomForestRegressor


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'R2': 0.13008502603513195, 'MSE': 9.686725627155171, 'MAE': 2.2655014367816095, 'MQE': 2.293103448275862}


## Model parameters optimization

In [None]:
RG = linear_model.Ridge(),
n_estimators = [100, 200, 300]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [x for x in range(1,10)]
max_depth.append(None)
min_samples_split = [2, 3, 5, 8, 10]
min_samples_leaf = [1, 2, 3, 4, 5]
bootstrap = [True, False]

random_grid = {'regression__n_estimators': n_estimators,
               'regression__max_features': max_features,
               'regression__max_depth': max_depth,
               'regression__min_samples_split': min_samples_split,
               'regression__min_samples_leaf': min_samples_leaf,
               'regression__bootstrap': bootstrap,
                }
