In [1]:
import pandas as pd 
import numpy as np


%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree


from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics

In [2]:
df = pd.read_csv('../DATASETS/Performance/sp1.csv')

In [3]:
features = df.copy().drop(['G1', 'G2', 'G3'], axis=1)
target = df.copy()['G1']

In [4]:
ohe_columns = []
for col in features.columns:
    if col not in ['age', 'absences']:
        ohe_columns.append(col)
        
features = pd.get_dummies(features, drop_first=True, columns=ohe_columns)

In [5]:
features_train, features_test, target_train, target_test = train_test_split(features,target,test_size=0.2,random_state=5)

In [6]:
scaler = StandardScaler()
features_train[['age', 'absences']] = scaler.fit_transform(features_train[['age', 'absences']])
features_test[['age', 'absences']] = scaler.transform(features_test[['age', 'absences']])

In [7]:
from sklearn.linear_model import LinearRegression
regressor_lr = LinearRegression()
regressor_lr.fit(features_train, target_train)

In [8]:
feature_importances = pd.concat([pd.Series(features_train.columns, name='features'), 
                                 pd.Series(regressor_lr.coef_, name='importance')],
                                axis=1)

feature_importances['importance'] = abs(feature_importances['importance'])

feature_importances = feature_importances.sort_values(by='importance', ascending=False).reset_index(drop=True)

In [9]:
feature_importances.head()

Unnamed: 0,features,importance
0,failures_3,4.125497
1,famrel_2,3.695198
2,freetime_5,3.061721
3,Medu_2,2.570433
4,Medu_3,2.561818


In [10]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsRegressor

In [11]:
results_features = pd.DataFrame({'threshold' : [], 
                    'number_of_features' : [],
                    'mse' : []})

for threshold in np.arange(0, 4, 0.05):
    features_truncated = features_train.copy()

    for col in features_truncated.columns:
        if feature_importances.loc[feature_importances['features'] == col]['importance'].values < threshold:
            features_truncated = features_truncated.drop(col, axis=1)

    y_pred = cross_val_predict(KNeighborsRegressor(n_neighbors=6), features_truncated, target_train, cv=5)
    mse = mean_squared_error(target_train, y_pred)
    if results_features.empty or results_features['number_of_features'].iloc[-1] > features_truncated.shape[1]:
        results_features = results_features.append(pd
                                                   .DataFrame([[threshold, 
                                                                features_truncated.shape[1], 
                                                                mse]], 
                                                              columns=results_features.columns))

  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.append(pd
  results_features = results_features.ap

In [12]:
results_features.sort_values(by='mse').head()

Unnamed: 0,threshold,number_of_features,mse
0,0.95,30.0,10.561445
0,0.9,32.0,10.614539
0,0.45,41.0,10.755011
0,1.0,28.0,10.772767
0,0.5,39.0,10.895218


In [13]:
features_train_truncated = features_train.copy()

for col in features_train_truncated.columns:
    if feature_importances.loc[feature_importances['features'] == col]['importance'].values < 0.9:
        features_train_truncated = features_train_truncated.drop(col, axis=1)

In [14]:
features_train_truncated.head()

Unnamed: 0,sex_M,famsize_LE3,Medu_1,Medu_2,Medu_3,Medu_4,Mjob_teacher,Fjob_other,Fjob_services,Fjob_teacher,...,famrel_5,freetime_2,freetime_5,goout_4,Dalc_4,Dalc_5,health_2,health_3,health_4,health_5
48,1,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
247,1,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
42,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
237,0,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
127,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [1]:
features_test_truncated = features_test.copy()

for col in features_test_truncated.columns:
    if feature_importances.loc[feature_importances['features'] == col]['importance'].values < 0.9:
        features_test_truncated = features_test_truncated.drop(col, axis=1)

NameError: name 'features_test' is not defined

In [16]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [17]:
parameters = {'max_depth' : [2, 3, 4],
              'n_estimators' : [20, 30, 50, 75, 100],
              'eta' : [0.5, 0.3, 0.1, 0.05]}
grid_search_xgb = GridSearchCV(estimator=XGBRegressor(random_state=42),
                               param_grid=parameters,
                               scoring='neg_mean_squared_error',
                               cv=5,
                               verbose=1)

grid_search_xgb.fit(features_train_truncated, target_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [18]:
grid_search_xgb.best_params_

{'eta': 0.5, 'max_depth': 2, 'n_estimators': 20}

In [19]:
grid_search_xgb.best_score_

-8.925028194937594

In [20]:
regressor = XGBRegressor(max_depth=2, 
                         n_estimators=20,
                         eta=0.5,
                         random_state=42)
regressor.fit(features_train_truncated, target_train)

In [21]:
target_pred = regressor.predict(features_test_truncated)

In [22]:
features_test_truncated.head()

Unnamed: 0,sex_M,famsize_LE3,Medu_1,Medu_2,Medu_3,Medu_4,Mjob_teacher,Fjob_other,Fjob_services,Fjob_teacher,...,famrel_5,freetime_2,freetime_5,goout_4,Dalc_4,Dalc_5,health_2,health_3,health_4,health_5
306,1,0,0,0,1,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,1
343,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
117,1,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
50,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
316,0,0,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [30]:
from sklearn.linear_model import Lasso
ls = Lasso(alpha=0.1)
ls.fit(features_train_truncated, target_train)
target_pred_ls = ls.predict(features_test_truncated)

In [31]:
mean_squared_error(target_test, target_pred_ls) ** 0.5

3.087802082708506

In [32]:
mean_squared_error(target_train,  ls.predict(features_train_truncated)) ** 0.5

2.952862705363398

In [23]:
for i in features_test_truncated.columns:
    print(i)

sex_M
famsize_LE3
Medu_1
Medu_2
Medu_3
Medu_4
Mjob_teacher
Fjob_other
Fjob_services
Fjob_teacher
traveltime_3
studytime_3
studytime_4
failures_1
failures_2
failures_3
schoolsup_yes
famsup_yes
higher_yes
famrel_2
famrel_3
famrel_4
famrel_5
freetime_2
freetime_5
goout_4
Dalc_4
Dalc_5
health_2
health_3
health_4
health_5


In [33]:
import joblib
joblib.dump( ls,'LASO')

['LASO']

In [34]:
from sklearn.linear_model import Ridge
clf=Ridge(alpha=1.0)
clf.fit(features_train_truncated, target_train)
target_pred_clf = ls.predict(features_test_truncated)

In [35]:
mean_squared_error(target_test, target_pred_clf) ** 0.5

3.087802082708506

In [36]:
mean_squared_error(target_train,  clf.predict(features_train_truncated)) ** 0.5

2.5721974645300887

In [37]:
from sklearn.linear_model import ElasticNet
regr = ElasticNet(random_state=0)
regr.fit(features_train_truncated, target_train)
target_pred_regr = regr.predict(features_test_truncated)

In [38]:
mean_squared_error(target_test, target_pred_regr) ** 0.5

3.129226832177046

In [39]:
lr= LinearRegression()
lr.fit(features_train_truncated, target_train)
target_pred_lr = lr.predict(features_test_truncated)

In [40]:
mean_squared_error(target_test, target_pred_lr) ** 0.5

3.5200935738606525

In [41]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(random_state=0)
dt.fit(features_train_truncated, target_train)
y_pred_dt = dt.predict(features_test_truncated)

In [43]:
mean_squared_error(target_test, y_pred_dt) ** 0.5

4.138809833206043

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10, random_state=0)
rf.fit(features_train_truncated, target_train)
y_pred_rf = rf.predict(features_test_truncated)

In [None]:
mean_squared_error(target_test, y_pred_rf) ** 0.5

In [24]:
mean_squared_error(target_test, target_pred) ** 0.5

3.1955217027350007

In [25]:
mean_squared_error(target_train,  regressor.predict(features_train_truncated)) ** 0.5

2.439668739170064

In [26]:
import joblib
joblib.dump( regressor,'PERF')

['PERF']

In [27]:
from sklearn.dummy import DummyRegressor

In [28]:
regressor_mean = DummyRegressor(strategy='mean')
regressor_mean.fit(features_train_truncated, target_train)

In [29]:
mean_squared_error(target_test, regressor_mean.predict(features_test_truncated)) ** 0.5

3.129226832177046