In [46]:
import numpy as np
from sklearn.metrics import mean_squared_error
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
%matplotlib inline
# Read the semicolon-delimited student table and append the regression target
# `avg_g`: the arithmetic mean of the three grade columns G1, G2 and G3.
d = pd.read_csv('student_mat.csv', sep=';').assign(
    avg_g=lambda frame: (frame['G1'] + frame['G2'] + frame['G3']) / 3
)

In [2]:
# Partition the column names by dtype: columns stored as 'object' are treated
# as categorical, every other column as numerical.
categorical_columns = [name for name, dtype in d.dtypes.items() if dtype.name == 'object']
numerical_columns = [name for name, dtype in d.dtypes.items() if dtype.name != 'object']

# Per-column summary restricted to the object-typed columns.
d_describe = d.describe(include=[object])


In [3]:
# Split the categorical columns by number of distinct values: exactly two
# values -> binary, more than two -> non-binary. A column with a single
# distinct value lands in neither list.
binary_columns = [
    name for name in categorical_columns
    if d_describe.loc['unique', name] == 2
]
nonbinary_columns = [
    name for name in categorical_columns
    if d_describe.loc['unique', name] > 2
]

In [4]:
# Encode each two-valued column in place: the most frequent value ('top' in
# the describe() summary) becomes 0, the other value becomes 1.
for column in binary_columns:
    most_frequent = d_describe.loc['top', column]
    is_most_frequent = d[column].eq(most_frequent)
    d.loc[is_most_frequent, column] = 0
    d.loc[~is_most_frequent, column] = 1

In [5]:
# One-hot encode the categorical columns that have more than two levels.
# NOTE(review): every level gets its own indicator column here, so each group
# of indicators sums to 1 and is collinear with the model intercept. This looks
# like the cause of the ~1e12 coefficients that the plain LinearRegression
# reports for the reason_* and Mjob_* columns further down - confirm, and
# consider dropping one level per group.
d_nonbinary = pd.get_dummies(d[nonbinary_columns])

In [6]:
# Standardise every numerical column: subtract the column mean, then divide by
# the column standard deviation. The statistics come from all rows of `d`.
d_numerical = d[numerical_columns]
d_numerical = d_numerical.sub(d_numerical.mean()).div(d_numerical.std())

In [7]:
# Reassemble the design table column-wise (standardised numericals, 0/1 binary
# columns, one-hot indicators) and cast everything to float.
d = pd.concat([d_numerical, d[binary_columns], d_nonbinary], axis=1).astype(float)

In [8]:
# Target: the averaged grade. The three individual grade columns are removed
# from the features together with the target that is derived from them.
y = d['avg_g']
X = d.drop(labels=['avg_g', 'G1', 'G2', 'G3'], axis=1)
feature_names = X.columns
feature_names

Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures',
       'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
       'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc',
       'health', 'absences', 'sex', 'address', 'famsize', 'Pstatus',
       'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services',
       'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other',
       'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home',
       'reason_other', 'reason_reputation', 'guardian_father',
       'guardian_mother', 'guardian_other'],
      dtype='object')

In [9]:
# Hold out a quarter of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=11,
)

In [10]:
# Baseline: unregularised linear regression fitted on the training split.
# NOTE(review): the `normalize` argument is not accepted by recent
# scikit-learn releases - confirm the version this notebook targets.
lin = LinearRegression(normalize=True)
lin.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [11]:
# Predictions of the linear baseline on the held-out rows.
y_lin_pred = lin.predict(X_test)

In [12]:
def rmse(y, p):
    """Return the root-mean-squared error between targets `y` and predictions `p`."""
    mse = mean_squared_error(y, p)
    return np.sqrt(mse)

In [13]:
def beatiful_coef(coefs, feature_names=None):
    """Tabulate per-feature values (coefficients or importances), largest first.

    Parameters
    ----------
    coefs : array-like
        One value per feature, e.g. ``model.coef_`` or
        ``model.feature_importances_``.
    feature_names : array-like, optional
        Row labels in the same order as ``coefs``. When omitted, rows are
        labelled by position (0, 1, 2, ...).

    Returns
    -------
    pandas.DataFrame
        A single column named ``'coef'`` indexed by ``feature_names`` and
        sorted in descending order of value.

    Raises
    ------
    ValueError
        Raised by pandas when ``feature_names`` and ``coefs`` differ in length.
    """
    # The default used to be `d.columns`, evaluated once when the function was
    # defined. That frame still holds the grade columns that are excluded from
    # the model inputs, so the default could never line up with coefficients
    # fitted on the feature matrix. `None` avoids binding to a global snapshot.
    table = pd.DataFrame(coefs, index=feature_names, columns=['coef'])
    return table.sort_values('coef', ascending=False)

In [14]:
# Held-out RMSE of the unregularised linear baseline.
rmse(y_test, y_lin_pred)

0.89871158910308002

In [15]:
# Coefficients of the linear baseline, labelled by training feature name and
# listed from largest to smallest.
beatiful_coef(lin.coef_,feature_names=X_train.columns)

Unnamed: 0,coef
reason_other,2116213000000.0
reason_reputation,2116213000000.0
reason_home,2116213000000.0
reason_course,2116213000000.0
Mjob_health,856256000000.0
Mjob_services,856256000000.0
Mjob_at_home,856256000000.0
Mjob_other,856256000000.0
Mjob_teacher,856256000000.0
sex,0.310273


In [16]:
# Ridge regression fitted on the training split; only `random_state` and
# `normalize` are set explicitly, the regularisation strength is left at the
# estimator's default.
ridge = Ridge(random_state=17, normalize=True)
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=17, solver='auto', tol=0.001)

In [17]:
# Ridge coefficients, labelled by training feature name, largest first.
beatiful_coef(ridge.coef_,feature_names=X_train.columns)

Unnamed: 0,coef
Fjob_teacher,0.236049
Fjob_at_home,0.22906
Mjob_health,0.146997
sex,0.136118
Mjob_services,0.109901
reason_other,0.091353
famsize,0.078008
Pstatus,0.063077
higher,0.061414
reason_reputation,0.051885


In [18]:
# Predictions of the ridge model on the held-out rows (X_test).
ridge_valid_pred = ridge.predict(X_test)

In [19]:
# Held-out RMSE of the ridge model.
rmse(y_test, ridge_valid_pred)

0.89187199257737915

In [20]:
# Candidate regularisation strengths: 100 values spaced logarithmically
# between 1e-4 and 1e4. RidgeCV picks one of them with 5-fold cross-validation
# on the training split; the selected value is displayed.
alphas = np.logspace(-4, 4, num=100)
ridge_cv = RidgeCV(alphas=alphas, cv=5, normalize=True)
ridge_cv.fit(X_train, y_train)
ridge_cv.alpha_


0.52140082879996896

In [21]:
# Held-out predictions and RMSE of the cross-validated ridge model.
best_ridge_test_pred = ridge_cv.predict(X_test)
rmse(y_test, best_ridge_test_pred)

0.88588524868828489

In [22]:
# Lasso with the regularisation strength chosen by 5-fold cross-validation
# over the same `alphas` grid; the selected value is displayed.
lasso = LassoCV(alphas=alphas, cv=5, normalize=True)
lasso.fit(X_train, y_train)
lasso.alpha_

0.0016297508346206436

In [23]:
# Held-out predictions and RMSE of the cross-validated lasso model.
best_lasso_test_pred = lasso.predict(X_test)
rmse(y_test, best_lasso_test_pred)


0.871057777621561

In [24]:
# Lasso coefficients, labelled by training feature name, largest first.
beatiful_coef(lasso.coef_,feature_names=X_train.columns)

Unnamed: 0,coef
Fjob_teacher,0.407595
Mjob_health,0.330361
Fjob_at_home,0.314568
Mjob_services,0.263223
sex,0.219016
reason_other,0.088672
famsize,0.081574
higher,0.076873
studytime,0.075803
Medu,0.051287


In [25]:
# Coefficients of the cross-validated ridge model, largest first.
beatiful_coef(ridge_cv.coef_,feature_names=X_train.columns)


Unnamed: 0,coef
Fjob_teacher,0.314814
Fjob_at_home,0.28434
Mjob_health,0.192033
sex,0.185471
Mjob_services,0.142105
reason_other,0.123186
famsize,0.101955
higher,0.072885
Pstatus,0.070186
studytime,0.06204


In [26]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees with a fixed seed, fitted on the training split
# and then used to predict the held-out rows.
forest = RandomForestRegressor(n_estimators=100, random_state=17).fit(X_train, y_train)
forest_test_pred = forest.predict(X_test)


In [27]:
# Held-out RMSE of the random forest.
rmse(y_test, forest_test_pred)


0.83743555911559397

In [28]:
# Random-forest feature importances, largest first. The helper labels its
# single column 'coef', but the values shown here are importances, not
# regression coefficients.
beatiful_coef(forest.feature_importances_,feature_names=X_train.columns)

Unnamed: 0,coef
failures,0.119483
absences,0.11096
freetime,0.058807
age,0.043564
goout,0.042472
Walc,0.037686
health,0.03762
schoolsup,0.036184
studytime,0.03396
Fedu,0.033782
