In [133]:
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import neighbors, preprocessing
from sklearn.linear_model import LogisticRegression,LinearRegression,Ridge,Lasso
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer its `model_selection` home and fall back only on very old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [134]:
# Load the College dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
college = pd.read_csv("College.csv")

In [135]:
# Check dtypes and non-null counts before modelling.
college.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
Private        777 non-null object
Apps           777 non-null int64
Accept         777 non-null int64
Enroll         777 non-null int64
Top10perc      777 non-null int64
Top25perc      777 non-null int64
F.Undergrad    777 non-null int64
P.Undergrad    777 non-null int64
Outstate       777 non-null int64
Room.Board     777 non-null int64
Books          777 non-null int64
Personal       777 non-null int64
PhD            777 non-null int64
Terminal       777 non-null int64
S.F.Ratio      777 non-null float64
perc.alumni    777 non-null int64
Expend         777 non-null int64
Grad.Rate      777 non-null int64
dtypes: float64(1), int64(16), object(1)
memory usage: 109.3+ KB


In [136]:
# Preview the first rows of the raw data.
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [137]:
# Replace the string-valued `Private` column with integer category codes so it
# can be fed to the regressors as a numeric predictor.
college['Private'] = pd.Categorical(college['Private']).codes
college.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [138]:
# `Apps` is the response; every remaining column is used as a predictor.
df_y = college['Apps']
df_x = college.drop(["Apps"], axis="columns")

In [139]:
# Sanity check: the response is one-dimensional, with one value per row.
df_y.shape

(777,)

In [140]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.3,
    random_state=42,
)

In [141]:
# Ordinary least squares on the training split; `fit` returns the estimator,
# so the trailing expression still displays its repr.
lm = LinearRegression().fit(x_train, y_train)
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [142]:
# Tabulate the fitted least-squares coefficients, one row per predictor.
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=x_train.columns)
coeff_df

Unnamed: 0,Coefficient
Private,-557.508542
Accept,1.752032
Enroll,-1.31412
Top10perc,32.726179
Top25perc,-3.66962
F.Undergrad,0.035782
P.Undergrad,0.020205
Outstate,-0.079454
Room.Board,0.137947
Books,0.189046


In [143]:
# Least-squares predictions for the held-out test rows.
y_pred = lm.predict(x_test)

In [144]:
# Test-set mean squared error of the least-squares predictions.
mse = ((y_pred - y_test) ** 2).mean()

In [145]:
# Display the least-squares test MSE (the baseline for ridge and lasso below).
mse

1931803.1942070164

Fit a ridge regression model on the training set, with $\lambda$ chosen by cross-validation. Report the test error obtained.

In [146]:
# Candidate penalties: 50 log-spaced values from 1e1 to 1e30.
# NOTE(review): the search below selects alpha=10.0, the smallest value on this
# grid, so the optimum may lie below 10 -- consider extending the grid downward
# (e.g. np.logspace(-3, 3, 50)) and re-running the cells that depend on it.
param_grid = {"alpha": np.logspace(start=1, stop=30, num=50)}

In [147]:
# 10-fold cross-validated grid search over the ridge penalty. `scoring` is left
# at its default (the estimator's own `score`), and `refit=True` retrains the
# best candidate on the full training split so `cv.predict` can be used directly.
clf = Ridge()
cv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=10,
    n_jobs=8,
    refit=True,
    verbose=True,
)
cv.fit(x_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    1.6s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=8,
       param_grid={'alpha': array([  1.00000e+01,   3.90694e+01,   1.52642e+02,   5.96362e+02,
         2.32995e+03,   9.10298e+03,   3.55648e+04,   1.38950e+05,
         5.42868e+05,   2.12095e+06,   8.28643e+06,   3.23746e+07,
         1.26486e+08,   4.94171e+08,   1.93070e+09,   7.54312e+09,
         2....    1.09854e+27,   4.29193e+27,   1.67683e+28,   6.55129e+28,
         2.55955e+29,   1.00000e+30])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=True)

In [148]:
# Show the ridge model that was refit with the best alpha from the grid search.
cv.best_estimator_

Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [149]:
# Test-set MSE of the cross-validated ridge model (`cv.predict` delegates to
# the refit best estimator).
y_pred_ridge = cv.predict(x_test)
mse_ridge = ((y_pred_ridge - y_test) ** 2).mean()
print(mse_ridge)

1926685.70662


The test MSE is only a little bit lower for ridge regression than for least squares.

Fit a Lasso model on the training set, with $\lambda$ chosen by cross-validation. Report the test error obtained.

In [150]:
# Same 10-fold grid search as for ridge, now over the lasso penalty.
# Note that `clf` and `cv` are rebound here, so from this point on `cv` refers
# to the lasso search rather than the ridge one.
param_grid = {"alpha": np.logspace(start=1, stop=30, num=50)}
clf = Lasso()
cv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=10,
    n_jobs=8,
    refit=True,
    verbose=True,
)
cv.fit(x_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    1.5s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=8,
       param_grid={'alpha': array([  1.00000e+01,   3.90694e+01,   1.52642e+02,   5.96362e+02,
         2.32995e+03,   9.10298e+03,   3.55648e+04,   1.38950e+05,
         5.42868e+05,   2.12095e+06,   8.28643e+06,   3.23746e+07,
         1.26486e+08,   4.94171e+08,   1.93070e+09,   7.54312e+09,
         2....    1.09854e+27,   4.29193e+27,   1.67683e+28,   6.55129e+28,
         2.55955e+29,   1.00000e+30])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=True)

In [151]:
# Show the lasso model that was refit with the best alpha from the grid search.
cv.best_estimator_

Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [152]:
# Test-set MSE of the cross-validated lasso model.
y_pred_lasso = cv.predict(x_test)
mse_lasso = ((y_pred_lasso - y_test) ** 2).mean()
print(mse_lasso)

1927960.35781


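The test MSE for the lasso (about 1,927,960) is also slightly lower than for least squares (about 1,931,803), and marginally higher than for ridge regression (about 1,926,686).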

In [153]:
# Inspect the lasso coefficients. In the recorded output all 17 are non-zero,
# i.e. at the selected alpha the lasso has not dropped any predictor.
print(cv.best_estimator_.coef_)

[ -4.12738767e+02   1.74962234e+00  -1.30666395e+00   3.20048010e+01
  -3.24295227e+00   4.09616699e-02   2.06106517e-02  -8.66365518e-02
   1.35043626e-01   1.83872530e-01   7.96299546e-02  -8.75002914e+00
  -4.20956115e-01   2.02624371e+01   9.87783683e-01   6.35930294e-02
   7.34325118e+00]


In [154]:
def _test_r2(y_hat, y_true, baseline):
    """Return 1 - MSE(y_hat) / MSE(baseline), i.e. R^2 against a constant predictor."""
    model_mse = np.mean((y_hat - y_true) ** 2)
    baseline_mse = np.mean((baseline - y_true) ** 2)
    return 1 - model_mse / baseline_mse


# The constant baseline is the mean of the *test* responses.
test_avg = np.mean(y_test)
lm_r2 = _test_r2(y_pred, y_test, test_avg)
ridge_r2 = _test_r2(y_pred_ridge, y_test, test_avg)
lasso_r2 = _test_r2(y_pred_lasso, y_test, test_avg)

In [155]:
# Test R^2 for least squares, ridge and lasso, one value per line.
print(lm_r2, ridge_r2, lasso_r2, sep="\n")

0.85730559846
0.857683606339
0.857589453069


Based on the above analysis, the test $R^2$ is 0.8573 for least squares, 0.8577 for ridge, and 0.8576 for the lasso. Ridge regression attains the highest test $R^2$, but the differences between the three models are negligible. Overall, the number of college applications is predicted well: each model explains roughly 86% of the variance in the test set.