# Practice problem

In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
%matplotlib inline



In [2]:
# Load the training dataset, using the 'ID' column as the row index
train_df = pd.read_csv('all/train.csv', index_col='ID')

# Summarize the columns: names, non-null counts and dtypes
train_df.info()

# Preview the first rows (last expression of the cell, so it is displayed)
train_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 1 to 506
Data columns (total 14 columns):
crim       333 non-null float64
zn         333 non-null float64
indus      333 non-null float64
chas       333 non-null int64
nox        333 non-null float64
rm         333 non-null float64
age        333 non-null float64
dis        333 non-null float64
rad        333 non-null int64
tax        333 non-null int64
ptratio    333 non-null float64
black      333 non-null float64
lstat      333 non-null float64
medv       333 non-null float64
dtypes: float64(11), int64(3)
memory usage: 39.0 KB


Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9


## Create train test split (70-30 %)

In [3]:
# Separate the target column ('medv') from the predictor columns
y = train_df['medv']
X = train_df.drop(columns='medv')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Fit a simple Linear regression model

In [4]:
# Baseline: plain linear regression with an intercept, fitted on the training split only
lr_model = LinearRegression(fit_intercept=True)
lr_model.fit(X_train, y_train)

  linalg.lstsq(X, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [5]:
# R2 score of the baseline model on the training split and on the held-out split
print('Training score: {}'.format(lr_model.score(X_train, y_train)))
print('Test score: {}'.format(lr_model.score(X_test, y_test)))


Training score: 0.7268827869293253
Test score: 0.7254687959254545


## Get RMSE for test and train set

In [6]:
# Training RMSE: square root of the mean squared error of the in-sample predictions
y_train_pred = lr_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = math.sqrt(train_mse)
print('RMSE for train: {}'.format(train_rmse))

# Test RMSE: same computation on the held-out split
y_pred_test = lr_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_rmse = math.sqrt(test_mse)

print('RMSE for test: {}'.format(test_rmse))


RMSE for train: 4.862113965519084
RMSE for test: 4.587100299689446


In [30]:
def train_test_rmse(my_model, X_train, y_train, X_test, y_test):
    """Print train/test RMSE and scores for a fitted model and return both RMSEs.

    Parameters
    ----------
    my_model : object exposing ``predict(X)`` and ``score(X, y)``.
        It is used as-is; this function never fits it.
    X_train, y_train : features and targets of the training split.
    X_test, y_test : features and targets of the held-out split.

    Returns
    -------
    tuple
        ``(train_rmse, test_rmse)`` as Python floats.

    Notes
    -----
    The lines printed under the 'Accuracies' label are simply whatever
    ``my_model.score`` returns (elsewhere in this notebook that value is
    referred to as the R2 score).
    """
    # Training split: predict, then take the root of the mean squared error
    fitted_train = my_model.predict(X_train)
    train_rmse = math.sqrt(mean_squared_error(y_train, fitted_train))
    print('RMSE for train: {}'.format(train_rmse))

    # Held-out split: same computation
    fitted_test = my_model.predict(X_test)
    test_rmse = math.sqrt(mean_squared_error(y_test, fitted_test))
    print('RMSE for test: {}'.format(test_rmse))

    # Model's own score on each split
    print('Accuracies: Train: {}'.format(my_model.score(X_train, y_train)))
    print('Accuracies: Test: {}'.format(my_model.score(X_test, y_test)))

    return train_rmse, test_rmse

In [8]:
# Report RMSE and score of the baseline linear model; the returned (train, test) RMSE tuple is displayed
train_test_rmse(lr_model, X_train, y_train, X_test, y_test)

RMSE for train: 4.862113965519084
RMSE for test: 4.587100299689446
Accuracies: Train: 0.7268827869293253,    Test: 0.7254687959254545


(4.862113965519084, 4.587100299689446)

## Standardization and polynomial features

In [9]:
# Degree of the polynomial feature expansion
d = 2
# Pipeline order: standardize the inputs, expand them to polynomial terms, fit plain linear regression
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=d)),
    ('model', LinearRegression())
]

pipeline = Pipeline(steps)

# Fit on the training split only, then report metrics on both splits
pipeline.fit(X_train, y_train)
print('degree: {}'.format(d))
train_test_rmse(pipeline, X_train, y_train, X_test, y_test)
print('\n\n\n')

degree: 2
RMSE for train: 2.1301614815105023
RMSE for test: 6.387787364377503
Accuracies: Train: 0.947576760069103,    Test: 0.4676268497187783






  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


In [10]:
# Change degree to 5 and see what happens

## L2 or Ridge regression

In [11]:
# Regularization strength passed to Ridge as alpha
a = 5
# Pipeline order: standardize, expand to degree-2 polynomial terms, fit Ridge (L2 penalty)
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=a, fit_intercept=True))
]

ridge_pipe = Pipeline(steps)
ridge_pipe.fit(X_train, y_train)
# Echo the alpha used so the printed metrics can be matched to it
print(a)
train_test_rmse(ridge_pipe, X_train, y_train, X_test, y_test)
print('\n\n')


5
RMSE for train: 2.516975242690539
RMSE for test: 3.5520485232100376
Accuracies: Train: 0.9268091667908349,    Test: 0.8353836295367754





  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


## L1 or Lasso regularization

In [12]:
# Regularization strength passed to Lasso as alpha
alp = 0.022
# Pipeline order: standardize, expand to degree-2 polynomial terms, fit Lasso (L1 penalty)
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(alpha=alp, fit_intercept=True))
]

lasso_pipe = Pipeline(steps)
# Echo the alpha used so the printed metrics can be matched to it
print(alp)
lasso_pipe.fit(X_train, y_train)
train_test_rmse(lasso_pipe, X_train, y_train, X_test, y_test)
print('\n\n')


0.022
RMSE for train: 2.5077183818747946
RMSE for test: 3.4897355548326265
Accuracies: Train: 0.9273465351903244,    Test: 0.8411086413402028





  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


In [13]:
# Coefficients of the fitted Lasso step, one per column produced by the 'poly' step
z = lasso_pipe.named_steps['model'].coef_
# Displayed as the cell output
z.shape

(105,)

## Elastic Net

In [14]:
# Rebuilds X and y from train_df with the same expressions used before the train/test split;
# X_train / X_test / y_train / y_test are not re-split here.
X = train_df.drop('medv', axis=1)
y = train_df['medv']

In [15]:
# Regularization strength passed to ElasticNet as alpha
alpha = 0.0265
# Pipeline order: standardize, expand to degree-2 polynomial terms, fit ElasticNet
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', ElasticNet(alpha=alpha, fit_intercept=True))
]
elastic_pipe = Pipeline(steps)
elastic_pipe.fit(X_train, y_train)
# Echo the alpha used so the printed metrics can be matched to it
print(alpha)
train_test_rmse(elastic_pipe, X_train, y_train, X_test, y_test)
print('\n\n')


0.0265
RMSE for train: 2.5576145301146624
RMSE for test: 3.4873697959495176
Accuracies: Train: 0.9244265959197951,    Test: 0.841323999336723





  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


In [15]:
# Load the unlabeled test dataset, using the 'ID' column as the row index.
# This cell must stay active: later cells read test_df to build the predictions
# and the submission file, and fail with NameError on a fresh kernel without it.
test_df = pd.read_csv('all/test.csv', index_col='ID')

# Summarize the columns: names, non-null counts and dtypes
test_df.info()

# Preview the first rows (last expression of the cell, so it is displayed)
test_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 173 entries, 3 to 505
Data columns (total 13 columns):
crim       173 non-null float64
zn         173 non-null float64
indus      173 non-null float64
chas       173 non-null int64
nox        173 non-null float64
rm         173 non-null float64
age        173 non-null float64
dis        173 non-null float64
rad        173 non-null int64
tax        173 non-null int64
ptratio    173 non-null float64
black      173 non-null float64
lstat      173 non-null float64
dtypes: float64(10), int64(3)
memory usage: 18.9 KB


Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
6,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21
8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15
9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1


In [17]:
# Predict medv for the unlabeled test rows with the fitted elastic-net pipeline
y_test_pred = elastic_pipe.predict(test_df)
# NOTE: bare expression that is not the last statement of the cell, so nothing is displayed for it
type(y_test_pred)
# Number of predictions produced
print(len(y_test_pred))

173


  Xt = transform.transform(Xt)


In [18]:
# Build the submission table in one step: a single 'medv' column holding the
# predictions, indexed by the test set's own index so each prediction stays
# attached to its row ID.
submission_df = pd.DataFrame({'medv': y_test_pred}, index=test_df.index)

# Write the index and the medv column to the submission file
submission_df.to_csv('all/submission.csv')

# Preview as the last expression of the cell so the notebook actually displays it
submission_df.head()

# Feature selection

## Forward selection

In [31]:
def my_elas_func(X_train, y_train, X_test, y_test, alpha=0.0265, degree=2):
    """Fit a scaled, polynomial ElasticNet pipeline and print its train/test metrics.

    The pipeline standardizes the inputs, expands them to polynomial terms and
    fits an ElasticNet model. It is fitted on the training split only and then
    evaluated on both splits via ``train_test_rmse`` (defined earlier in this
    notebook), which prints the RMSE and score lines.

    Parameters
    ----------
    X_train, y_train : features and targets used to fit the pipeline.
    X_test, y_test : features and targets of the held-out split.
    alpha : float, optional
        Regularization strength passed to ``ElasticNet``. Defaults to 0.0265,
        the value previously hard-coded here.
    degree : int, optional
        Degree passed to ``PolynomialFeatures``. Defaults to 2, the value
        previously hard-coded here.

    Returns
    -------
    None
        Results are only printed.
    """
    steps = [
        ('scalar', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree)),
        ('model', ElasticNet(alpha=alpha, fit_intercept=True))
    ]
    elastic_pipe = Pipeline(steps)
    elastic_pipe.fit(X_train, y_train)
    # Echo the alpha used so the printed metrics can be matched to it
    print(alpha)
    train_test_rmse(elastic_pipe, X_train, y_train, X_test, y_test)
    print('\n\n')


In [32]:
# Run the elastic-net pipeline on the full feature set
my_elas_func(X_train, y_train, X_test, y_test)

0.0265
RMSE for train: 2.5576145301146624
RMSE for test: 3.4873697959495176
Accuracies: Train: 0.9244265959197951
Accuracies: Test: 0.841323999336723





  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


In [38]:
# All predictor column names, in DataFrame order; used by the feature-selection loops
col_list = list(X_train.columns)
print(col_list)
# Number of predictor columns (displayed as the cell output)
len(col_list)

['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']


13

In [21]:
# Quick check that selecting a subset of columns by name works as expected
X_train[['crim', 'zn', 'indus']].head()

Unnamed: 0_level_0,crim,zn,indus
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
40,0.02763,75.0,2.95
441,22.0511,0.0,18.1
454,8.24809,0.0,18.1
84,0.03551,25.0,4.86
262,0.53412,20.0,3.97


In [23]:
def select_cols(df, col_list):
    """Return a DataFrame made of the columns of ``df`` named in ``col_list``.

    The columns appear in the order given by ``col_list``.
    """
    wanted = list(col_list)
    return df[wanted]

In [None]:
# Reference link: https://tinyurl.com/y7re2fga
# Test: 0.8915664150919732

In [48]:
# for i in range(len(col_list)):
#     feature_list = ['lstat', 'rm', 'ptratio', 'age',
#                     'tax', 'dis', 'crim', 'nox', 'zn']
#     curr_feature_name = col_list[i]
#     if(curr_feature_name not in feature_list):
#         feature_list.append(curr_feature_name)
#         print(feature_list)
    
#         new_X_train = select_cols(X_train, feature_list)
#         new_X_test = select_cols(X_test, feature_list)
#         my_elas_func(new_X_train, y_train, new_X_test, y_test)
    

## Backward selection

In [63]:
def reject_cols(df, to_remove_col_list):
    """Return ``df`` without the columns named in ``to_remove_col_list``.

    The surviving column names are printed before the subset is returned.
    Names in ``to_remove_col_list`` that are not columns of ``df`` are ignored.
    """
    keep_col_list = []
    for column in list(df.columns):
        if column in to_remove_col_list:
            continue
        keep_col_list.append(column)
    print('keep_col_list:  ', keep_col_list)
    return df[keep_col_list]

In [64]:
# X_train.head()

In [65]:
# reject_cols(X_train, ['crim', 'zn', 'indus'])

In [71]:
# Backward selection step: on top of a fixed set of already-removed columns,
# try removing each remaining column in turn and print the resulting metrics.
for curr_feature_name in col_list:
    # Fresh copy of the baseline removals on every pass
    feature_list = ['black', 'chas', 'rad', 'indus']
    if curr_feature_name in feature_list:
        # Already part of the baseline removals; nothing new to evaluate
        continue

    feature_list.append(curr_feature_name)
    print('remove:  ', feature_list)

    # Drop the same columns from both splits, then refit and report
    new_X_train = reject_cols(X_train, feature_list)
    new_X_test = reject_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)
    

remove:   ['black', 'chas', 'rad', 'indus', 'crim']
keep_col_list:   ['zn', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
keep_col_list:   ['zn', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.196207625172519
RMSE for test: 2.934964104263578
Accuracies: Train: 0.8819764237377498
Accuracies: Test: 0.887611770348411



remove:   ['black', 'chas', 'rad', 'indus', 'zn']
keep_col_list:   ['crim', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
keep_col_list:   ['crim', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.190816790295037
RMSE for test: 2.885078307211168
Accuracies: Train: 0.8823742133239402
Accuracies: Test: 0.8913998430645871



remove:   ['black', 'chas', 'rad', 'indus', 'nox']
keep_col_list:   ['crim', 'zn', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
keep_col_list:   ['crim', 'zn', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.2012994279632045
RMSE for test: 2.91974174524

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(

In [75]:
# Columns dropped by the backward-selection baseline above
removed = ['black', 'chas', 'rad', 'indus']
all_fs = list(X_train.columns)
# Everything that is left once those columns are removed
selected = [i for i in all_fs if i not in removed]
print(sorted(selected))

['age', 'crim', 'dis', 'lstat', 'nox', 'ptratio', 'rm', 'tax', 'zn']


In [74]:
# Same feature list as in the commented-out forward-selection loop;
# sorted so it can be compared by eye with the backward-selection result
feature_list = ['lstat', 'rm', 'ptratio', 'age',
                'tax', 'dis', 'crim', 'nox', 'zn']
sorted(feature_list)

['age', 'crim', 'dis', 'lstat', 'nox', 'ptratio', 'rm', 'tax', 'zn']