# Practice problem

In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
%matplotlib inline



In [3]:
# Load the training split, using the 'ID' column as the row index.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
TRAIN_CSV_PATH = '/Users/sunilhariharan/Downloads/kaggle1stcomp/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH, index_col='ID')

# Column names, dtypes and non-null counts.
train_df.info()

# Preview the first rows (rendered as the cell output).
train_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 1 to 506
Data columns (total 14 columns):
crim       333 non-null float64
zn         333 non-null float64
indus      333 non-null float64
chas       333 non-null int64
nox        333 non-null float64
rm         333 non-null float64
age        333 non-null float64
dis        333 non-null float64
rad        333 non-null int64
tax        333 non-null int64
ptratio    333 non-null float64
black      333 non-null float64
lstat      333 non-null float64
medv       333 non-null float64
dtypes: float64(11), int64(3)
memory usage: 39.0 KB


Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9


## Create the train/test split (70% / 30%)

In [4]:
# Separate the target column 'medv' from the predictors.
y = train_df['medv']
X = train_df.drop(labels='medv', axis='columns')

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Fit a simple linear regression model

In [5]:
# Ordinary least squares on the raw features; LinearRegression fits an
# intercept by default.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [6]:
# R2 score on each split: compute both first, then report.
train_r2 = lr_model.score(X_train, y_train)
test_r2 = lr_model.score(X_test, y_test)

print('Training score: {}'.format(train_r2))
print('Test score: {}'.format(test_r2))


Training score: 0.7268827869293253
Test score: 0.7254687959254545


## Get RMSE for test and train set

In [7]:
# Predict on both splits first, then derive the error metrics.
y_train_pred = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_pred_test)

# RMSE is the square root of the mean squared error.
train_rmse = math.sqrt(train_mse)
test_rmse = math.sqrt(test_mse)

print('RMSE for train: {}'.format(train_rmse))
print('RMSE for test: {}'.format(test_rmse))


RMSE for train: 4.862113965519083
RMSE for test: 4.587100299689446


In [8]:
def train_test_rmse(my_model, X_train, y_train, X_test, y_test):
    """Print and return the RMSE of a fitted model on the train and test splits.

    Also prints ``my_model.score`` for both splits under the label
    "Accuracies" (for the regressors in this notebook that is presumably
    R2 rather than a classification accuracy — confirm per estimator).

    Parameters
    ----------
    my_model : fitted estimator exposing ``predict`` and ``score``.
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets.

    Returns
    -------
    tuple
        ``(train_rmse, test_rmse)``.
    """
    def _rmse(features, target):
        # Square root of the mean squared error of the model's predictions.
        return math.sqrt(mean_squared_error(target, my_model.predict(features)))

    train_rmse = _rmse(X_train, y_train)
    print('RMSE for train: {}'.format(train_rmse))

    test_rmse = _rmse(X_test, y_test)
    print('RMSE for test: {}'.format(test_rmse))

    train_score = my_model.score(X_train, y_train)
    test_score = my_model.score(X_test, y_test)
    print('Accuracies: Train: {},    Test: {}'.format(train_score, test_score))

    return train_rmse, test_rmse

In [9]:
# Report the linear model's metrics through the reusable helper; the returned
# (train_rmse, test_rmse) tuple is the cell output.
train_test_rmse(
    my_model=lr_model,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test
)

RMSE for train: 4.862113965519083
RMSE for test: 4.587100299689446
Accuracies: Train: 0.7268827869293253,    Test: 0.7254687959254545


(4.862113965519083, 4.587100299689446)

## Standardization and polynomial features

In [10]:
# Standardize the features, expand them to degree-d polynomial terms,
# then fit an unregularized linear regression.
d = 2
pipeline = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=d)),
    ('model', LinearRegression()),
])
pipeline.fit(X_train, y_train)

print('degree: {}'.format(d))
train_test_rmse(pipeline, X_train, y_train, X_test, y_test)
print('\n\n\n')

degree: 2
RMSE for train: 2.130161481510501
RMSE for test: 6.387787364377362
Accuracies: Train: 0.9475767600691032,    Test: 0.4676268497188017






In [None]:
# Change degree to 5 and see what happens

## L2 or Ridge regression

In [11]:
# Scaled degree-2 polynomial features feeding an L2-penalized (Ridge) model.
a = 5
ridge_pipe = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=a, fit_intercept=True)),
])
ridge_pipe.fit(X_train, y_train)

print(a)
train_test_rmse(ridge_pipe, X_train, y_train, X_test, y_test)
print('\n\n')


5
RMSE for train: 2.51697524269054
RMSE for test: 3.5520485232100354
Accuracies: Train: 0.9268091667908349,    Test: 0.8353836295367756





## L1 or Lasso regularization

In [12]:
# Scaled degree-2 polynomial features feeding an L1-penalized (Lasso) model.
alp = 0.022
lasso_pipe = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(alpha=alp, fit_intercept=True)),
])

# The penalty strength is echoed before fitting, then the metrics follow.
print(alp)
lasso_pipe.fit(X_train, y_train)
train_test_rmse(lasso_pipe, X_train, y_train, X_test, y_test)
print('\n\n')


0.022
RMSE for train: 2.507718381874795
RMSE for test: 3.4897355548326274
Accuracies: Train: 0.9273465351903244,    Test: 0.8411086413402027





## Elastic Net

In [13]:
# Rebuild the full feature matrix and the 'medv' target from the training frame.
y = train_df['medv']
X = train_df.drop(labels='medv', axis='columns')

In [39]:
def my_elas_func(X_train, y_train, X_test, y_test, alpha=0.0265):
    """Fit a scaled, degree-2 polynomial ElasticNet pipeline and report its scores.

    The pipeline standardizes the features, expands them with
    ``PolynomialFeatures(degree=2)`` and fits ``ElasticNet``. The penalty
    strength is printed, followed by the metrics from ``train_test_rmse``.

    Parameters
    ----------
    X_train, y_train : training features and targets used for fitting.
    X_test, y_test : held-out features and targets used for evaluation only.
    alpha : float, default 0.0265
        ElasticNet penalty strength; the default is the value the notebook
        has been using.

    Returns
    -------
    Pipeline
        The fitted pipeline, so callers can reuse it for prediction. When the
        call is the last expression of a cell, assign the result or end the
        line with ``;`` to avoid echoing the pipeline repr.
    """
    elastic_pipe = Pipeline([
        ('scalar', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('model', ElasticNet(alpha=alpha, fit_intercept=True)),
    ])
    elastic_pipe.fit(X_train, y_train)

    print(alpha)
    train_test_rmse(elastic_pipe, X_train, y_train, X_test, y_test)
    print('\n\n')

    # Hand the fitted model back instead of discarding it with the local scope.
    return elastic_pipe


In [40]:
# Evaluate the ElasticNet pipeline on the full feature set. The trailing ';'
# keeps the cell from echoing whatever the call returns.
my_elas_func(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test);

0.0265
RMSE for train: 2.557614530114663
RMSE for test: 3.487369795949514
Accuracies: Train: 0.9244265959197951,    Test: 0.8413239993367233





In [41]:
# Load the competition test split (no 'medv' column), indexed by 'ID'.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
TEST_CSV_PATH = '/Users/sunilhariharan/Downloads/kaggle1stcomp/test.csv'
test_df = pd.read_csv(TEST_CSV_PATH, index_col='ID')

# Column names, dtypes and non-null counts.
test_df.info()

# Preview the first rows (rendered as the cell output).
test_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 173 entries, 3 to 505
Data columns (total 13 columns):
crim       173 non-null float64
zn         173 non-null float64
indus      173 non-null float64
chas       173 non-null int64
nox        173 non-null float64
rm         173 non-null float64
age        173 non-null float64
dis        173 non-null float64
rad        173 non-null int64
tax        173 non-null int64
ptratio    173 non-null float64
black      173 non-null float64
lstat      173 non-null float64
dtypes: float64(10), int64(3)
memory usage: 18.9 KB


Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
6,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21
8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15
9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1


In [42]:
# `elastic_pipe` only exists as a local variable inside my_elas_func, so it is
# undefined at notebook scope on a fresh kernel. Fit the same pipeline here
# (alpha=0.0265, training split) before predicting.
elastic_pipe = Pipeline([
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', ElasticNet(alpha=0.0265, fit_intercept=True)),
])
elastic_pipe.fit(X_train, y_train)

# Predict 'medv' for the unlabeled competition rows.
y_test_pred = elastic_pipe.predict(test_df)

In [17]:
# Sanity check: one prediction per test row. (The former bare
# `type(y_test_pred)` was not the last expression of the cell, so it was
# evaluated and discarded without being displayed.)
print(len(y_test_pred))

173


In [20]:
# Submission frame: one 'medv' prediction per test row, indexed by the test
# set's 'ID' index. Building it directly on that index avoids the in-place
# set_index round trip.
submission_df = pd.DataFrame({'medv': y_test_pred}, index=test_df.index)
submission_df.head()

Unnamed: 0_level_0,medv
ID,Unnamed: 1_level_1
3,32.168273
6,25.311255
8,16.184712
9,16.765992
10,17.247958


In [19]:
# Write the predictions to disk (index 'ID' plus the 'medv' column).
# NOTE(review): absolute, machine-specific output path.
SUBMISSION_CSV_PATH = '/Users/sunilhariharan/Downloads/kaggle1stcomp/mysub.csv'
submission_df.to_csv(SUBMISSION_CSV_PATH)

In [24]:
# Names of all predictor columns, in frame order.
col_list = X_train.columns.tolist()

In [25]:
# Display the predictor names collected from X_train.
col_list

['crim',
 'zn',
 'indus',
 'chas',
 'nox',
 'rm',
 'age',
 'dis',
 'rad',
 'tax',
 'ptratio',
 'black',
 'lstat']

In [32]:
def select_cols(df, col_list):
    """Return the sub-frame of ``df`` holding only the columns in ``col_list``.

    Columns appear in the order given by ``col_list``.
    """
    wanted = list(col_list)
    return df[wanted]

In [47]:
# Score the ElasticNet pipeline on each predictor taken on its own.
for curr_feature_name in col_list:
    feature_list = [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['crim']
0.0265
RMSE for train: 8.289624058011556
RMSE for test: 7.903731207468503
Accuracies: Train: 0.20609511479140652,    Test: 0.18495879270463145



['zn']
0.0265
RMSE for train: 8.651890594263087
RMSE for test: 8.336625437244807
Accuracies: Train: 0.135189723494339,    Test: 0.09323276180977091



['indus']
0.0265
RMSE for train: 7.813931551377416
RMSE for test: 8.058917578775905
Accuracies: Train: 0.2945958525788539,    Test: 0.152638612117742



['chas']
0.0265
RMSE for train: 9.087170809778211
RMSE for test: 8.7167044868434
Accuracies: Train: 0.04598283972122785,    Test: 0.008666259609597327



['nox']
0.0265
RMSE for train: 8.254165981364064
RMSE for test: 8.49868800944949
Accuracies: Train: 0.21287229428121623,    Test: 0.0576352882230049



['rm']
0.0265
RMSE for train: 6.351671207673917
RMSE for test: 5.130434748943688
Accuracies: Train: 0.5339045950662946,    Test: 0.6565815853278247



['age']
0.0265
RMSE for train: 8.51686196067812
RMSE for test: 8.362641759682154
Acc

In [48]:
# Forward selection: keep 'lstat' fixed and try adding each other predictor.
fixed_features = ['lstat']
for curr_feature_name in col_list:
    if curr_feature_name in fixed_features:
        continue  # already part of the fixed subset

    feature_list = fixed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['lstat', 'crim']
0.0265
RMSE for train: 5.416135104140316
RMSE for test: 4.968795731820715
Accuracies: Train: 0.661095151546357,    Test: 0.677880120074376



['lstat', 'zn']
0.0265
RMSE for train: 5.682551964550244
RMSE for test: 5.040205765779965
Accuracies: Train: 0.6269340303482236,    Test: 0.6685547680294401



['lstat', 'indus']
0.0265
RMSE for train: 5.584652529629084
RMSE for test: 4.675696457448462
Accuracies: Train: 0.6396777193779135,    Test: 0.7147616859141159



['lstat', 'chas']
0.0265
RMSE for train: 5.399517445009608
RMSE for test: 5.132539009226015
Accuracies: Train: 0.6631716010641816,    Test: 0.6562998197612733



['lstat', 'nox']
0.0265
RMSE for train: 5.59085340599285
RMSE for test: 4.678245617264092
Accuracies: Train: 0.6388771129559032,    Test: 0.7144505809205304



['lstat', 'rm']
0.0265
RMSE for train: 4.787110802834541
RMSE for test: 3.3835278447705925
Accuracies: Train: 0.7352440291215128,    Test: 0.8506329678368497



['lstat', 'age']
0.0265
RMSE for t

In [49]:
# Forward selection: keep ['lstat', 'rm'] fixed and try each other predictor.
fixed_features = ['lstat', 'rm']
for curr_feature_name in col_list:
    if curr_feature_name in fixed_features:
        continue  # already part of the fixed subset

    feature_list = fixed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['lstat', 'rm', 'crim']
0.0265
RMSE for train: 4.406721134952346
RMSE for test: 3.351773059883591
Accuracies: Train: 0.7756480025128301,    Test: 0.8534234639388154



['lstat', 'rm', 'zn']
0.0265
RMSE for train: 4.636555320098605
RMSE for test: 3.4533199395390755
Accuracies: Train: 0.7516353997446655,    Test: 0.8444074226937768



['lstat', 'rm', 'indus']
0.0265
RMSE for train: 4.263882771470475
RMSE for test: 3.901458376324602
Accuracies: Train: 0.7899564665570812,    Test: 0.8014045940042167



['lstat', 'rm', 'chas']
0.0265
RMSE for train: 4.4653463061579854
RMSE for test: 3.8286702367215617
Accuracies: Train: 0.7696389256092905,    Test: 0.8087457183657727



['lstat', 'rm', 'nox']
0.0265
RMSE for train: 4.526322163475639
RMSE for test: 3.6967287820060046
Accuracies: Train: 0.7633046492910244,    Test: 0.8217003790648605



['lstat', 'rm', 'age']
0.0265
RMSE for train: 4.41701272418665
RMSE for test: 3.4035793007640796
Accuracies: Train: 0.7745988620232868,    Test: 0.84885736487

In [50]:
# Forward selection: fixed subset of three, try each remaining predictor.
fixed_features = ['lstat', 'rm', 'ptratio']
for curr_feature_name in col_list:
    if curr_feature_name in fixed_features:
        continue  # already part of the fixed subset

    feature_list = fixed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['lstat', 'rm', 'ptratio', 'crim']
0.0265
RMSE for train: 4.198261966471088
RMSE for test: 3.3314266421894234
Accuracies: Train: 0.79637182322959,    Test: 0.8551976029134793



['lstat', 'rm', 'ptratio', 'zn']
0.0265
RMSE for train: 4.446533426368245
RMSE for test: 3.2717738093118722
Accuracies: Train: 0.7715758978692426,    Test: 0.860336865359188



['lstat', 'rm', 'ptratio', 'indus']
0.0265
RMSE for train: 4.057163236378224
RMSE for test: 3.6893508036594644
Accuracies: Train: 0.8098292292496072,    Test: 0.8224113741545033



['lstat', 'rm', 'ptratio', 'chas']
0.0265
RMSE for train: 4.13843370065814
RMSE for test: 3.492526387797163
Accuracies: Train: 0.8021341670271829,    Test: 0.8408544005534452



['lstat', 'rm', 'ptratio', 'nox']
0.0265
RMSE for train: 4.113223657120427
RMSE for test: 3.5531963095735244
Accuracies: Train: 0.8045374980105907,    Test: 0.8352772261748058



['lstat', 'rm', 'ptratio', 'age']
0.0265
RMSE for train: 4.126634841106781
RMSE for test: 3.159559962052029

In [51]:
# Forward selection: fixed subset of four, try each remaining predictor.
fixed_features = ['lstat', 'rm', 'ptratio', 'age']
for curr_feature_name in col_list:
    if curr_feature_name in fixed_features:
        continue  # already part of the fixed subset

    feature_list = fixed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['lstat', 'rm', 'ptratio', 'age', 'crim']
0.0265
RMSE for train: 3.957868743922238
RMSE for test: 3.1740763976297073
Accuracies: Train: 0.8190237526713948,    Test: 0.868553207375354



['lstat', 'rm', 'ptratio', 'age', 'zn']
0.0265
RMSE for train: 4.092407208917886
RMSE for test: 3.1266544194285038
Accuracies: Train: 0.8065109083984929,    Test: 0.872451602314638



['lstat', 'rm', 'ptratio', 'age', 'indus']
0.0265
RMSE for train: 3.973154989540313
RMSE for test: 3.471804519121671
Accuracies: Train: 0.8176231050546086,    Test: 0.8427372845207344



['lstat', 'rm', 'ptratio', 'age', 'chas']
0.0265
RMSE for train: 3.861598448388116
RMSE for test: 3.2833554041872115
Accuracies: Train: 0.8277207286495529,    Test: 0.8593463416660525



['lstat', 'rm', 'ptratio', 'age', 'nox']
0.0265
RMSE for train: 3.9814594320294425
RMSE for test: 3.232868189753232
Accuracies: Train: 0.8168599225275588,    Test: 0.8636386669833684



['lstat', 'rm', 'ptratio', 'age', 'dis']
0.0265
RMSE for train: 3.6848

In [52]:
# Forward selection: fixed subset of five, try each remaining predictor.
fixed_features = ['lstat', 'rm', 'ptratio', 'age', 'tax']
for curr_feature_name in col_list:
    if curr_feature_name in fixed_features:
        continue  # already part of the fixed subset

    feature_list = fixed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['lstat', 'rm', 'ptratio', 'age', 'tax', 'crim']
0.0265
RMSE for train: 3.598590030615084
RMSE for test: 3.091424118900504
Accuracies: Train: 0.8503889921417926,    Test: 0.8753097705822672



['lstat', 'rm', 'ptratio', 'age', 'tax', 'zn']
0.0265
RMSE for train: 3.5873635533040713
RMSE for test: 3.115000377330201
Accuracies: Train: 0.8513210153230878,    Test: 0.8734006577275079



['lstat', 'rm', 'ptratio', 'age', 'tax', 'indus']
0.0265
RMSE for train: 3.617377156307951
RMSE for test: 3.299424796280343
Accuracies: Train: 0.8488227687919938,    Test: 0.8579661988027114



['lstat', 'rm', 'ptratio', 'age', 'tax', 'chas']
0.0265
RMSE for train: 3.4900032647153023
RMSE for test: 3.306205836667184
Accuracies: Train: 0.8592817355586053,    Test: 0.8573817777412264



['lstat', 'rm', 'ptratio', 'age', 'tax', 'nox']
0.0265
RMSE for train: 3.5444745102370474
RMSE for test: 3.16368059256774
Accuracies: Train: 0.8548548532992848,    Test: 0.8694128322070032



['lstat', 'rm', 'ptratio', 'age', '

In [53]:
# Forward selection: fixed subset of six, try each remaining predictor.
fixed_features = ['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis']
for curr_feature_name in col_list:
    if curr_feature_name in fixed_features:
        continue  # already part of the fixed subset

    feature_list = fixed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim']
0.0265
RMSE for train: 3.275219941262844
RMSE for test: 2.9052386515629545
Accuracies: Train: 0.8760690603005618,    Test: 0.8898767882793579



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'zn']
0.0265
RMSE for train: 3.2684302750066445
RMSE for test: 2.9228835696900717
Accuracies: Train: 0.8765823556238893,    Test: 0.8885350631911471



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'indus']
0.0265
RMSE for train: 3.257363328613276
RMSE for test: 3.1955956174801683
Accuracies: Train: 0.8774167280509766,    Test: 0.8667648313668106



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'chas']
0.0265
RMSE for train: 3.243796349206395
RMSE for test: 3.08414628433717
Accuracies: Train: 0.8784357248095089,    Test: 0.8758961713071581



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'nox']
0.0265
RMSE for train: 3.2563707468068217
RMSE for test: 2.9407564216371163
Accuracies: Train: 0.8774914236713519,    Test: 0.8871677235709231


In [57]:
# Forward selection: fixed subset of seven, try each remaining predictor.
fixed_features = ['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim']
for curr_feature_name in col_list:
    if curr_feature_name in fixed_features:
        continue  # already part of the fixed subset

    feature_list = fixed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)
        

['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'zn']
0.0265
RMSE for train: 3.2013437699127905
RMSE for test: 2.920095528857279
Accuracies: Train: 0.88159680307669,    Test: 0.8887476071203622



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'indus']
0.0265
RMSE for train: 3.2000368707471187
RMSE for test: 3.075380900350856
Accuracies: Train: 0.8816934558981746,    Test: 0.8766005943670125



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'chas']
0.0265
RMSE for train: 3.1256275680226424
RMSE for test: 3.0878853134437416
Accuracies: Train: 0.887131368063024,    Test: 0.8755950772186807



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox']
0.0265
RMSE for train: 3.1909371751599624
RMSE for test: 2.8864498725657994
Accuracies: Train: 0.8823653374576675,    Test: 0.8912965615569024



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'rad']
0.0265
RMSE for train: 3.090670288391985
RMSE for test: 3.018445513496709
Accuracies: Train: 0.889641914

In [55]:
# Forward selection: fixed subset of eight, try each remaining predictor.
fixed_features = ['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox']
for curr_feature_name in col_list:
    if curr_feature_name in fixed_features:
        continue  # already part of the fixed subset

    feature_list = fixed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'zn']
0.0265
RMSE for train: 3.118266535279123
RMSE for test: 2.882864876886123
Accuracies: Train: 0.8876623663035237,    Test: 0.8915664150919732



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'indus']
0.0265
RMSE for train: 3.1210723782365566
RMSE for test: 2.9636389531153116
Accuracies: Train: 0.887460110617285,    Test: 0.8854049572596332



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'chas']
0.0265
RMSE for train: 2.993125310162765
RMSE for test: 3.054980433210635
Accuracies: Train: 0.8964980332345263,    Test: 0.878232298553111



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'rad']
0.0265
RMSE for train: 3.0049032402781717
RMSE for test: 2.9510182974951498
Accuracies: Train: 0.8956818713567327,    Test: 0.8863788850115687



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'black']
0.0265
RMSE for train: 3.043199505302007
RMSE for test: 3.1693647064

In [56]:
# Forward selection: fixed subset of nine, try each remaining predictor.
fixed_features = ['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'zn']
for curr_feature_name in col_list:
    if curr_feature_name in fixed_features:
        continue  # already part of the fixed subset

    feature_list = fixed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = select_cols(X_train, feature_list)
    new_X_test = select_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'zn', 'indus']
0.0265
RMSE for train: 3.0605017353580837
RMSE for test: 2.918802548095823
Accuracies: Train: 0.8917858471797698,    Test: 0.8888461075669345



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'zn', 'chas']
0.0265
RMSE for train: 2.9383226911503955
RMSE for test: 3.0689660057783863
Accuracies: Train: 0.9002534733837531,    Test: 0.877114851699928



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'zn', 'rad']
0.0265
RMSE for train: 2.9479295626359145
RMSE for test: 3.0197664838386817
Accuracies: Train: 0.8996001628593998,    Test: 0.8810232878239332



['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'zn', 'black']
0.0265
RMSE for train: 2.943063184560385
RMSE for test: 3.197653036153243
Accuracies: Train: 0.8999313650026717,    Test: 0.8665932146792712





In [59]:
def reject_cols(df, to_rem_col_list):
    """Return ``df`` without the columns named in ``to_rem_col_list``.

    The surviving column names are printed before the sub-frame is returned.
    Names in ``to_rem_col_list`` that are not columns of ``df`` are ignored.
    """
    keep_col_list = []
    for column in df.columns:
        if column in to_rem_col_list:
            continue  # this column is being dropped
        keep_col_list.append(column)

    print('keep_col_list : ', keep_col_list)
    return df[keep_col_list]

In [60]:
# Quick check of the helper: drop 'crim' from the training features.
reject_cols(df=X_train, to_rem_col_list=['crim'])

keep_col_list :  ['zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']


Unnamed: 0_level_0,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
40,75.0,2.95,0,0.4280,6.595,21.8,5.4011,3,252,18.3,395.63,4.32
441,0.0,18.10,0,0.7400,5.818,92.4,1.8662,24,666,20.2,391.45,22.11
454,0.0,18.10,0,0.7130,7.393,99.3,2.4527,24,666,20.2,375.87,16.74
84,25.0,4.86,0,0.4260,6.167,46.7,5.4007,4,281,19.0,390.64,7.51
262,20.0,3.97,0,0.6470,7.520,89.4,2.1398,5,264,13.0,388.37,7.26
16,0.0,8.14,0,0.5380,5.834,56.5,4.4986,4,307,21.0,395.62,8.47
341,0.0,5.19,0,0.5150,5.968,58.5,4.8122,5,224,20.2,396.90,9.29
343,0.0,1.89,0,0.5180,6.540,59.7,6.2669,1,422,15.9,389.96,8.65
121,0.0,25.65,0,0.5810,5.870,69.7,2.2577,2,188,19.1,389.15,14.37
225,0.0,6.20,0,0.5040,8.266,78.3,2.8944,8,307,17.4,385.05,4.14


In [62]:
# Backward elimination: drop each predictor on its own and re-score.
for curr_feature_name in col_list:
    feature_list = [curr_feature_name]
    print(feature_list)

    new_X_train = reject_cols(X_train, feature_list)
    new_X_test = reject_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['crim']
keep_col_list :  ['zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']
keep_col_list :  ['zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']
0.0265
RMSE for train: 2.698351347173638
RMSE for test: 3.6315823476361473
Accuracies: Train: 0.9158806711503036,    Test: 0.8279292565978139



['zn']
keep_col_list :  ['crim', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']
keep_col_list :  ['crim', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']
0.0265
RMSE for train: 2.620901351823035
RMSE for test: 3.4134126665270785
Accuracies: Train: 0.9206402751661438,    Test: 0.8479827633742489



['indus']
keep_col_list :  ['crim', 'zn', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']
keep_col_list :  ['crim', 'zn', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat']
0.0265
RMSE for 

In [63]:
# Backward elimination: with 'black' removed, try dropping one more predictor.
removed_features = ['black']
for curr_feature_name in col_list:
    if curr_feature_name in removed_features:
        continue  # already removed

    feature_list = removed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = reject_cols(X_train, feature_list)
    new_X_test = reject_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['black', 'crim']
keep_col_list :  ['zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 2.883575451971911
RMSE for test: 3.518373150678726
Accuracies: Train: 0.9039358281736203,    Test: 0.8384901410424676



['black', 'zn']
keep_col_list :  ['crim', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['crim', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 2.782216367800053
RMSE for test: 3.195998389696819
Accuracies: Train: 0.9105705404606438,    Test: 0.86673124338354



['black', 'indus']
keep_col_list :  ['crim', 'zn', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['crim', 'zn', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 2.7863014075542774
RMSE 

In [64]:
# Backward elimination: with two predictors removed, try dropping one more.
removed_features = ['black', 'chas']
for curr_feature_name in col_list:
    if curr_feature_name in removed_features:
        continue  # already removed

    feature_list = removed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = reject_cols(X_train, feature_list)
    new_X_test = reject_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['black', 'chas', 'crim']
keep_col_list :  ['zn', 'indus', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['zn', 'indus', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.0004462531531026
RMSE for test: 3.2630285333908113
Accuracies: Train: 0.8959910991137499,    Test: 0.86108249165306



['black', 'chas', 'zn']
keep_col_list :  ['crim', 'indus', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['crim', 'indus', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 2.957319904649346
RMSE for test: 3.0148500615308875
Accuracies: Train: 0.8989595163516295,    Test: 0.8814103797401648



['black', 'chas', 'indus']
keep_col_list :  ['crim', 'zn', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['crim', 'zn', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 2.947929133826009
RMSE for test: 3.0198422527

In [65]:
# Backward elimination: with three predictors removed, try dropping one more.
removed_features = ['black', 'chas', 'rad']
for curr_feature_name in col_list:
    if curr_feature_name in removed_features:
        continue  # already removed

    feature_list = removed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = reject_cols(X_train, feature_list)
    new_X_test = reject_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['black', 'chas', 'rad', 'crim']
keep_col_list :  ['zn', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['zn', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.1430403723135534
RMSE for test: 3.0908119014346886
Accuracies: Train: 0.885870287628337,    Test: 0.8753591523393398



['black', 'chas', 'rad', 'zn']
keep_col_list :  ['crim', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['crim', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.121110897551907
RMSE for test: 2.9634533661049822
Accuracies: Train: 0.8874573327347409,    Test: 0.8854193089980348



['black', 'chas', 'rad', 'indus']
keep_col_list :  ['crim', 'zn', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['crim', 'zn', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.118177573132445
RMSE for test: 2.881053370811167
Accuracies: Tr

In [66]:
# Backward elimination: with four predictors removed, try dropping one more.
removed_features = ['black', 'chas', 'rad', 'indus']
for curr_feature_name in col_list:
    if curr_feature_name in removed_features:
        continue  # already removed

    feature_list = removed_features + [curr_feature_name]
    print(feature_list)

    new_X_train = reject_cols(X_train, feature_list)
    new_X_test = reject_cols(X_test, feature_list)
    my_elas_func(new_X_train, y_train, new_X_test, y_test)

['black', 'chas', 'rad', 'indus', 'crim']
keep_col_list :  ['zn', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['zn', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.196207625172519
RMSE for test: 2.934964104263578
Accuracies: Train: 0.8819764237377498,    Test: 0.8876117703484109



['black', 'chas', 'rad', 'indus', 'zn']
keep_col_list :  ['crim', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['crim', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.190816790295037
RMSE for test: 2.885078307211169
Accuracies: Train: 0.8823742133239402,    Test: 0.8913998430645871



['black', 'chas', 'rad', 'indus', 'nox']
keep_col_list :  ['crim', 'zn', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
keep_col_list :  ['crim', 'zn', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
0.0265
RMSE for train: 3.2012994279632045
RMSE for test: 2.9197417452479186
Accuracies: Train: 0.8816000830694716, 

In [None]:
# Forward selection: result recorded from the runs above
# ['lstat', 'rm', 'ptratio', 'age', 'tax', 'dis', 'crim', 'nox', 'zn']
# alpha: 0.0265
# RMSE for train: 3.118266535279123
# RMSE for test: 2.882864876886123
# Accuracies: Train: 0.8876623663035237,    Test: 0.8915664150919732

In [None]:

# Backward elimination: result recorded from the runs above
# Removed columns: ['black', 'chas', 'rad', 'indus']
# keep_col_list :  ['crim', 'zn', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
# keep_col_list :  ['crim', 'zn', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
# alpha: 0.0265
# RMSE for train: 3.118177573132445
# RMSE for test: 2.881053370811167
# Accuracies: Train: 0.8876687760535401,    Test: 0.8917026451210622
Accuracies: Train: 0.8876687760535401,    Test: 0.8917026451210622