In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv("train.csv")

In [2]:
df.shape

(1460, 81)

In [3]:
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Create Custom Features

In [4]:
# Create Custom Feature: Adjusted Total Basement Area
df['AdjTotalBsmtSF'] = df['BsmtFinSF1']+df['BsmtFinSF2']+0.9*df['BsmtUnfSF']

In [5]:
# Create Custom Feature: Adjusted Total Bathroom 
df['AdjTotalBath'] = df['FullBath']+df['BsmtFullBath']+0.6*(df['HalfBath']+df['BsmtHalfBath'])

In [6]:
# Create Custom Feature: HouseAge
df['HouseAge'] = df['YrSold']-(0.6*df['YearRemodAdd']+0.4*df['YearBuilt'])

In [7]:
# Transform SalePrice to LogSalePrice 
df['LogSalePrice'] = np.log(df['SalePrice'])

In [8]:
df['AdjOutdoorSF'] = df['3SsnPorch']+df['EnclosedPorch']+df['OpenPorchSF']+df['ScreenPorch']+df['WoodDeckSF']

In [9]:
df.shape

(1460, 86)

# (1) Helper Functions for Pipeline

### Function for Removing Rows

In [10]:
# Given a list of indices (rows) to drop, this returns a copy of the original df without those rows
def remove_rows_from_df(df, list_of_rows_to_drop, printDetails = False):
    df_rows_removed = df.drop(df.index[list_of_rows_to_drop])
    if printDetails:
        print(len(set(list_of_rows_to_drop)),"observations (rows) dropped from dataframe")
        print("Dimensions of new dataframe:", df_rows_removed.shape)
    return df_rows_removed

# Example of usage
list_of_rows_to_remove = [57, 57, 72, 101]
df_with_rows_removed = remove_rows_from_df(df, list_of_rows_to_remove, printDetails = True)

3 observations (rows) dropped from dataframe
Dimensions of new dataframe: (1457, 86)


### Function for Removing Outlier Rows

In [11]:
# Input: dataframe, column name, and threshold for determining if observation is outlier in column
# Ouput: a set whose elements are the indices (rows) of the outliers in this data frame
def indices_of_outliers(df, column_name, stdev_threshold, printDetails = False):
    mask_of_outliers = np.abs(df[column_name] - df[column_name].mean()) >= (stdev_threshold*df[column_name].std())
    set_of_indices_of_outliers = set(df.index[mask_of_outliers].tolist())
    if printDetails:
        print("There are",len(set_of_indices_of_outliers),"outliers in",column_name,"and their indices are:")
        print(set_of_indices_of_outliers)
    return set_of_indices_of_outliers

# "Vectorized" version of set_of_outliers and removes all of them from dataframe
# Non-mutatable, i.e. - original dataframe is not modified. Returns a new dataframe without outliers
def remove_rows_with_outliers(df, list_of_column_names, stdev_threshold, printDetails = False):
    set_of_indices_of_all_outliers = set()
    for column_name in list_of_column_names:
        set_of_indices_of_outliers = indices_of_outliers(df, column_name, stdev_threshold, printDetails)
        if printDetails:
            print("")
        set_of_indices_of_all_outliers = set_of_indices_of_all_outliers.union(set_of_indices_of_outliers)
    list_of_indices_to_drop = list(set_of_indices_of_all_outliers)
    if printDetails:
        print("Dropping outliers based on", stdev_threshold, "standard deviation criteria...")
    df_without_outliers = remove_rows_from_df(df, list_of_indices_to_drop, printDetails)
    return df_without_outliers

# Example of usage
list_of_column_names = ["LotFrontage", "OverallQual"]
df_without_outliers = remove_rows_with_outliers(df, list_of_column_names, stdev_threshold = 3, printDetails = True)

There are 12 outliers in LotFrontage and their indices are:
{197, 934, 807, 231, 1127, 1337, 909, 1298, 1107, 313, 1211, 1182}

There are 2 outliers in OverallQual and their indices are:
{533, 375}

Dropping outliers based on 3 standard deviation criteria...
14 observations (rows) dropped from dataframe
Dimensions of new dataframe: (1446, 86)


### Function for finding where MSSubClass conflicts with HouseStyle, i.e. - we don't know how many floors the house has

In [12]:
# Define two auxiliary functions to help us comb through this data
#   to find conflicts between the labels of the columns "MSSubClass" and "HouseStyle"
def HouseStyle_Value_Count_for_MSSubClass(df, MSSubClass_label, description):
    print("")
    print("#"*75)
    print("For MSSubClass =",MSSubClass_label,"(",description,")","\nThe values & frequencies of HouseStyle are:")
    print(df.loc[df["MSSubClass"]==MSSubClass_label,"HouseStyle"].value_counts())
    print("")

def Find_MSSubClass_HouseStyle_Conflict(df, MSSubClass_label, HouseStyle_label):
    rows_with_conflict = (df["MSSubClass"]==MSSubClass_label) & (df["HouseStyle"]==HouseStyle_label)
    if (sum(rows_with_conflict) >= 1):
        print("Grabbing problematic observations where MSSubClass =",MSSubClass_label,"but","HouseStyle =", HouseStyle_label,"...")
    else:
        raise ValueError("No rows matching this criterion!")
    return rows_with_conflict

# Example of usage
# MSSubClass = 20: 1-STORY 1946 & NEWER ALL STYLES
HouseStyle_Value_Count_for_MSSubClass(df, 20, "1-STORY 1946 & NEWER ALL STYLES")
mask_1 = Find_MSSubClass_HouseStyle_Conflict(df, 20, "2Story")
print(sum(mask_1),"observation(s)")

print("")

mask_2 = Find_MSSubClass_HouseStyle_Conflict(df, 20, "SLvl")
print(sum(mask_2),"observation(s)")


###########################################################################
For MSSubClass = 20 ( 1-STORY 1946 & NEWER ALL STYLES ) 
The values & frequencies of HouseStyle are:
1Story    534
SLvl        1
2Story      1
Name: HouseStyle, dtype: int64

Grabbing problematic observations where MSSubClass = 20 but HouseStyle = 2Story ...
1 observation(s)

Grabbing problematic observations where MSSubClass = 20 but HouseStyle = SLvl ...
1 observation(s)


### Function for finding houses that fail the sanity check, i.e. - square footage of basement = 0 but there is a bathroom in the basement, the renovation date is after the sell date, etc.

In [13]:
df.loc[df["YearRemodAdd"] > df["YrSold"], ["YearRemodAdd","YrSold","YearBuilt"]]

# Age of house = YearSold - 0.6*YearRemod - 0.4*YrBuilt
# This would produce a negative age
# Proposed solution: add 1.0 year to every single house's age...then this observation would become positive

Unnamed: 0,YearRemodAdd,YrSold,YearBuilt
523,2008,2007,2007


In [14]:
# MasVnrArea = 0, but the MasVnrType is "Stone" or "BrkFace"

x = df.loc[df["MasVnrArea"]==0]
x["MasVnrType"].value_counts()

None       859
BrkFace      1
Stone        1
Name: MasVnrType, dtype: int64

### Function to convert NA category to None

- Ordinal Categories that need to convert NA to None:

BsmtExposure, BsmtQual, FireplaceQu, GarageQual

In [15]:
def convert_NA_category(df,series_names):
    for series_name in series_names:
        df.loc[df[series_name].isnull(),series_name] = 'None'
        print(df[series_name].value_counts(sort = True))
        print('*'*20)

In [16]:
convert_NA_category(df,['BsmtExposure','BsmtQual','FireplaceQu','GarageQual'])

No      953
Av      221
Gd      134
Mn      114
None     38
Name: BsmtExposure, dtype: int64
********************
TA      649
Gd      618
Ex      121
None     37
Fa       35
Name: BsmtQual, dtype: int64
********************
None    690
Gd      380
TA      313
Fa       33
Ex       24
Po       20
Name: FireplaceQu, dtype: int64
********************
TA      1311
None      81
Fa        48
Gd        14
Po         3
Ex         3
Name: GarageQual, dtype: int64
********************


### Function to create dummify features 

In [17]:
def dummify_categorical(df, columns):
    for col in columns:
        # value_counts (sort by frequency)
        grouped = df.groupby(col)[[col]].agg('count').rename(columns={col:'count'}).sort_values(by = "count", ascending = False)
        # dummify all categories
        df = pd.get_dummies(df, columns=[col], prefix=col, prefix_sep='_') 
        # categories with frequency less than 70
        columns_to_drop = list(grouped.loc[grouped['count']<65].index)
        columns_to_drop = [col + "_" + i for i in columns_to_drop]
        # drop dummified columns
        if len(columns_to_drop) > 0:
            df = df.drop(columns = columns_to_drop,axis = 1)
        else:
            # when all the categories have frequency higher than 70
            df = df.drop(col + "_" + grouped.index[0], axis=1)
    return df

In [18]:
df_dummified = dummify_categorical(df,['CentralAir','Exterior1st','Foundation','Functional',\
                        'LotConfig','LotShape','MasVnrType','MSZoning','PavedDrive'])
df_dummified.head(5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LandContour,Utilities,LandSlope,Neighborhood,...,LotShape_IR1,LotShape_Reg,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,MSZoning_FV,MSZoning_RL,MSZoning_RM,PavedDrive_N,PavedDrive_Y
0,1,60,65.0,8450,Pave,,Lvl,AllPub,Gtl,CollgCr,...,0,1,1,0,0,0,1,0,0,1
1,2,20,80.0,9600,Pave,,Lvl,AllPub,Gtl,Veenker,...,0,1,0,1,0,0,1,0,0,1
2,3,60,68.0,11250,Pave,,Lvl,AllPub,Gtl,CollgCr,...,1,0,1,0,0,0,1,0,0,1
3,4,70,60.0,9550,Pave,,Lvl,AllPub,Gtl,Crawfor,...,1,0,0,1,0,0,1,0,0,1
4,5,60,84.0,14260,Pave,,Lvl,AllPub,Gtl,NoRidge,...,1,0,1,0,0,0,1,0,0,1


In [19]:
df_dummified.columns

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LandContour', 'Utilities', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior2nd',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'PoolQC', 'Fence', 'MiscFeature',

### Function for ordinal variables

In [20]:
def impute_ordinal(df, list_of_dic_to_replace):
    for dic in list_of_dic_to_replace:
        df = df.replace(dic)
    return df

In [21]:
list_of_dic_to_replace = [
    {"BsmtExposure": {"Gd": 5, "Av": 4, "Mn": 3, "No": 2, "None": 0}},
    {"BsmtQual": {"Ex": 7, "Gd": 4, "TA": 2, "Fa": 1, "Po": 0, "None": 0}},
    {"ExterQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 1}}, # "Po": 0
    {"FireplaceQu": {"Ex": 5, "Gd": 3, "TA": 2.5, "Fa": 1.5, "Po":0, "None": 0.5}},
    {"KitchenQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1}},    #"Po": 0
    {"GarageQual": {"Ex": 5, "Gd": 5, "TA": 4.5, "Fa": 2.5, "Po": 1, "None": 1}},
    {"HeatingQC": {"Ex": 3, "Gd": 2, "TA": 1.5, "Fa": 1, "Po": 0}}]

df_ordinal_dummified = impute_ordinal(df_dummified,list_of_dic_to_replace)
df_ordinal_dummified.head(5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LandContour,Utilities,LandSlope,Neighborhood,...,LotShape_IR1,LotShape_Reg,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,MSZoning_FV,MSZoning_RL,MSZoning_RM,PavedDrive_N,PavedDrive_Y
0,1,60,65.0,8450,Pave,,Lvl,AllPub,Gtl,CollgCr,...,0,1,1,0,0,0,1,0,0,1
1,2,20,80.0,9600,Pave,,Lvl,AllPub,Gtl,Veenker,...,0,1,0,1,0,0,1,0,0,1
2,3,60,68.0,11250,Pave,,Lvl,AllPub,Gtl,CollgCr,...,1,0,1,0,0,0,1,0,0,1
3,4,70,60.0,9550,Pave,,Lvl,AllPub,Gtl,Crawfor,...,1,0,0,1,0,0,1,0,0,1
4,5,60,84.0,14260,Pave,,Lvl,AllPub,Gtl,NoRidge,...,1,0,1,0,0,0,1,0,0,1


# (2) Actual Pipeline Using Helper Functions

### Remove outliers for all numeric features (columns)

In [22]:
list_of_continuous_columns = ['AdjTotalBsmtSF', 'GarageArea', 'GrLivArea', 'HouseAge', 'LotArea',\
                        'AdjTotalBath', 'TotRmsAbvGrd', 'AdjOutdoorSF', 'OverallQual']
df_without_outliers = remove_rows_with_outliers(df_ordinal_dummified, list_of_continuous_columns, stdev_threshold = 3, printDetails = False)

### Remove rows where MSSubClass conflicts with HouseStyle.  Just showing this for cases where MSSubClass = 20.  We need to continue searching for all cases, such as MSSubClass = 30, 40, etc...

In [23]:
df['MasVnrType'].isnull().sum()

8

# Training Linear Regression Model

In [24]:
list_of_dummified_categorical = ['CentralAir_N', 'Exterior1st_HdBoard', 'Exterior1st_MetalSd',
       'Exterior1st_Plywood', 'Exterior1st_VinylSd', 'Exterior1st_Wd Sdng',
       'Foundation_BrkTil', 'Foundation_CBlock', 'Foundation_PConc',
       'Functional_Typ', 'LotConfig_Corner', 'LotConfig_CulDSac',
       'LotConfig_Inside', 'LotShape_IR1', 'LotShape_Reg',
       'MasVnrType_BrkFace', 'MasVnrType_None', 'MasVnrType_Stone',
       'MSZoning_FV', 'MSZoning_RL', 'MSZoning_RM', 'PavedDrive_N',
       'PavedDrive_Y']

In [25]:
list_of_ordinal_categorical = ['BsmtExposure', 'BsmtQual', 'ExterQual', 'FireplaceQu', 'GarageQual',\
'HeatingQC', 'KitchenQual'] 

In [26]:
list_of_predictors = list_of_continuous_columns+list_of_dummified_categorical+list_of_ordinal_categorical
len(list_of_predictors)

39

In [27]:
X = df_ordinal_dummified[list_of_predictors]# # convert to an array 
Y = df_ordinal_dummified['LogSalePrice'].to_numpy() # convert to an array 

In [28]:
X

Unnamed: 0,AdjTotalBsmtSF,GarageArea,GrLivArea,HouseAge,LotArea,AdjTotalBath,TotRmsAbvGrd,AdjOutdoorSF,OverallQual,CentralAir_N,...,MSZoning_RM,PavedDrive_N,PavedDrive_Y,BsmtExposure,BsmtQual,ExterQual,FireplaceQu,GarageQual,HeatingQC,KitchenQual
0,841.0,548,1710,5.0,8450,3.6,8,61,7,0,...,0,0,1,2,4,4,0.5,4.5,3.0,3
1,1233.6,460,1262,31.0,9600,2.6,6,298,6,0,...,0,0,1,5,4,3,2.5,4.5,3.0,2
2,876.6,608,1786,6.4,11250,3.6,6,42,7,0,...,0,0,1,3,4,4,2.5,4.5,3.0,3
3,702.0,642,1717,58.0,9550,2.0,7,307,7,0,...,0,0,1,2,2,3,3.0,4.5,2.0,3
4,1096.0,836,2198,8.0,14260,3.6,9,276,8,0,...,0,0,1,4,4,4,2.5,4.5,3.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,857.7,460,1647,7.4,7917,2.6,7,40,6,0,...,0,0,1,2,4,3,2.5,4.5,3.0,2
1456,1483.1,500,2073,26.0,13175,3.0,7,349,6,0,...,0,0,1,2,4,3,2.5,4.5,1.5,2
1457,1064.3,252,2340,30.0,9042,2.0,9,60,7,0,...,0,0,1,2,2,5,3.0,4.5,3.0,3
1458,1078.0,240,1078,32.4,9717,2.0,5,478,5,0,...,0,0,1,3,2,3,0.5,4.5,2.0,3


# Joe testing if CV and learning curve works on dummy data

In [29]:
# Dummy data set to test if our pipeline is working
# The optimal solution should be: B_0 = 11, B_1 = -6, B_2 = 1, B_3 = 0
n_points = 161
x = np.linspace(-10, 10, n_points)
epsilon = np.random.normal(0, 1, n_points)

x1 = x 
x2 = x**2
x3 = x**3
x4 = x1

y = x**2 - 6*x + 11 + epsilon

d = {"x1": x1, "x2": x2, "x3": x3, "x4": x4, "y": y}
df_test = pd.DataFrame(data = d)
df_test

Unnamed: 0,x1,x2,x3,x4,y
0,-10.000,100.000000,-1000.000000,-10.000,171.766926
1,-9.875,97.515625,-962.966797,-9.875,168.298622
2,-9.750,95.062500,-926.859375,-9.750,165.941016
3,-9.625,92.640625,-891.666016,-9.625,163.150682
4,-9.500,90.250000,-857.375000,-9.500,158.790118
...,...,...,...,...,...
156,9.500,90.250000,857.375000,9.500,43.300382
157,9.625,92.640625,891.666016,9.625,44.424260
158,9.750,95.062500,926.859375,9.750,48.011682
159,9.875,97.515625,962.966797,9.875,50.423013


In [30]:
X_all = np.array(df_test.iloc[:,0:4])
X_all.shape

(161, 4)

In [31]:
y_all = np.array(df_test.iloc[:,-1]).reshape(-1,1)
y_all.shape

(161, 1)

In [32]:
## use train_test_split to split the dataset into training and test datasets
import 
X_train, X_test, y_train, y_test = ms.train_test_split(X_all, y_all, train_size=0.8, test_size=0.2)

NameError: name 'ms' is not defined

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error


n_splits = 4
alpha_vector = [0.0, 0.04, 0.08, 0.1, 0.2, 0.4, 1.0]
MSE_test_vector = []
MSE_train_vector = []

for alpha in alpha_vector:
    kf = KFold(n_splits = n_splits, shuffle = True)

    MSE_tests = []
    MSE_trains = []
    for train_index, test_index in kf.split(X_train):
        # print("TRAIN:", train_index)
        # print("TEST:", test_index)
        X_train_KF, X_test_KF = X_train[train_index], X_train[test_index]
        y_train_KF, y_test_KF = y_train[train_index], y_train[test_index]
        
        clf = linear_model.Lasso(alpha = alpha)
        clf.fit(X_train_KF, y_train_KF)
        
        y_test_KF_predicted = clf.predict(X_test_KF)
        y_train_KF_predicted = clf.predict(X_train_KF)
        
        MSE_test = mean_squared_error(y_test_KF_predicted, y_test_KF)        
        MSE_train = mean_squared_error(y_train_KF_predicted, y_train_KF)
        
        MSE_tests.append(MSE_test)
        MSE_trains.append(MSE_train)
        
    MSE_test_outer = mean(MSE_tests)
    MSE_train_outer = mean(MSE_trains)
    
    MSE_test_vector.append(MSE_test_outer)
    MSE_train_vector.append(MSE_train_outer)
    
    print("Average test MSE for alpha =",alpha,"is:",'{:,.2f}'.format(MSE_test_outer))
    print("Average train MSE for alpha =",alpha,"is:",'{:,.2f}'.format(MSE_train_outer))
    print("")

In [None]:
plt.plot(np.log(alpha_vector), MSE_test_vector)
plt.title("Test MSE")
plt.xlabel("log(alpha)")

In [None]:
plt.plot(np.log(alpha_vector), MSE_train_vector)
plt.title("Train MSE")
plt.xlabel("log(alpha)")

# End of Joe's testing

In [None]:
# Using 80% of train dataset to find the best lambda 
n_splits = 5 # number of folds 

list_of_exponents = np.linspace(-5,3,100)
alpha_vector = np.power(10.0, list_of_exponents) # vector of alpha/lambda 
RMSE_train_vector = []
RMSE_test_vector = []
# coefs_vector = []
selected_model = Lasso()



for alpha in alpha_vector:
    
    RMSE_train_per_fold = []
    RMSE_test_per_fold = []
    kf = KFold(n_splits = n_splits, shuffle = False)
    print("kf:",kf)
    
    for train_index, test_index in kf.split(X_train):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train_KF, X_test_KF = X_train[train_index], X_train[test_index]
        y_train_KF, y_test_KF = y_train[train_index], y_train[test_index]


        selected_model.set_params(alpha = alpha)
        #print("y_train:",y_train)
        #print("train_index:",train_index)
        #print("y_train_KF = y_train[train_index]:",y_train_KF)
        selected_model.fit(X_train_KF, y_train_KF)
        y_train_KF_predicted = selected_model.predict(X_train_KF)
        y_test_KF_predicted = selected_model.predict(X_test_KF)
        RMSE_train =sqrt(mean_squared_error(y_train_KF, y_train_KF_predicted)) # 
        RMSE_test = sqrt(mean_squared_error(y_test_KF, y_test_KF_predicted))
        R2 = r2_score(y_test_KF, y_test_KF_predicted)
        #print("R-squared=",R2)
        #print("RMSE =",'{:,.0f}'.format(RMSE))
        #print("")
        #coefs_per_fold.append(selected_model.coef_)
        RMSE_train_per_fold.append(RMSE_train)
        RMSE_test_per_fold.append(RMSE_test)
    
#     coefs = mean()
    RMSE_train_for_given_alpha = mean(RMSE_train_per_fold)
    RMSE_test_for_given_alpha = mean(RMSE_test_per_fold)
    RMSE_train_vector.append(RMSE_train_for_given_alpha)
    RMSE_test_vector.append(RMSE_test_for_given_alpha)
    
    print("For alpha =",alpha)
    print("RMSE_train = ", '{:,.2f}'.format(RMSE_train_for_given_alpha))
    print("RMSE_test = ", '{:,.2f}'.format(RMSE_test_for_given_alpha))
    print("")

print(alpha_vector)
print(RMSE_train_vector)
print(RMSE_test_vector)


In [None]:
plt.plot(list_of_exponents, RMSE_train_vector)
plt.title("Learning Curve")
plt.xlabel('log(alpha) = regularization strength')
plt.ylabel('RMSE')
plt.show()

In [None]:
plt.plot(list_of_exponents, RMSE_test_vector)
plt.title("Learning Curve")
plt.xlabel('log(alpha) = regularization strength')
plt.ylabel('RMSE')
plt.show()

In [None]:
alphas = np.logspace(-10,-2,100)
coefs = []
scores = []
lasso = Lasso()
for alpha in alphas:
    lasso.set_params(alpha=alpha)
    lasso.fit(X,Y)
    coefs.append(lasso.coef_)
    scores.append(lasso.score(X,Y))
coefs = pd.DataFrame(coefs, index = alphas, columns = X.columns )
print(coefs)

plt.rcParams['figure.figsize'] = (10,5)
for name in coefs.columns:
    plt.plot(coefs.index, coefs[name], label=name)
plt.legend(loc=4)   
plt.xlabel(r'hyperparameter $\lambda$')
plt.ylabel(r'slope values')

In [None]:
# Use the best lambda and refit using the 80% train dataset

In [None]:
# Use the coefficients to check MSE on the 20% test dataset