In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('data/ames.csv')
data

Unnamed: 0,Order,PID,area,price,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Street,Alley,...,Screen.Porch,Pool.Area,Pool.QC,Fence,Misc.Feature,Misc.Val,Mo.Sold,Yr.Sold,Sale.Type,Sale.Condition
0,1,526301100,1656,215000,20,RL,141.0,31770,Pave,,...,0,0,,,,0,5,2010,WD,Normal
1,2,526350040,896,105000,20,RH,80.0,11622,Pave,,...,120,0,,MnPrv,,0,6,2010,WD,Normal
2,3,526351010,1329,172000,20,RL,81.0,14267,Pave,,...,0,0,,,Gar2,12500,6,2010,WD,Normal
3,4,526353030,2110,244000,20,RL,93.0,11160,Pave,,...,0,0,,,,0,4,2010,WD,Normal
4,5,527105010,1629,189900,60,RL,74.0,13830,Pave,,...,0,0,,MnPrv,,0,3,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,1003,142500,80,RL,37.0,7937,Pave,,...,0,0,,GdPrv,,0,3,2006,WD,Normal
2926,2927,923276100,902,131000,20,RL,,8885,Pave,,...,0,0,,MnPrv,,0,6,2006,WD,Normal
2927,2928,923400125,970,132000,85,RL,62.0,10441,Pave,,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
2928,2929,924100070,1389,170000,20,RL,77.0,10010,Pave,,...,0,0,,,,0,4,2006,WD,Normal


In [3]:
def fix_na(df, column, value):
    '''
    Fill missing data points with a specific function
    '''
    df[column] = df[column].fillna(value)

def change_dtype(df, column, map_fxn):
    '''
    Convert a column to a new data type
    '''
    df[column] = df[column].map(map_fxn)

In [4]:
fix_na(data, 'Lot.Frontage', data['Lot.Frontage'].mean())

In [5]:
fix_na(data, 'Mas.Vnr.Type', 'None')

In [6]:
fix_na(data, 'Mas.Vnr.Area', 0)
fix_na(data, 'Electrical', 'SBrkr')
fix_na(data, 'BsmtFin.SF.1', 0)
fix_na(data, 'BsmtFin.SF.2', 0)
fix_na(data, 'Bsmt.Unf.SF', 0)
fix_na(data, 'Total.Bsmt.SF', 0)
fix_na(data, 'Bsmt.Full.Bath', 0)
fix_na(data, 'Bsmt.Half.Bath', 0)
fix_na(data, ['Garage.Cars', 'Garage.Area'], 0)

In [7]:
cols_fix = ['Garage.Type', 'Garage.Yr.Blt', 'Garage.Finish', 'Garage.Qual', 'Garage.Cond', 'Alley', 'Bsmt.Qual',
            'Bsmt.Cond', 'Bsmt.Exposure', 'BsmtFin.Type.1', 'BsmtFin.Type.2', 'Fireplace.Qu', 'Pool.QC',
            'Fence', 'Misc.Feature']
for col in cols_fix:
    fix_na(data, col, 'NA')

In [8]:
int_to_string = ['Order', 'PID']

for item in int_to_string:
    change_dtype(data, item, str)

In [9]:
categorical_vars = data.select_dtypes(exclude = ['number'])
dropped_cats = ['Id', 'PID', 'Kitchen.Qual', 'BsmtFin.Type.1', 'Sale.Type', 'Mas.Vnr.Type', 
                'Condition.1', 'MS.SubClass', 'Neighborhood', 'MS.Zoning']
dummify_list_1 = [category for category in categorical_vars if category not in dropped_cats]
train_and_test_1 = pd.get_dummies(data = data, columns = dummify_list_1, drop_first = True)
train_and_test_1['MS.SubClass'] = train_and_test_1['MS.SubClass'].apply(lambda x:int(x))
train_and_test_1

Unnamed: 0,PID,area,price,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Neighborhood,Condition.1,Overall.Qual,...,Misc.Feature_Gar2,Misc.Feature_NA,Misc.Feature_Othr,Misc.Feature_Shed,Misc.Feature_TenC,Sale.Condition_AdjLand,Sale.Condition_Alloca,Sale.Condition_Family,Sale.Condition_Normal,Sale.Condition_Partial
0,526301100,1656,215000,20,RL,141.00000,31770,NAmes,Norm,6,...,0,1,0,0,0,0,0,0,1,0
1,526350040,896,105000,20,RH,80.00000,11622,NAmes,Feedr,5,...,0,1,0,0,0,0,0,0,1,0
2,526351010,1329,172000,20,RL,81.00000,14267,NAmes,Norm,6,...,1,0,0,0,0,0,0,0,1,0
3,526353030,2110,244000,20,RL,93.00000,11160,NAmes,Norm,7,...,0,1,0,0,0,0,0,0,1,0
4,527105010,1629,189900,60,RL,74.00000,13830,Gilbert,Norm,5,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,923275080,1003,142500,80,RL,37.00000,7937,Mitchel,Norm,6,...,0,1,0,0,0,0,0,0,1,0
2926,923276100,902,131000,20,RL,69.22459,8885,Mitchel,Norm,5,...,0,1,0,0,0,0,0,0,1,0
2927,923400125,970,132000,85,RL,62.00000,10441,Mitchel,Norm,5,...,0,0,0,1,0,0,0,0,1,0
2928,924100070,1389,170000,20,RL,77.00000,10010,Mitchel,Norm,5,...,0,1,0,0,0,0,0,0,1,0


In [10]:
drop_col = []
for i in train_and_test_1.columns:
    ty = type(train_and_test_1[i][0])
    if ty != np.int64 and ty != np.float64 and ty != np.uint8:
        drop_col.append(i)
        print(i, ty)

PID <class 'str'>
MS.Zoning <class 'str'>
Neighborhood <class 'str'>
Condition.1 <class 'str'>
Mas.Vnr.Type <class 'str'>
BsmtFin.Type.1 <class 'str'>
Kitchen.Qual <class 'str'>
Sale.Type <class 'str'>


In [11]:
train_and_test = train_and_test_1.drop(drop_col, 1)
train_and_test

Unnamed: 0,area,price,MS.SubClass,Lot.Frontage,Lot.Area,Overall.Qual,Overall.Cond,Year.Built,Year.Remod.Add,Mas.Vnr.Area,...,Misc.Feature_Gar2,Misc.Feature_NA,Misc.Feature_Othr,Misc.Feature_Shed,Misc.Feature_TenC,Sale.Condition_AdjLand,Sale.Condition_Alloca,Sale.Condition_Family,Sale.Condition_Normal,Sale.Condition_Partial
0,1656,215000,20,141.00000,31770,6,5,1960,1960,112.0,...,0,1,0,0,0,0,0,0,1,0
1,896,105000,20,80.00000,11622,5,6,1961,1961,0.0,...,0,1,0,0,0,0,0,0,1,0
2,1329,172000,20,81.00000,14267,6,6,1958,1958,108.0,...,1,0,0,0,0,0,0,0,1,0
3,2110,244000,20,93.00000,11160,7,5,1968,1968,0.0,...,0,1,0,0,0,0,0,0,1,0
4,1629,189900,60,74.00000,13830,5,5,1997,1998,0.0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,1003,142500,80,37.00000,7937,6,6,1984,1984,0.0,...,0,1,0,0,0,0,0,0,1,0
2926,902,131000,20,69.22459,8885,5,5,1983,1983,0.0,...,0,1,0,0,0,0,0,0,1,0
2927,970,132000,85,62.00000,10441,5,5,1992,1992,0.0,...,0,0,0,1,0,0,0,0,1,0
2928,1389,170000,20,77.00000,10010,5,5,1974,1975,0.0,...,0,1,0,0,0,0,0,0,1,0


In [29]:
X = train_and_test.drop('price', 1)
y = train_and_test['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1234567)

In [30]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
#np.sqrt(mean_squared_error(y_pred_train, train))
r2_score(y_pred_train, y_train)

1.0

In [31]:
y_pred_test = lr.predict(X_test)
r2_score(y_pred_test, y_test)

0.7860424191406232

In [32]:
for i in np.arange(0, 10, 1):
    ridge = Ridge(alpha = i)
    ridge.fit(X_train, y_train)
    y_pred_train = ridge.predict(X_train)
    r2_train = r2_score(y_pred_train, y_train)
    
    y_pred_test = ridge.predict(X_test)
    r2_test = r2_score(y_pred_test, y_test)
    
    print("alpha: ", i)
    print("train r2: ", r2_train)
    print("test r2: ", r2_test)
    print("-------------")



alpha:  0
train r2:  0.9999999998676217
test r2:  0.7860424628919164
-------------
alpha:  1
train r2:  0.9753652039988399
test r2:  0.789649614208366
-------------
alpha:  2
train r2:  0.9545044654352925
test r2:  0.7911367238062242
-------------
alpha:  3
train r2:  0.9410657273391808
test r2:  0.7917728809345007
-------------
alpha:  4
train r2:  0.9318005173340317
test r2:  0.7919917689734745
-------------
alpha:  5
train r2:  0.9249996415302628
test r2:  0.7919852925855277
-------------
alpha:  6
train r2:  0.9197637574002311
test r2:  0.791849536585887
-------------
alpha:  7
train r2:  0.9155836083854912
test r2:  0.7916368856173861
-------------
alpha:  8
train r2:  0.9121505164840954
test r2:  0.791377686828366
-------------
alpha:  9
train r2:  0.9092666488537015
test r2:  0.7910904853799949
-------------


In [33]:
lasso = Lasso()
lasso.fit(X_train, y_train)
y_pred_train = lasso.predict(X_train)
r2_score(y_pred_train, y_train)

  positive)


0.9992339959372303

In [34]:
y_pred_test = lasso.predict(X_test)
r2_score(y_pred_test, y_test)

0.7738809398957179