# Predicting_House_Sale_Prices

In this project, we worked with housing data for the city of Ames, Iowa, United States from 2006 to 2010. You can read more about why the data was collected [here](https://doi.org/10.1080/10691898.2011.11889627). You can also read about the different columns in the data [here](https://s3.amazonaws.com/dq-content/307/data_description.txt).

In [1]:
# importing libraries

import pandas as pd
pd.options.display.max_columns = 999
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [2]:
# reading file into pandas dataframe
data_frame = pd.read_csv('AmesHousing.tsv', delimiter='\t')
data_frame.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1960,1960,Hip,CompShg,BrkFace,Plywood,Stone,112.0,TA,TA,CBlock,TA,Gd,Gd,BLQ,639.0,Unf,0.0,441.0,1080.0,GasA,Fa,Y,SBrkr,1656,0,0,1656,1.0,0.0,1,0,3,1,TA,7,Typ,2,Gd,Attchd,1960.0,Fin,2.0,528.0,TA,TA,P,210,62,0,0,0,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,7,5,1968,1968,Hip,CompShg,BrkFace,BrkFace,,0.0,Gd,TA,CBlock,TA,TA,No,ALQ,1065.0,Unf,0.0,1045.0,2110.0,GasA,Ex,Y,SBrkr,2110,0,0,2110,1.0,0.0,2,1,3,1,Ex,8,Typ,2,TA,Attchd,1968.0,Fin,2.0,522.0,TA,TA,Y,0,0,0,0,0,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [3]:
# creating functions

def transform_features(data_frame):
    return data_frame

def select_features(data_frame):
    return data_frame[['Gr Liv Area', 'SalePrice']]

def train_and_test(data_frame):
    train = data_frame[:1460]
    test = data_frame[1460:]
    
    # selecting all numercial columns except
    # 'SalePrice' column (the target column)
    numeric_train = train.select_dtypes(include=['int','float'])
    numeric_test = test.select_dtypes(include=['int', 'float'])
    
    features = numeric_train.columns.drop('SalePrice')
    lr = LinearRegression()
    lr.fit(train[features], train['SalePrice'])
    predictions = lr.predict(test[features])
    mse = mean_squared_error(test['SalePrice'], predictions)
    root_mse = np.sqrt(mse)
    return root_mse

# transforming features
transform_dataframe = transform_features(data_frame)
filtered_dataframe = select_features(transform_dataframe)
root_mean_squared_error = train_and_test(filtered_dataframe)
root_mean_squared_error

57088.25161263909

#### Feature Engineering

In [4]:
# count of missing values

count_missing = data_frame.isnull().sum()

count_missing

Order                0
PID                  0
MS SubClass          0
MS Zoning            0
Lot Frontage       490
Lot Area             0
Street               0
Alley             2732
Lot Shape            0
Land Contour         0
Utilities            0
Lot Config           0
Land Slope           0
Neighborhood         0
Condition 1          0
Condition 2          0
Bldg Type            0
House Style          0
Overall Qual         0
Overall Cond         0
Year Built           0
Year Remod/Add       0
Roof Style           0
Roof Matl            0
Exterior 1st         0
Exterior 2nd         0
Mas Vnr Type        23
Mas Vnr Area        23
Exter Qual           0
Exter Cond           0
                  ... 
Bedroom AbvGr        0
Kitchen AbvGr        0
Kitchen Qual         0
TotRms AbvGrd        0
Functional           0
Fireplaces           0
Fireplace Qu      1422
Garage Type        157
Garage Yr Blt      159
Garage Finish      159
Garage Cars          1
Garage Area          1
Garage Qual

In [5]:
# dropping columns containing more than 5% missing values
drop_missing_cols = count_missing[(count_missing > len(data_frame)*0.05)
                                 ].sort_values()
data_frame = data_frame.drop(drop_missing_cols.index, axis=1)

In [6]:
data_frame.isnull().sum()

Order               0
PID                 0
MS SubClass         0
MS Zoning           0
Lot Area            0
Street              0
Lot Shape           0
Land Contour        0
Utilities           0
Lot Config          0
Land Slope          0
Neighborhood        0
Condition 1         0
Condition 2         0
Bldg Type           0
House Style         0
Overall Qual        0
Overall Cond        0
Year Built          0
Year Remod/Add      0
Roof Style          0
Roof Matl           0
Exterior 1st        0
Exterior 2nd        0
Mas Vnr Type       23
Mas Vnr Area       23
Exter Qual          0
Exter Cond          0
Foundation          0
Bsmt Qual          80
                   ..
Electrical          1
1st Flr SF          0
2nd Flr SF          0
Low Qual Fin SF     0
Gr Liv Area         0
Bsmt Full Bath      2
Bsmt Half Bath      2
Full Bath           0
Half Bath           0
Bedroom AbvGr       0
Kitchen AbvGr       0
Kitchen Qual        0
TotRms AbvGrd       0
Functional          0
Fireplaces

In [7]:
# dropping text columns containing one or more missing values
count_text_mv = (data_frame.select_dtypes(include=['object'])
                 .isnull().sum().sort_values(ascending=False)
                )
drop_text_mv = count_text_mv[count_text_mv > 0]

data_frame = data_frame.drop(drop_text_mv.index, axis=1)

In [8]:
# filling numerical columns that contain less than 5% missing values
# with common values of each column

missing_count = data_frame.select_dtypes(include=['int', 'float']).isnull().sum()

# filtering numeric columns fulfilling criteria of < 5% missing values
count_numeric_cols = missing_count[(missing_count < len(data_frame)*0.05) & 
                                   (missing_count > 0)
                                  ].sort_values()
count_numeric_cols

BsmtFin SF 1       1
BsmtFin SF 2       1
Bsmt Unf SF        1
Total Bsmt SF      1
Garage Cars        1
Garage Area        1
Bsmt Full Bath     2
Bsmt Half Bath     2
Mas Vnr Area      23
dtype: int64

In [9]:
# calculating common values of each column for replacement

replacement_val_dict = (data_frame[count_numeric_cols.index].mode()
                        .to_dict(orient='records')[0]
                       )
replacement_val_dict

{'Bsmt Full Bath': 0.0,
 'Bsmt Half Bath': 0.0,
 'Bsmt Unf SF': 0.0,
 'BsmtFin SF 1': 0.0,
 'BsmtFin SF 2': 0.0,
 'Garage Area': 0.0,
 'Garage Cars': 2.0,
 'Mas Vnr Area': 0.0,
 'Total Bsmt SF': 0.0}

In [10]:
# filling na values in numercial columns fulfilling criteria
# with common values in each column

data_frame = data_frame.fillna(replacement_val_dict)

In [11]:
# verifying that every column has 0 missing values

data_frame.isnull().sum().value_counts()

0    64
dtype: int64

In [12]:
# creating new features that better capture the information
# in some of the features

# examining unusual difference in non-numerical columns
# ('Yr Sold') & ('Year Built')
years_sold = data_frame['Yr Sold'] - data_frame['Year Built']
years_sold[years_sold < 0]

2180   -1
dtype: int64

In [13]:
# examining unusual difference in non-numerical columns
# ('Yr Sold') & ('Year Remod/Add')

years_since_remod = data_frame['Yr Sold'] - data_frame['Year Remod/Add']
years_since_remod[years_since_remod < 0]

1702   -1
2180   -2
2181   -1
dtype: int64

In [14]:
# creating new columns

data_frame['Years Before Sale'] = years_sold
data_frame['Years Since Remodeling'] = years_since_remod

# removing rows having unusual difference for 
# both of these new features columns
data_frame = data_frame.drop([1702, 2180, 2181], axis=0)

# dropping old columns 'Year Built' & 'Year Remod/Add'
data_frame = data_frame.drop(['Year Built', 'Year Remod/Add'],
                             axis=1)

In [15]:
# dropping columns that are not useful for machine learning
data_frame = data_frame.drop(['PID', 'Order'], axis=1)

# dropping columns that leak information about the final sale
data_frame = data_frame.drop(['Mo Sold', 'Sale Condition',
                              'Sale Type', 'Yr Sold'], axis=1
                            )

In [16]:
# updating transform_features() so that any column from the data frame
# with more than 25% (or another cutoff value) missing values
# is dropped

def transform_features(data_frame):
    count_missing = data_frame.isnull().sum()
    drop_missing_cols = count_missing[(count_missing > len(data_frame)*0.05)].sort_values()
    data_frame = data_frame.drop(drop_missing_cols.index, axis=1)
    
    count_text_mv = data_frame.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)
    drop_text_mv = count_text_mv[count_text_mv > 0]
    data_frame = data_frame.drop(drop_text_mv.index, axis=1)
    
    missing_count = data_frame.select_dtypes(include=['int', 'float']).isnull().sum()
    # filtering numeric columns fulfilling criteria of < 5% missing values
    count_numeric_cols = missing_count[(missing_count < len(data_frame)*0.05) & (missing_count > 0)].sort_values()
    replacement_val_dict = data_frame[count_numeric_cols.index].mode().to_dict(orient='records')[0]
    data_frame = data_frame.fillna(replacement_val_dict)
    
    years_sold = data_frame['Yr Sold'] - data_frame['Year Built']
    years_since_remod = data_frame['Yr Sold'] - data_frame['Year Remod/Add']
    data_frame['Years Before Sale'] = years_sold
    data_frame['Years Since Remodeling'] = years_since_remod
    data_frame = data_frame.drop([1702, 2180, 2181], axis=0)

    data_frame = data_frame.drop(['Year Built', 'Year Remod/Add', 'PID', 'Order', 'Mo Sold', 'Sale Condition', 'Sale Type', 'Yr Sold'], axis=1)
    
    return data_frame

def select_features(data_frame):
    return data_frame[['Gr Liv Area', 'SalePrice']]

def train_and_test(data_frame):
    train = data_frame[:1460]
    test = data_frame[1460:]
    
    # selecting all numercial columns except
    # 'SalePrice' column (the target column)
    numeric_train = train.select_dtypes(include=['int','float'])
    numeric_test = test.select_dtypes(include=['int', 'float'])
    
    features = numeric_train.columns.drop('SalePrice')
    lr = LinearRegression()
    lr.fit(train[features], train['SalePrice'])
    predictions = lr.predict(test[features])
    mse = mean_squared_error(test['SalePrice'], predictions)
    root_mse = np.sqrt(mse)
    return root_mse

# re-reading file into dataframe
data_frame = pd.read_csv('AmesHousing.tsv', delimiter='\t')

# transforming features
transform_dataframe = transform_features(data_frame)
filtered_dataframe = select_features(transform_dataframe)
root_mean_squared_error = train_and_test(filtered_dataframe)
root_mean_squared_error

55275.36731241307

#### Features selection for numerical features

In [17]:
numerical_dataframe = transform_dataframe.select_dtypes(include=['int', 'float'])
numerical_dataframe.head()

Unnamed: 0,MS SubClass,Lot Area,Overall Qual,Overall Cond,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,TotRms AbvGrd,Fireplaces,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,SalePrice,Years Before Sale,Years Since Remodeling
0,20,31770,6,5,112.0,639.0,0.0,441.0,1080.0,1656,0,0,1656,1.0,0.0,1,0,3,1,7,2,2.0,528.0,210,62,0,0,0,0,0,215000,50,50
1,20,11622,5,6,0.0,468.0,144.0,270.0,882.0,896,0,0,896,0.0,0.0,1,0,2,1,5,0,1.0,730.0,140,0,0,0,120,0,0,105000,49,49
2,20,14267,6,6,108.0,923.0,0.0,406.0,1329.0,1329,0,0,1329,0.0,0.0,1,1,3,1,6,0,1.0,312.0,393,36,0,0,0,0,12500,172000,52,52
3,20,11160,7,5,0.0,1065.0,0.0,1045.0,2110.0,2110,0,0,2110,1.0,0.0,2,1,3,1,8,2,2.0,522.0,0,0,0,0,0,0,0,244000,42,42
4,60,13830,5,5,0.0,791.0,0.0,137.0,928.0,928,701,0,1629,0.0,0.0,2,1,3,1,6,1,2.0,482.0,212,34,0,0,0,0,0,189900,13,12


In [18]:
# generating correlation between the features and target columns

absolute_corr_coeff = numerical_dataframe.corr()['SalePrice'].abs().sort_values()
absolute_corr_coeff

BsmtFin SF 2              0.006127
Misc Val                  0.019273
3Ssn Porch                0.032268
Bsmt Half Bath            0.035875
Low Qual Fin SF           0.037629
Pool Area                 0.068438
MS SubClass               0.085128
Overall Cond              0.101540
Screen Porch              0.112280
Kitchen AbvGr             0.119760
Enclosed Porch            0.128685
Bedroom AbvGr             0.143916
Bsmt Unf SF               0.182751
Lot Area                  0.267520
2nd Flr SF                0.269601
Bsmt Full Bath            0.276258
Half Bath                 0.284871
Open Porch SF             0.316262
Wood Deck SF              0.328183
BsmtFin SF 1              0.439284
Fireplaces                0.474831
TotRms AbvGrd             0.498574
Mas Vnr Area              0.506983
Years Since Remodeling    0.534985
Full Bath                 0.546118
Years Before Sale         0.558979
1st Flr SF                0.635185
Garage Area               0.641425
Total Bsmt SF       

In [19]:
# keeping features having correlation coefficient of larger
# than 0.4 (this is subjective, change this to examine diff results)

absolute_corr_coeff[absolute_corr_coeff > 0.4]

BsmtFin SF 1              0.439284
Fireplaces                0.474831
TotRms AbvGrd             0.498574
Mas Vnr Area              0.506983
Years Since Remodeling    0.534985
Full Bath                 0.546118
Years Before Sale         0.558979
1st Flr SF                0.635185
Garage Area               0.641425
Total Bsmt SF             0.644012
Garage Cars               0.648361
Gr Liv Area               0.717596
Overall Qual              0.801206
SalePrice                 1.000000
Name: SalePrice, dtype: float64

In [20]:
# dropping features columns with less than 0.4 correlation
# coefficient with SalePrice

transform_dataframe = transform_dataframe.drop(absolute_corr_coeff[absolute_corr_coeff < 0.4].index, axis=1)

In [21]:
transform_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2927 entries, 0 to 2929
Data columns (total 39 columns):
MS Zoning                 2927 non-null object
Street                    2927 non-null object
Lot Shape                 2927 non-null object
Land Contour              2927 non-null object
Utilities                 2927 non-null object
Lot Config                2927 non-null object
Land Slope                2927 non-null object
Neighborhood              2927 non-null object
Condition 1               2927 non-null object
Condition 2               2927 non-null object
Bldg Type                 2927 non-null object
House Style               2927 non-null object
Overall Qual              2927 non-null int64
Roof Style                2927 non-null object
Roof Matl                 2927 non-null object
Exterior 1st              2927 non-null object
Exterior 2nd              2927 non-null object
Mas Vnr Area              2927 non-null float64
Exter Qual                2927 non-null object


In [22]:
# columns that can be categorized as nominal variables are candidates for
# being coverted to categorical. creating a list of column names from
# documentation that are meant to be categorical

nominal_features = ['PID', 'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood', 
                    'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 
                    'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type', 
                    'Misc Feature', 'Sale Type', 'Sale Condition']

In [23]:
# testing which categorical columns have we still carried with us

# creating list of categorical columns after cross verification of column names
transform_category_cols = []
for col in nominal_features:
    if col in transform_dataframe.columns:
        transform_category_cols.append(col)

# examining unique values in each categorical column
unique_counts = transform_dataframe[transform_category_cols].apply(lambda col: len(col.value_counts())).sort_values()
# subjective cutoff (10 unique values)
drop_non_unique_cols = unique_counts[unique_counts > 10].index
transform_dataframe = transform_dataframe.drop(drop_non_unique_cols, axis=1)

In [24]:
# selecting the remaining text columns to convert them into categorical cols

text_cols = transform_dataframe.select_dtypes(include=['object'])
for col in text_cols:
    transform_dataframe[col] = transform_dataframe[col].astype('category')

# creating dummy columns for representation in regression model
# and adding them back to dataframe
dummies_dataframe = pd.get_dummies(transform_dataframe.select_dtypes(include=['category']))

transform_dataframe = pd.concat([transform_dataframe, dummies_dataframe], axis=1)

# dropping old text columns from dataframe
transform_dataframes = transform_dataframe.drop(text_cols,axis=1)

In [25]:
# updating select_features() to take in the new, modified train and test
# dataframes that were returned from transform_features()

def transform_features(data_frame):
    count_missing = data_frame.isnull().sum()
    drop_missing_cols = count_missing[(count_missing > len(data_frame)*0.05)].sort_values()
    data_frame = data_frame.drop(drop_missing_cols.index, axis=1)
    
    count_text_mv = data_frame.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)
    drop_text_mv = count_text_mv[count_text_mv > 0]
    data_frame = data_frame.drop(drop_text_mv.index, axis=1)
    
    missing_count = data_frame.select_dtypes(include=['int', 'float']).isnull().sum()
    count_numeric_cols = missing_count[(missing_count < len(data_frame)*0.05) & (missing_count > 0)].sort_values()
    replacement_val_dict = data_frame[count_numeric_cols.index].mode().to_dict(orient='records')[0]
    data_frame = data_frame.fillna(replacement_val_dict)
    
    years_sold = data_frame['Yr Sold'] - data_frame['Year Built']
    years_since_remod = data_frame['Yr Sold'] - data_frame['Year Remod/Add']
    data_frame['Years Before Sale'] = years_sold
    data_frame['Years Since Remodeling'] = years_since_remod
    data_frame = data_frame.drop([1702, 2180, 2181], axis=0)

    data_frame = data_frame.drop(['Year Built', 'Year Remod/Add', 'PID', 'Order', 'Mo Sold', 'Sale Condition', 'Sale Type', 'Yr Sold'], axis=1)
    
    return data_frame

def select_features(data_frame, coefficient_threshold=0.4, unique_threshold=10):
    numerical_dataframe = data_frame.select_dtypes(include=['int', 'float'])
    absolute_corr_coeff = numerical_dataframe.corr()['SalePrice'].abs().sort_values()
    data_frame = data_frame.drop(absolute_corr_coeff[absolute_corr_coeff < coefficient_threshold].index, axis=1)
    
    nominal_features = ['PID', 'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood', 
                        'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 
                        'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type', 
                        'Misc Feature', 'Sale Type', 'Sale Condition']

    transform_category_cols = []
    for col in nominal_features:
        if col in data_frame.columns:
            transform_category_cols.append(col)

    unique_counts = data_frame[transform_category_cols].apply(lambda col: len(col.value_counts())).sort_values()
    drop_non_unique_cols = unique_counts[unique_counts > 10].index
    data_frame = data_frame.drop(drop_non_unique_cols, axis=1)

    text_cols = data_frame.select_dtypes(include=['object'])
    for col in text_cols:
        data_frame[col] = data_frame[col].astype('category')

    dummies_dataframe = pd.get_dummies(data_frame.select_dtypes(include=['category']))
    data_frame = pd.concat([data_frame, dummies_dataframe], axis=1)
    data_frame = transform_dataframe.drop(text_cols,axis=1)
    
    return data_frame
    
def train_and_test(data_frame, k=0):
    numeric_dataframe = data_frame.select_dtypes(include=['int','float'])
    features = numeric_dataframe.columns.drop('SalePrice')
    lr = LinearRegression()
    
    if k == 0:
        train = data_frame[:1460]
        test = data_frame[1460:]
        
        lr.fit(train[features], train['SalePrice'])
        predictions = lr.predict(test[features])
        mse = mean_squared_error(test['SalePrice'], predictions)
        root_mse = np.sqrt(mse)
        return root_mse
    
    if k == 1:
        # shuffling teh ordering of rows in dataframe
        shuffled_dataframe = data_frame.sample(frac=1,)
        fold_one = data_frame[:1460]
        fold_two = data_frame[1460:]
        
        lr.fit(fold_one[features], fold_one['SalePrice'])
        predictions_one = lr.predict(fold_two[features])
        
        mse_one = mean_squared_error(fold_two['SalePrice'], predictions_one)
        root_mse_one = np.sqrt(mse_one)
        
        lr.fit(fold_two[features], fold_two['SalePrice'])
        predictions_two = lr.predict(fold_one[features])
        
        mse_two = mean_squared_error(fold_one['SalePrice'], predictions_two)
        root_mse_two = np.sqrt(mse_two)
        
        avg_root_mse = np.mean([mse_one, mse_two])
        print(root_mse_one)
        print(root_mse_two)
        return avg_root_mse
        
    else:
        kf = KFold(n_splits=k, shuffle=True)
        root_mse_values = []
        
        for train_index, test_index in kf.split(data_frame):
            train = data_frame.iloc[train_index]
            test = data_frame.iloc[test_index]
            lr.fit(train[features], train['SalePrice'])
            predictions = lr.predict(test[features])
            mse = mean_squared_error(test['SalePrice'], predictions)
            root_mse = np.sqrt(mse)
            root_mse_values.append(root_mse)
        print(root_mse_values)
        avg_root_mse = np.mean(root_mse_values)
        return avg_root_mse
    
# re-reading file into dataframe
data_frame = pd.read_csv('AmesHousing.tsv', delimiter='\t')

# transforming features
transform_dataframe = transform_features(data_frame)
filtered_dataframe = select_features(transform_dataframe)
root_mean_squared_error = train_and_test(filtered_dataframe, k=4)

root_mean_squared_error

[31906.496303371943, 28146.71208444267, 39354.357572623536, 28649.4203392131]


32014.246574912813