# Submission for Housing Prices Kaggle Competition

Practice using Linear Regression model

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

## Load data

In [2]:
df = pd.read_csv('train.csv')

X = df[['LotFrontage','YearBuilt','YearRemodAdd','Foundation','OverallQual']]
y = df['SalePrice']

df = pd.DataFrame(data={'Id':df['Id'],
                        'LotFrontage':X['LotFrontage'],
                        'YearBuilt':X['YearBuilt'],
                        'YearRemodAdd':X['YearRemodAdd'],
                        'Foundation':X['Foundation'],
                        'OverallQual':X['OverallQual'],
                        'SalePrice':y})

df.to_csv('enhanced set.csv')

df.head(10)

Unnamed: 0,Id,LotFrontage,YearBuilt,YearRemodAdd,Foundation,OverallQual,SalePrice
0,1,65.0,2003,2003,PConc,7,208500
1,2,80.0,1976,1976,CBlock,6,181500
2,3,68.0,2001,2002,PConc,7,223500
3,4,60.0,1915,1970,BrkTil,7,140000
4,5,84.0,2000,2000,PConc,8,250000
5,6,85.0,1993,1995,Wood,5,143000
6,7,75.0,2004,2005,PConc,8,307000
7,8,,1973,1973,CBlock,7,200000
8,9,51.0,1931,1950,BrkTil,7,129900
9,10,50.0,1939,1950,BrkTil,5,118000


## Eliminate NaN rows

In [3]:
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())

column_transformer = make_column_transformer((OneHotEncoder(),['Foundation']),
                                             (MinMaxScaler(),['LotFrontage','YearBuilt','YearRemodAdd','OverallQual']),
                                             remainder='passthrough')

X = df[['LotFrontage','YearBuilt','YearRemodAdd','Foundation','OverallQual']]

X = column_transformer.fit_transform(X)
X = pd.DataFrame(data=X, columns=column_transformer.get_feature_names_out())

y = df['SalePrice']

X.head(10)

Unnamed: 0,onehotencoder__Foundation_BrkTil,onehotencoder__Foundation_CBlock,onehotencoder__Foundation_PConc,onehotencoder__Foundation_Slab,onehotencoder__Foundation_Stone,onehotencoder__Foundation_Wood,minmaxscaler__LotFrontage,minmaxscaler__YearBuilt,minmaxscaler__YearRemodAdd,minmaxscaler__OverallQual
0,0.0,0.0,1.0,0.0,0.0,0.0,0.150685,0.949275,0.883333,0.666667
1,0.0,1.0,0.0,0.0,0.0,0.0,0.202055,0.753623,0.433333,0.555556
2,0.0,0.0,1.0,0.0,0.0,0.0,0.160959,0.934783,0.866667,0.666667
3,1.0,0.0,0.0,0.0,0.0,0.0,0.133562,0.311594,0.333333,0.666667
4,0.0,0.0,1.0,0.0,0.0,0.0,0.215753,0.927536,0.833333,0.777778
5,0.0,0.0,0.0,0.0,0.0,1.0,0.219178,0.876812,0.75,0.444444
6,0.0,0.0,1.0,0.0,0.0,0.0,0.184932,0.956522,0.916667,0.777778
7,0.0,1.0,0.0,0.0,0.0,0.0,0.164384,0.731884,0.383333,0.666667
8,1.0,0.0,0.0,0.0,0.0,0.0,0.10274,0.427536,0.0,0.666667
9,1.0,0.0,0.0,0.0,0.0,0.0,0.099315,0.485507,0.0,0.444444


## Separate training and testing data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

## Create and fit a linear regression model

In [5]:
clf = LinearRegression().fit(X_train, y_train)

## Make predictions

In [6]:
predictions = clf.predict(X_test)

print('Predictions:\n',predictions[:5])

print('\nTrue:\n',y.head())

difference = y_test.to_numpy() - predictions

print('\nDifference:\n',difference[:5])

Predictions:
 [237248. 183488. 258432. 241920. 212672.]

True:
 0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

Difference:
 [ 51752.  -5488. 100668. -44920.  11328.]


## Create file to export

In [7]:
df = pd.read_csv('test.csv')

column_transformer = make_column_transformer((OneHotEncoder(),['Foundation']),
                                             (MinMaxScaler(),['LotFrontage','YearBuilt','YearRemodAdd','OverallQual']),
                                             remainder='passthrough')

X_test = df[['LotFrontage','YearBuilt','YearRemodAdd','Foundation','OverallQual']]
X_test['LotFrontage'] = X_test['LotFrontage'].fillna(X_test['LotFrontage'].median())

X_test = column_transformer.fit_transform(X_test)
X_test = pd.DataFrame(data=X_test, columns=column_transformer.get_feature_names_out())



predicted_prices = clf.predict(X_test)

df = pd.DataFrame(data={'SalePrice':predicted_prices},index=df['Id'])

df.head()


df.to_csv('5_predictions.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['LotFrontage'] = X_test['LotFrontage'].fillna(X_test['LotFrontage'].median())
