In [1]:
#importing
import pandas as pd
import numpy as np
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [2]:
#Splitting
numericColumns = train.select_dtypes([np.number]).dropna() #Select_dtypes filters columns that meet certain condition. np.number returns list of numeric columns
nonNumeric = train.drop(numericColumns, axis = 1)




#Selected every column which has r^2 w/ SalesPrice > 0.3
numericFeaturesSelected = numericColumns[["OverallQual", "YearBuilt", "YearRemodAdd", "MasVnrArea", "LotArea", "LotFrontage", "BsmtFinSF1", "1stFlrSF", "TotalBsmtSF", "2ndFlrSF", "GrLivArea", "TotRmsAbvGrd", "GarageArea", "WoodDeckSF", "OpenPorchSF", "SalePrice"]]

#Selected columns which sounded potnetially meannigful
nonnumericFeaturesSelected = nonNumeric[["Neighborhood", "SaleCondition"]]

#Adding Dummise
nonnumericDummiesOnly = pd.get_dummies(nonnumericFeaturesSelected, prefix = "Dummy")
nonnumericDummiesOnly = nonnumericDummiesOnly.select_dtypes([np.number])

train = pd.concat([numericFeaturesSelected, nonnumericDummiesOnly], axis = 1)
numericFeaturesSelected

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,LotArea,LotFrontage,BsmtFinSF1,1stFlrSF,TotalBsmtSF,2ndFlrSF,GrLivArea,TotRmsAbvGrd,GarageArea,WoodDeckSF,OpenPorchSF,SalePrice
0,7,2003,2003,196.0,8450,65.0,706,856,856,854,1710,8,548,0,61,208500
1,6,1976,1976,0.0,9600,80.0,978,1262,1262,0,1262,6,460,298,0,181500
2,7,2001,2002,162.0,11250,68.0,486,920,920,866,1786,6,608,0,42,223500
3,7,1915,1970,0.0,9550,60.0,216,961,756,756,1717,7,642,0,35,140000
4,8,2000,2000,350.0,14260,84.0,655,1145,1145,1053,2198,9,836,192,84,250000
5,5,1993,1995,0.0,14115,85.0,732,796,796,566,1362,5,480,40,30,143000
6,8,2004,2005,186.0,10084,75.0,1369,1694,1686,0,1694,7,636,255,57,307000
8,7,1931,1950,0.0,6120,51.0,0,1022,952,752,1774,8,468,90,0,129900
9,5,1939,1950,0.0,7420,50.0,851,1077,991,0,1077,5,205,0,4,118000
10,5,1965,1965,0.0,11200,70.0,906,1040,1040,0,1040,5,384,0,0,129500


In [3]:
#Cleaning (Initial row ct 1460)
train = train.dropna() #Removed null values (Row ct 1121)

In [4]:
#KEY PERFORM SAME ACTIONS ON BOTH X TEST AND X TRAIN BUT ONLY LOOK AT X TRAIN TO FIGURE OUT WHAT MATTERS 
y = train["SalePrice"]
x = train.drop(["SalePrice"], axis = 1) #Needs to be in list, axis = 1 look for columns, default axis =0 looks for rows 
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 2)
xTrain

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,LotArea,LotFrontage,BsmtFinSF1,1stFlrSF,TotalBsmtSF,2ndFlrSF,...,Dummy_Somerst,Dummy_StoneBr,Dummy_Timber,Dummy_Veenker,Dummy_Abnorml,Dummy_AdjLand,Dummy_Alloca,Dummy_Family,Dummy_Normal,Dummy_Partial
1028,5.0,1941.0,1950.0,0.0,9492.0,79.0,368.0,968.0,768.0,408.0,...,0,0,0,0,0,0,0,0,1,0
605,7.0,1965.0,1990.0,176.0,13600.0,85.0,454.0,1186.0,768.0,800.0,...,0,0,0,0,0,0,0,0,1,0
1136,6.0,1950.0,1950.0,0.0,9600.0,80.0,280.0,1032.0,1032.0,220.0,...,0,0,0,0,1,0,0,0,0,0
381,7.0,2006.0,2006.0,0.0,7200.0,60.0,0.0,1301.0,1293.0,0.0,...,1,0,0,0,0,0,0,0,0,1
93,6.0,1910.0,1998.0,0.0,7200.0,60.0,1046.0,1260.0,1214.0,1031.0,...,0,0,0,0,0,0,0,0,1,0
1392,5.0,1967.0,1967.0,0.0,7838.0,68.0,769.0,900.0,864.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1360,5.0,1921.0,1998.0,0.0,9842.0,51.0,0.0,990.0,612.0,1611.0,...,0,0,0,0,0,0,0,0,1,0
1113,5.0,1953.0,2006.0,0.0,8923.0,66.0,643.0,1008.0,1008.0,0.0,...,0,0,0,0,0,0,0,0,1,0
295,6.0,1984.0,1984.0,0.0,7937.0,37.0,819.0,1003.0,1003.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1095,6.0,2006.0,2006.0,0.0,9317.0,78.0,24.0,1314.0,1314.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
#Regression 
lin = LinearRegression()
lin.fit(xTrain, yTrain)




LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [6]:
print(lin.coef_)

[ 1.36756902e+04  2.41312047e+02  3.15355727e+02  1.77476248e+01
  9.50756076e-01  1.68245156e+02  2.55192051e+01  6.92294914e+01
  2.01895960e+01  6.94952211e+01 -1.99686283e+01  4.68494257e+02
  2.15970383e+01  1.49698212e+01 -1.06116702e+01 -1.54653301e+04
 -4.38261992e+02 -1.68217206e+04  8.96787092e+03  2.36199082e+03
 -7.33321314e+03  2.01221313e+04 -1.80936100e+04 -1.13862675e+04
 -4.62186935e+00 -8.41215730e+03 -1.70864631e+04 -8.88924483e+03
 -4.44645268e+03 -1.75967128e+04  3.53867883e+04  2.94465223e+04
 -1.06508593e+04 -5.47439252e+03 -1.00519941e+04 -1.14948891e+04
  2.04633553e+03  4.26539296e+04 -7.60639089e+02  2.34212611e+04
 -8.73489446e+03  3.13813737e+04 -1.28756507e+04 -2.47314736e+04
 -2.83034759e+03  1.77909927e+04]


In [7]:
print(lin.intercept_)

-1144258.47153079


In [8]:
print(r2_score(lin.predict(xTrain), yTrain))

0.8362522780892871


In [9]:
lin.predict(xTest)

array([126398.79422412, 115538.23885192, 114394.00770683, 150853.64561598,
       214463.36062002, 293244.34822436, 132707.39112347, 146364.24645916,
       107348.66666836, 193969.159198  , 348060.04743574, 388032.14080373,
       219639.78867542, 140552.66600226, 300896.17478449, 156156.42908576,
       130538.80935393, 162883.06910396, 200361.58214897, 166196.40137341,
       144637.00214451,  97616.68546016, 204261.91820635, 296076.38078917,
       263214.44427804, 154174.56048464, 188089.6239895 , 212928.66555624,
       137608.99475671, 139486.84520506, 235236.3651652 , 167014.27223975,
       190684.69108301, 120680.20357601, 149806.33762793, 238020.73631534,
       265881.24236559, 114889.94633025, 102998.15385703, 101865.128894  ,
       233131.98859527, 284847.63156268, 276590.51033853, 124390.71367309,
       126913.14271218, 317102.57849744, 313020.5935332 ,  78857.51791195,
       106837.25364384, 141572.34799647,  98174.65397543,  84346.29086312,
       332701.96881537, 1

In [10]:
print(r2_score(lin.predict(xTest), yTest))

0.6343147054107763
