# Linear Regression with Multiple Variables

### Problem Statement: Predict the price of a house from its other attributes
### Multiple Linear Regression: $h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \dots + \theta_n x_n$

## Import Libraries

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Import Dataset

In [8]:
# Read the house data CSV (path is relative to the notebook's working
# directory) and preview its first five rows.
dataset = pd.read_csv('Data/kc_house_data.csv')
print(dataset.head(n=5))

           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view  ...  grade  sqft_above  sqft_basement  \
0      5650     1.0           0     0  ...      7        1180              0   
1      7242     2.0           0     0  ...      7        2170            400   
2     10000     1.0           0     0  ...      6         770              0   
3      5000     1.0           0     0  ...      7        1050            910   
4      8080     1.0           0     0  ...      8        1680              0   

   yr_built  yr_renovated  zipcode      lat     lo

## Dataset Preprocessing

In [9]:
# List every column name so the ones to drop or use as features can be chosen.
dataset.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [10]:
# Remove the columns that are not used as model features.
# Reassigning (instead of inplace=True) together with errors='ignore' makes
# this cell safe to re-run: a second execution no longer raises KeyError for
# columns that have already been removed.
dropColumns = ['id', 'date', 'zipcode']
dataset = dataset.drop(columns=dropColumns, errors='ignore')
dataset.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503


In [11]:
# Report the dimensions and the summary statistics of the dataset.
# Only the last expression of a cell is displayed automatically, so the shape
# has to be printed explicitly; as a bare expression it was silently discarded.
print(dataset.shape)
dataset.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,47.560053,-122.213896,1986.552492,12768.455652
std,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,0.138564,0.140828,685.391304,27304.179631
min,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,47.1559,-122.519,399.0,651.0
25%,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,47.471,-122.328,1490.0,5100.0
50%,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,47.5718,-122.23,1840.0,7620.0
75%,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,47.678,-122.125,2360.0,10083.0
max,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,47.7776,-121.315,6210.0,871200.0


In [12]:
# Count the missing values in each column; all zeros means nothing to impute.

dataset.isna().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [13]:

# Feature matrix: every remaining column except the target `price`.
X = dataset.drop(columns='price')
print(X.head())
print(X.shape)

   bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  view  \
0         3       1.00         1180      5650     1.0           0     0   
1         3       2.25         2570      7242     2.0           0     0   
2         2       1.00          770     10000     1.0           0     0   
3         4       3.00         1960      5000     1.0           0     0   
4         3       2.00         1680      8080     1.0           0     0   

   condition  grade  sqft_above  sqft_basement  yr_built  yr_renovated  \
0          3      7        1180              0      1955             0   
1          3      7        2170            400      1951          1991   
2          3      6         770              0      1933             0   
3          5      7        1050            910      1965             0   
4          3      8        1680              0      1987             0   

       lat     long  sqft_living15  sqft_lot15  
0  47.5112 -122.257           1340        5650  
1  47.

In [14]:
# Target vector: the `price` column that the models will learn to predict.
y = dataset.loc[:, 'price']
print(y.head())
print(y.shape)

0    221900.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64
(21613,)


In [15]:
#X = dataset.iloc[:, dataset.columns!='price']
#X = dataset.iloc[:, [1,2]]
#print(X)

## Splitting Dataset - Training & Testing Dataset

In [16]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print(f"X_train size: {X_train.shape}")
print(f"X_test size: {X_test.shape}")

X_train size: (17290, 17)
X_test size: (4323, 17)


## Train Linear Regression on Training Dataset

In [17]:
# Fit a linear regression model on the training split.
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [18]:
# Show the parameters the linear model has learned:
#   intercept_ - the constant term of the fitted equation
#   coef_      - an array holding one weight per feature column
# Fitting the model amounts to finding the best values for these parameters.
print(
    " \nThe value of Coefficients a0 and a1:  \n",
    regressor.intercept_,
    regressor.coef_,
)

 
The value of Coefficients a0 and a1:  
 -38090652.77592235 [-3.31909409e+04  4.03002875e+04  1.11182027e+02  1.71960381e-01
  1.80909018e+03  6.09402810e+05  4.94058679e+04  3.07240636e+04
  9.47870908e+04  7.27869230e+01  3.83951039e+01 -2.45235695e+03
  2.16772868e+01  5.63493777e+05 -1.26112686e+05  2.97732884e+01
 -4.76002806e-01]


## Predict Testing Dataset Result

In [25]:
# Predict a price for every row of the held-out test features.
y_pred = regressor.predict(X=X_test)


## Calculating Cost

In [26]:
# Cost = sum of squared differences between predicted and actual test prices.
cost = np.square(y_pred - y_test).sum()
print("Cost (Caluculated Manually): ", cost)

Cost (Caluculated Manually):  159923204555336.88


In [27]:
# Print each actual test price next to the model's prediction and their
# difference (actual - predicted), tab-separated.
# The loop variables are deliberately not named `x`/`y`: cells share one
# kernel namespace, so a loop variable `y` would overwrite the target Series
# `y` with a single number and break any later re-run of the train/test split.
print("Actual_Value\tPredicted_value\tDifference")

for actual, predicted in zip(y_test, y_pred):
    print('{:0.2f}\t{:0.2f}\t{:0.2f}'.format(actual, predicted, actual - predicted))

Actual_Value	Predicted_value	Difference
297000.00	380896.08	-83896.08
1578000.00	1519127.40	58872.60
562100.00	528789.94	33310.06
631500.00	575165.33	56334.67
780000.00	1007193.13	-227193.13
485000.00	369014.20	115985.80
340000.00	406662.82	-66662.82
335606.00	410234.45	-74628.45
425000.00	576099.72	-151099.72
490000.00	1212129.10	-722129.10
732000.00	695828.25	36171.75
389700.00	362881.82	26818.18
450000.00	359355.96	90644.04
357000.00	311183.26	45816.74
960000.00	846562.21	113437.79
257000.00	406090.71	-149090.71
448000.00	333167.74	114832.26
610000.00	665761.51	-55761.51
230950.00	316120.12	-85170.12
377500.00	569083.73	-191583.73
375000.00	360563.39	14436.61
410000.00	392925.20	17074.80
459000.00	651386.53	-192386.53
190000.00	206751.00	-16751.00
585000.00	599966.25	-14966.25
280000.00	272955.10	7044.90
500000.00	467159.83	32840.17
465000.00	602629.76	-137629.76
802000.00	911289.02	-109289.02
440000.00	497851.62	-57851.62
1452000.00	1453647.31	-1647.31
575000.00	451460.29	123539.71

330000.00	480077.50	-150077.50
295000.00	132218.86	162781.14
690700.00	694119.31	-3419.31
469950.00	506356.33	-36406.33
565000.00	405675.74	159324.26
570000.00	661661.56	-91661.56
333700.00	292568.96	41131.04
361500.00	300675.86	60824.14
510000.00	386628.80	123371.20
850000.00	891299.97	-41299.97
485000.00	836424.86	-351424.86
579000.00	538957.10	40042.90
475580.00	457825.15	17754.85
359950.00	416941.91	-56991.91
289950.00	340860.71	-50910.71
620000.00	568900.22	51099.78
425000.00	385037.08	39962.92
439000.00	522573.86	-83573.86
440000.00	445747.41	-5747.41
725000.00	716442.56	8557.44
480000.00	774543.87	-294543.87
1050000.00	1046151.71	3848.29
454950.00	405653.70	49296.30
1800000.00	1292973.19	507026.81
404500.00	441439.14	-36939.14
365500.00	511609.61	-146109.61
2200000.00	1179903.18	1020096.82
185000.00	216260.75	-31260.75
540000.00	532824.25	7175.75
368000.00	428233.47	-60233.47
334000.00	407315.86	-73315.86
315000.00	398866.35	-83866.35
470000.00	473576.35	-3576.35
335000.00	18708

In [22]:
#df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten(), 'Difference':(y_test-y_pred).flatten() })
#df

In [28]:
# Error metrics of the linear model on the test split, followed by its score
# on the training and testing splits.
from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
# RMSE is the square root of the MSE computed above.
print('Root Mean Squared Error:', np.sqrt(mse))

# LinearRegression.score reports the coefficient of determination (R^2).
score_value_train = regressor.score(X_train, y_train)
score_value_test = regressor.score(X_test, y_test)

print(" \nScore Values")
print("Training Score: ", score_value_train)
print("Testing Score: ", score_value_test)


Mean Absolute Error: 122658.980055755
Mean Squared Error: 36993570334.33654
Root Mean Squared Error: 192337.1267705134
 
Score Values
Training Score:  0.696560271438684
Testing Score:  0.6889316995334356


# Trying Feature Scaling and then Applying Linear Regression

## Feature Scaling

In [30]:
## Feature Scaling
# The import must sit on its own line: it had been fused into the preceding
# comment, which left StandardScaler undefined on a fresh kernel (NameError).
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to the test split so no test-set statistics leak into training.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

# Preview the first five scaled training rows.
print(X_train_scaled[0:5,:])

[[-0.39003389 -1.44813729 -0.55395019 -0.23023661 -0.91794048 -0.09067554
  -0.30872938  0.9125681  -0.56106598 -0.86348039  0.46635412 -1.05018286
  -0.21397278  0.96485795 -0.74469432 -0.31691025 -0.23245796]
 [-0.39003389  0.49429638 -0.32648492 -0.10035437 -0.91794048 -0.09067554
  -0.30872938  0.9125681   0.28898929 -0.69518396  0.6241477   0.30645855
  -0.21397278 -0.02200988  0.46319487  0.46999159 -0.11288783]
 [-0.39003389 -0.8006594  -1.07387079 -0.12727713 -0.91794048 -0.09067554
  -0.30872938 -0.62557797  0.28898929 -0.83943804 -0.66074282  0.54387079
  -0.21397278  0.46998125  1.26608592 -0.40434378 -0.10705339]
 [ 0.67029408  0.49429638  0.13927729 -0.19311726  0.92508233 -0.09067554
  -0.30872938 -0.62557797  0.28898929  0.50693336 -0.66074282  0.74736701
  -0.21397278 -1.00887771  0.22161703 -0.02546512 -0.2101286 ]
 [-0.39003389  0.49429638 -0.30482156 -0.24790911  0.92508233 -0.09067554
  -0.30872938 -0.62557797 -0.56106598 -0.65912044  0.60160576  0.47603872
  -0.213

## Fitting/Training Linear Regressor on Scaled Values

In [31]:
# Fit a second linear model, this time on the scaled training features.
from sklearn.linear_model import LinearRegression

regressor_scaled = LinearRegression()
regressor_scaled.fit(X=X_train_scaled, y=y_train)

LinearRegression()

In [35]:
# Compare the scores of the model trained on raw features with the one
# trained on scaled features, each on its training and testing split.
print(f'Training Data Score (non scaled values): {regressor.score(X_train, y_train):.2f}')
print(f'Testing Data Score (non scaled values): {regressor.score(X_test, y_test):.2f}')

print(f'Training Data Score (scaled values): {regressor_scaled.score(X_train_scaled, y_train):.2f}')
print(f'Testing Data Score (scaled values): {regressor_scaled.score(X_test_scaled, y_test):.2f}')

Training Data Score (non scaled values): 0.70
Testing Data Score (non scaled values): 0.69
Training Data Score (scaled values): 0.70
Testing Data Score (scaled values): 0.69


# Trying Random Forest Regression

In [37]:
# Fit a random forest on the scaled features for comparison with the linear
# models. random_state is fixed (matching the seed used for the train/test
# split) so the randomised tree construction, and therefore the reported
# scores, are reproducible from run to run.
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train_scaled, y_train)
print('Training Data Score: {:.2f}'.format(reg.score(X_train_scaled, y_train)))
print('Testing Data Score: {:.2f}'.format(reg.score(X_test_scaled, y_test)))

Training Data Score: 0.98
Testing Data Score: 0.89
