In [1]:
import pandas as pd
import numpy as np

In [2]:
data_preprocessed = pd.read_csv('kc_house_data_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,year,month,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15,price
0,2014,10,3,1.0,1180,5650,1.0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650,221900.0
1,2014,12,3,2.25,2570,7242,2.0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639,538000.0
2,2015,2,2,1.0,770,10000,1.0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062,180000.0
3,2014,12,4,3.0,1960,5000,1.0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000,604000.0
4,2015,2,3,2.0,1680,8080,1.0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503,510000.0


### Creating tagets

In [4]:
data_preprocessed['price'].median()

450000.0

In [5]:
targets = np.where(data_preprocessed['price'] >
                   data_preprocessed['price'].median(), 1, 0)

In [6]:
targets

array([0, 1, 0, ..., 0, 0, 0])

In [7]:
data_preprocessed['above_average_price'] = targets

In [8]:
data_preprocessed.head()

Unnamed: 0,year,month,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15,price,above_average_price
0,2014,10,3,1.0,1180,5650,1.0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650,221900.0,0
1,2014,12,3,2.25,2570,7242,2.0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639,538000.0,1
2,2015,2,2,1.0,770,10000,1.0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062,180000.0,0
3,2014,12,4,3.0,1960,5000,1.0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000,604000.0,1
4,2015,2,3,2.0,1680,8080,1.0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503,510000.0,1


### Checking to see if targets is balanced

In [9]:
targets.sum() / targets.shape[0]

0.4973395641512053

In [10]:
data_with_targets = data_preprocessed.drop(['price'], axis=1)

In [11]:
data_with_targets.head()

Unnamed: 0,year,month,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15,above_average_price
0,2014,10,3,1.0,1180,5650,1.0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650,0
1,2014,12,3,2.25,2570,7242,2.0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639,1
2,2015,2,2,1.0,770,10000,1.0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062,0
3,2014,12,4,3.0,1960,5000,1.0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000,1
4,2015,2,3,2.0,1680,8080,1.0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503,1


### Seperating targets from rest of data

In [12]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

In [13]:
unscaled_inputs.head()

Unnamed: 0,year,month,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,2014,10,3,1.0,1180,5650,1.0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650
1,2014,12,3,2.25,2570,7242,2.0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639
2,2015,2,2,1.0,770,10000,1.0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062
3,2014,12,4,3.0,1960,5000,1.0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000
4,2015,2,3,2.0,1680,8080,1.0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503


### Standardizing the unscaled inputs 

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [15]:
scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
scaled_inputs = scaler.transform(unscaled_inputs)

In [17]:
scaled_inputs

array([[-0.69065478,  1.09962055, -0.39873715, ..., -0.30607896,
        -0.9433552 , -0.26071541],
       [-0.69065478,  1.74162654, -0.39873715, ..., -0.74634143,
        -0.43268619, -0.18786773],
       [ 1.44790136, -1.46840343, -1.47395936, ..., -0.13565477,
         1.07013975, -0.17237524],
       ...,
       [-0.69065478, -0.18439144, -1.47395936, ..., -0.60432128,
        -1.41025258, -0.39414129],
       [ 1.44790136, -1.78940643, -0.39873715, ...,  1.02891048,
        -0.8412214 , -0.42051149],
       [-0.69065478,  1.09962055, -1.47395936, ..., -0.60432128,
        -1.41025258, -0.41794772]])

In [18]:
scaled_inputs.shape

(21613, 18)

### Splitting the data

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 18)

In [21]:
print(x_train.shape, y_train.shape)

(17290, 18) (17290,)


In [22]:
print(x_test.shape, y_test.shape)

(4323, 18) (4323,)


### Logistic regression with sklearn 

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [24]:
reg = LogisticRegression()

In [25]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
reg.score(x_train, y_train)

0.843956043956044

### Creating summary table with intercept and coefficients

In [27]:
feature_name = unscaled_inputs.columns.values

In [28]:
summary_table = pd.DataFrame(columns=['Feature Name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)

In [29]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,0.24713
1,year,0.14951
2,month,-0.020326
3,bedrooms,-0.184116
4,bathrooms,0.404407
5,sqft_living,0.468149
6,sqft_lot,0.207354
7,floors,0.391623
8,waterfront,0.301724
9,condition,0.2098


### Interpreting the coefficients

In [30]:
summary_table['Odds_Ratio'] = np.exp(summary_table.Coefficient)

In [31]:
summary_table.sort_values('Odds_Ratio', ascending=False)
# A feature is not particularly important if its coefficient is around 0 and its odds ratio around 1

Unnamed: 0,Feature Name,Coefficient,Odds_Ratio
10,grade,1.51264,4.538696
15,lat,1.41262,4.106699
17,sqft_living15,0.74569,2.107896
5,sqft_living,0.468149,1.597035
4,bathrooms,0.404407,1.498413
7,floors,0.391623,1.47938
12,sqft_basement,0.357337,1.429518
11,sqft_above,0.328247,1.388532
8,waterfront,0.301724,1.352188
0,Intercept,0.24713,1.280345


### Testing the model

In [33]:
reg.score(x_test, y_test)

0.8371501272264631

### Visualizing the test results

In [34]:
# Finding the probability that the result will be a 1
predicted_proba = reg.predict_proba(x_test)
predicted_proba[:,1]

array([1.84858839e-01, 2.95040023e-01, 9.99903070e-01, ...,
       4.10319879e-01, 2.89482336e-04, 9.21359354e-01])

In [35]:
# Finding if the answer was correct using boolean
model_outputs = reg.predict(x_test)
model_outputs == y_test

array([False, False,  True, ...,  True,  True,  True])

In [36]:
# Manually checking the results
np.sum((model_outputs==y_test)) / model_outputs.shape[0]

0.8371501272264631

In [37]:
# Correct answers
np.sum((model_outputs==y_test))

3619

In [38]:
# Total answers
model_outputs.shape[0]

4323