In [44]:
# Supress warnings:
import warnings
warnings.filterwarnings("ignore")

In [45]:
#loading data
import pandas as pd
from pathlib import Path

file_path = Path("houseData/kc_house_data.csv")
house = pd.read_csv(file_path)

house.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [46]:

#step 6: dropping 15 columns
columns_to_drop = ['id', 'date', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',                   'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']

house.drop(columns=columns_to_drop, axis=1, inplace=True)
house.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors
0,221900.0,3,1.0,1180,5650,1.0
1,538000.0,3,2.25,2570,7242,2.0
2,180000.0,2,1.0,770,10000,1.0
3,604000.0,4,3.0,1960,5000,1.0
4,510000.0,3,2.0,1680,8080,1.0


In [47]:
#replacing categories with numerical labels for algorithms
from sklearn.preprocessing import LabelEncoder

# categorical_cols = house.select_dtypes(include='object').columns.tolist()
# le = LabelEncoder()
# for col in categorical_cols:
#     house[col] = le.fit_transform(house[col])

le = LabelEncoder()
house = house.apply(le.fit_transform)


#step 7: checking for null values
house.isnull().sum()

price          0
bedrooms       0
bathrooms      0
sqft_living    0
sqft_lot       0
floors         0
dtype: int64

In [54]:
#import NumPy library
import numpy as np

#importing LR algorithm
from sklearn.linear_model import LogisticRegression

#data partition by PRICE (Dependent variable)
X = house.drop(['price'], axis=1)
Y = house['price']

# Taking 80% of House data as training set, and remaining 20% as test set.
X_train = np.array(X[0:int(0.80*len(X))])
Y_train = np.array(Y[0:int(0.80*len(Y))])
X_test = np.array(X[int(0.80*len(X)):])
Y_test = np.array(Y[int(0.80*len(Y)):])
len(X_train), len(Y_train), len(X_test), len(Y_test)

(17290, 17290, 4323, 4323)

In [57]:
#measuring initial execution time
import time
start_time = time.time()

#step 8
#initializing variable
LR = LogisticRegression()

#training data
LR_fit = LR.fit(X_train, Y_train)

#Predict
LR_pred = LR_fit.predict(X_test)

#printing results
from sklearn.metrics import accuracy_score
print("Logistic Regression is %f percent accurate" % (accuracy_score(LR_pred, Y_test)*100))

#measuring final execution time
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_minutes = elapsed_time / 60

print("Elapsed time: %.2f minutes" % elapsed_minutes)

Logistic Regression is 0.370113 percent accurate
Elapsed time: 7.73 minutes


In [58]:
#measuring initial execution time
import time
start_time = time.time()

#step 9
#cross validation

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

seed=0
cv = KFold(n_splits=5,random_state=2, shuffle=True)


#return_score function
def return_score(model,X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score
scores = []

#LR model validation
model = LogisticRegression()
for train_index, test_index in cv.split(X,Y):
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
    score = return_score(model,X_train, X_test, y_train, y_test)
    scores.append(score)
print("Accuracy score in each iteration: {}".format(scores))
print("K-Fold Score: {}".format(np.mean(scores)))

#measuring final execution time
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_minutes = elapsed_time / 60

print("Elapsed time: %.2f minutes" % elapsed_minutes)

Accuracy score in each iteration: [0.005783021050196623, 0.006245662734212352, 0.0071709461022438125, 0.005089058524173028, 0.0037011334721258385]
K-Fold Score: 0.0055979643765903305
Elapsed time: 34.84 minutes
