In [13]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from random import randrange

housing = pd.read_csv("train.csv")
housing.fillna(value="NA", inplace=True)

housing["LotFrontage"].replace(to_replace="NA", value=0, inplace=True)
housing["Reduced SalePrice"] = housing["SalePrice"] // 50000

## Features, Target, and Utilities

In [2]:
features = ["YearBuilt","OverallQual","MSSubClass"]
target = "Reduced SalePrice"

accuracy = lambda x : (x - housing[target]).value_counts()[0] / 1460
topct = lambda x : round(x * 100, 2)

## Baseline

In [3]:
baseline = (housing[target].value_counts()[housing[target].mode()[0]]) / 1460
print(f"Baseline: {baseline}")

Baseline: 0.34315068493150686


# Decision Tree

In [40]:
# Create our sample set.
decision_tree_sample = housing.sample(randrange(0, 1460))

# Create our features and our target.
dtree_features = decision_tree_sample[features]
dtree_target = decision_tree_sample[target]

# Create the decision tree and predict.
dtree = DecisionTreeClassifier()
dtree.fit(dtree_features, dtree_target)
pred1 = dtree.predict(housing[features])

# Score accuracy and print.
acc = accuracy(pred1)
pct = topct(acc)
print(f"Accuracy: {pct}%")

# K-fold cross validation.
kfold = KFold(n_splits=50, random_state=45)
results = cross_val_score(dtree, housing[features], housing[target], cv=kfold)
results = topct(results.mean())
print(f"K-fold cross validation average: {results}%")

Accuracy: 52.88%
K-fold cross validation average: 49.72%


# SVM

In [37]:
svm_sample = housing.sample(randrange(0, 1460))

svm_features = svm_sample[features]
svm_target = svm_sample[target]

svc = SVC()
svc.fit(svm_features, svm_target)
pred2 = svc.predict(housing[features])

acc = accuracy(pred2)
pct = topct(acc)
print(f"Accuracy: {pct}%")

# K-fold cross validation.
kfold = KFold(n_splits=50, random_state=12)
results = cross_val_score(svc, housing[features], housing[target], cv=kfold)
results = topct(results.mean())
print(f"K-fold cross validation average: {results}%")

Accuracy: 61.37%
K-fold cross validation average: 50.49%


# Logistic regression 

In [23]:
logistic_regression_sample = housing.sample(randrange(0,1460))

logistic_regression_features = logistic_regression_sample[features]
logistic_regression_target = logistic_regression_sample[target]

model = LogisticRegression()
model.fit(logistic_regression_features,logistic_regression_target)
pred3 = model.predict(housing[features])

acc = accuracy(pred3)
pct = topct(acc)
print(f"Accuracy: {pct}%")

loo = LeaveOneOut()
#Give this a moment.
results = cross_val_score(model, housing[features], housing[target], cv = loo) 
results = topct(results.mean())
print(f"Leave P Out cross validation average: {results}%")

Accuracy: 44.73%
Leave P Out cross validation average: 42.67%


In [41]:
models_predict = pd.DataFrame({'pred1':pred1, 'pred2':pred2, 'pred3':pred3})
pred4 = (models_predict.mode(axis=1)[0]).values
acc = accuracy(pred4)
pct = topct(acc)
print(f"Bagging Accuracy: {pct}%")

Bagging Accuracy: 55.21%
