In [11]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split

# Load the training data. The path is relative, so train.csv must sit in the
# notebook's working directory.
housing = pd.read_csv("train.csv")

In [3]:
# Fill LotFrontage with 0 *before* the blanket "NA" fill so the column stays
# numeric. Assigning the result back (instead of calling an in-place method on
# the column selection) guarantees the change lands in `housing`; an in-place
# call on `housing["LotFrontage"]` is a chained operation that does not update
# the parent frame when pandas Copy-on-Write is active.
housing["LotFrontage"] = housing["LotFrontage"].fillna(0)

# Every remaining missing value becomes the literal string "NA".
housing = housing.fillna(value="NA")

In [4]:
def train_model_splits_and_print(func):
    """Run ``func`` over a range of train fractions and print one report per run.

    ``func`` is called as ``func(train_size=<fraction>)`` for each value of
    ``np.arange(0.5, 1, 0.1)`` and must return a number; that number is
    multiplied by 100 and shown on the "Accuracy" line of the report.

    Parameters
    ----------
    func : callable
        Training routine accepting a ``train_size`` keyword argument. Its
        ``__name__`` is printed as a header before the reports.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f"Function = {func.__name__}")

    separator = "----------------------------"
    train_fractions = np.arange(0.5, 1, 0.1)

    for split_number, train_fraction in enumerate(train_fractions, start=1):
        score = func(train_size=train_fraction)
        report_lines = [
            separator,
            f"Split {split_number}",
            f"Train Split: {round(train_fraction, 2)}",
            f"Test Split:  {round(1 - train_fraction, 2)}",
            f"Accuracy:    {round(score * 100, 2)}%",
            separator,
            "",  # trailing newline inside the report, before print() adds its own
        ]
        print("\n".join(report_lines))

# Logistic Regression

In [5]:
# Predictor columns for the logistic-regression experiment.
LOGREG_FEATURE_COLUMNS = ["SalePrice", "1stFlrSF"]
# Kept so any code that still reads the module-level `feature` keeps working.
feature = housing[LOGREG_FEATURE_COLUMNS]


def train_logistic_regression(train_size=0.5, random_state=None):
    """Fit a logistic regression on ``OverallQual`` and score it as a binary task.

    The model is trained on the raw ``OverallQual`` values. Predictions and
    held-out targets are then both reduced to "greater than 5" booleans, and
    the accuracy of that binary comparison is returned.

    Parameters
    ----------
    train_size : float, default 0.5
        Fraction of rows used for training; the rest is held out.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    float
        Accuracy on the held-out rows after thresholding at 5.
    """
    # Select the predictors here rather than reading the module-level
    # `feature`, which other cells rebind to different columns; otherwise the
    # result would depend on which cell ran last.
    features = housing[LOGREG_FEATURE_COLUMNS]

    # Pass the train fraction directly instead of its floating-point complement.
    train_features, valid_features, train_target, valid_target = train_test_split(
        features,
        housing["OverallQual"],
        train_size=train_size,
        random_state=random_state,
    )

    model = LogisticRegression()
    model.fit(X=train_features, y=train_target)
    predictions = model.predict(X=valid_features)

    # Compare on the binarised scale: ratings above 5 versus the rest.
    return accuracy_score(valid_target > 5, predictions > 5)

In [6]:
train_model_splits_and_print(train_logistic_regression)

Function = train_logistic_regression
----------------------------
Split 1
Train Split: 0.5
Test Split:  0.5
Accuracy:    71.37%
----------------------------

----------------------------
Split 2
Train Split: 0.6
Test Split:  0.4
Accuracy:    72.95%
----------------------------

----------------------------
Split 3
Train Split: 0.7
Test Split:  0.3
Accuracy:    67.88%
----------------------------

----------------------------
Split 4
Train Split: 0.8
Test Split:  0.2
Accuracy:    70.99%
----------------------------

----------------------------
Split 5
Train Split: 0.9
Test Split:  0.1
Accuracy:    68.71%
----------------------------



# SVM

In [7]:
# Predictor columns for the SVM experiment.
SVM_FEATURE_COLUMNS = ["MSSubClass"]
# Kept so any code that still reads the module-level `feature` keeps working.
feature = housing[SVM_FEATURE_COLUMNS]


def train_svm(train_size=0.5, random_state=None):
    """Fit an SVC predicting ``HouseStyle`` and return its lift over a baseline.

    ``HouseStyle`` is converted to integer category codes and used as the
    target. The baseline is the share of the most frequent code in the whole
    dataset.

    Parameters
    ----------
    train_size : float, default 0.5
        Fraction of rows used for training; the rest is held out.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    float
        ``(accuracy - baseline) / baseline``, i.e. the relative improvement
        over the baseline. This is NOT a raw accuracy and may exceed 1 or be
        negative.
        NOTE(review): ``train_model_splits_and_print`` prints this value under
        an "Accuracy" label; confirm which metric is intended.
    """
    # Select the predictors here rather than reading the module-level
    # `feature`, which other cells rebind to different columns.
    features = housing[SVM_FEATURE_COLUMNS]
    hstyle = housing["HouseStyle"].astype("category").cat.codes

    # Pass the train fraction directly instead of its floating-point complement.
    train_features, valid_features, train_target, valid_target = train_test_split(
        features,
        hstyle,
        train_size=train_size,
        random_state=random_state,
    )

    svc = SVC()
    svc.fit(train_features, train_target)
    pred = svc.predict(valid_features)

    # Share of the most frequent class across the full dataset.
    baseline = hstyle.value_counts(normalize=True).max()
    acc = accuracy_score(valid_target, pred)
    return (acc - baseline) / baseline

In [8]:
train_model_splits_and_print(train_svm)

Function = train_svm
----------------------------
Split 1
Train Split: 0.5
Test Split:  0.5
Accuracy:    91.46%
----------------------------

----------------------------
Split 2
Train Split: 0.6
Test Split:  0.4
Accuracy:    90.43%
----------------------------

----------------------------
Split 3
Train Split: 0.7
Test Split:  0.3
Accuracy:    92.4%
----------------------------

----------------------------
Split 4
Train Split: 0.8
Test Split:  0.2
Accuracy:    92.87%
----------------------------

----------------------------
Split 5
Train Split: 0.9
Test Split:  0.1
Accuracy:    94.26%
----------------------------



# Decision Tree

In [9]:
# Predictor columns for the decision-tree experiment.
TREE_FEATURE_COLUMNS = ["YearBuilt", "SalePrice", "MSSubClass"]
# Kept so any code that still reads the module-level `feature` keeps working.
feature = housing[TREE_FEATURE_COLUMNS]


def train_decision_tree(train_size=0.5, random_state=None):
    """Fit a decision tree predicting ``Neighborhood`` and return its lift over a baseline.

    The tree is limited to 15 leaf nodes. The baseline is the share of the
    most frequent ``Neighborhood`` value in the whole dataset.

    Parameters
    ----------
    train_size : float, default 0.5
        Fraction of rows used for training; the rest is held out.
    random_state : int or None, default None
        Seed forwarded to both ``train_test_split`` and the
        ``DecisionTreeClassifier``. ``None`` keeps the original unseeded
        behaviour; pass an int for reproducible results.

    Returns
    -------
    float
        ``(accuracy - baseline) / baseline``, i.e. the relative improvement
        over the baseline. This is NOT a raw accuracy and may exceed 1 or be
        negative.
        NOTE(review): ``train_model_splits_and_print`` prints this value under
        an "Accuracy" label; confirm which metric is intended.
    """
    # Select the predictors here rather than reading the module-level
    # `feature`, which other cells rebind to different columns.
    features = housing[TREE_FEATURE_COLUMNS]
    neighborhood = housing["Neighborhood"]

    # Pass the train fraction directly instead of its floating-point complement.
    train_features, valid_features, train_target, valid_target = train_test_split(
        features,
        neighborhood,
        train_size=train_size,
        random_state=random_state,
    )

    dTree = DecisionTreeClassifier(max_leaf_nodes=15, random_state=random_state)
    dTree.fit(train_features, train_target)
    pred = dTree.predict(valid_features)

    # Share of the most frequent class, computed from the data. This replaces
    # a hard-coded count of 225, which presumably was the majority-class count
    # for this particular file (verify against the dataset).
    baseline = neighborhood.value_counts(normalize=True).max()
    acc = accuracy_score(y_true=valid_target, y_pred=pred)

    return (acc - baseline) / baseline

In [10]:
train_model_splits_and_print(train_decision_tree)

Function = train_decision_tree
----------------------------
Split 1
Train Split: 0.5
Test Split:  0.5
Accuracy:    140.89%
----------------------------

----------------------------
Split 2
Train Split: 0.6
Test Split:  0.4
Accuracy:    176.67%
----------------------------

----------------------------
Split 3
Train Split: 0.7
Test Split:  0.3
Accuracy:    164.58%
----------------------------

----------------------------
Split 4
Train Split: 0.8
Test Split:  0.2
Accuracy:    150.25%
----------------------------

----------------------------
Split 5
Train Split: 0.9
Test Split:  0.1
Accuracy:    213.41%
----------------------------

