In [1]:
from pandas.io import sql
from sqlalchemy import create_engine, inspect

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the listings CSV and drop its first column
# (presumably a saved row index -- TODO confirm against the file).
data = pd.read_csv("gather_data.csv")

# Take an explicit copy so the column assignment below modifies an independent
# frame instead of a slice of `data` (avoids chained-assignment problems).
df = data.iloc[:, 1:].copy()

# Strip surrounding whitespace from city names with the vectorised string
# accessor (missing values are passed through instead of raising), then store
# the column as a categorical.
df['City'] = df['City'].str.strip().astype('category')

In [4]:
# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly; otherwise its result is silently discarded.
display(df.head())
df.columns

Index(['MLSNumber', 'Street', 'City', 'Price', 'BR', 'Bath', 'Footage',
       'PricePerSqft', 'Zipcode'],
      dtype='object')

## Classifiers

In [5]:
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import warnings
# NOTE: this installs a blanket "ignore" filter, so every warning raised by the
# cells below (whatever library emits it) is hidden. Remove or narrow the filter
# when debugging unexpected results.
warnings.filterwarnings('ignore')

In [None]:
def fitClassifiers(df, x_list, y, classifiers, classifierNames):
    """Fit every classifier on every feature subset and print the best scores.

    For each entry of ``x_list`` the corresponding columns of ``df`` are split
    80/20 into train and test parts (``random_state=42``). Every classifier is
    fitted on the training part and scored with ``classifier.score`` on the
    held-out part. For each subset the feature list is printed, followed by the
    best ``[features, name, score]`` triple; the best triple over all subsets
    is printed last.

    Parameters
    ----------
    df : DataFrame
        Frame containing the feature columns named in ``x_list``.
    x_list : iterable of iterables of str
        Feature subsets to evaluate; each entry is a collection of column names.
    y : array-like
        Target labels, one per row of ``df``. A 2-D target with a single column
        (e.g. ``df[['City']]``) is flattened to 1-D before fitting.
    classifiers : sequence
        Estimators exposing ``fit(X, y)`` and ``score(X, y)``. They are fitted
        in place.
    classifierNames : sequence of str
        Display names, one per classifier, in the same order.

    Raises
    ------
    ValueError
        If ``x_list`` or ``classifiers`` is empty, or if ``classifiers`` and
        ``classifierNames`` have different lengths.
    """
    x_list = list(x_list)
    classifiers = list(classifiers)
    classifierNames = list(classifierNames)

    if not x_list:
        raise ValueError("x_list must contain at least one feature subset")
    if not classifiers:
        raise ValueError("classifiers must contain at least one estimator")
    if len(classifiers) != len(classifierNames):
        # zip() would silently drop the unmatched tail; fail loudly instead.
        raise ValueError("classifiers and classifierNames must have the same length")

    # Estimators expect a 1-D target; a one-column frame is a column vector.
    if np.ndim(y) == 2 and np.shape(y)[1] == 1:
        y = np.ravel(y)

    overall_max = [list(x_list[0]), classifierNames[0], 0]
    for x in x_list:
        features = list(x)
        X = df[features]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
        print(features)

        # Best [features, name, score] for this subset; ties keep the earlier classifier.
        local_max = [features, classifierNames[0], 0]
        for name, classifier in zip(classifierNames, classifiers):
            classifier.fit(X_train, y_train)
            score = classifier.score(X_test, y_test)
            if score > local_max[2]:
                local_max = [features, name, score]
        print(local_max)

        if local_max[2] > overall_max[2]:
            overall_max = local_max
    print("Best is: " , overall_max)
    

## Predicting City From Other Features

In [None]:
# Target: the city label as a 1-D Series (estimators expect a 1-D target).
y = df['City']
x_vars = ['Price', 'BR', 'Bath', 'Footage', 'PricePerSqft']

# Every non-empty subset of the candidate features (power set minus the empty set).
import itertools
x_list = list(itertools.chain.from_iterable(
    itertools.combinations(x_vars, n) for n in range(1, len(x_vars) + 1)))

# Several hyper-parameters below are tied to the number of distinct cities.
n_cities = len(df['City'].unique())

# Fixed seed for the stochastic estimators so repeated runs give the same
# scores (same value as the train/test split inside fitClassifiers).
RANDOM_STATE = 42

cityClassifiers = [
    KNeighborsClassifier(n_cities),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_features='log2', max_depth=n_cities, random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=n_cities, max_depth=10, random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB()
]

cityClassifiersName = [
    "KNN",
    "SVM",
    "Decision Tree",
    "Random Forest",
    "Multi-layer Perception",
    "Ada Boost",
    "Gaussian"
]

fitClassifiers(df, x_list, y, cityClassifiers, cityClassifiersName)

['Price']
[['Price'], 'KNN', 0.22362869198312235]
['BR']
[['BR'], 'Multi-layer Perception', 0.18565400843881857]
['Bath']
[['Bath'], 'SVM', 0.18143459915611815]
['Footage']
[['Footage'], 'SVM', 0.2109704641350211]
['PricePerSqft']
[['PricePerSqft'], 'KNN', 0.24472573839662448]
['Price', 'BR']
[['Price', 'BR'], 'Random Forest', 0.24050632911392406]
['Price', 'Bath']
[['Price', 'Bath'], 'KNN', 0.21940928270042195]
['Price', 'Footage']
[['Price', 'Footage'], 'Random Forest', 0.29535864978902954]
['Price', 'PricePerSqft']
[['Price', 'PricePerSqft'], 'Random Forest', 0.23628691983122363]
['BR', 'Bath']
[['BR', 'Bath'], 'SVM', 0.189873417721519]
['BR', 'Footage']
[['BR', 'Footage'], 'SVM', 0.189873417721519]
['BR', 'PricePerSqft']
[['BR', 'PricePerSqft'], 'KNN', 0.24472573839662448]
['Bath', 'Footage']
[['Bath', 'Footage'], 'SVM', 0.19831223628691982]
['Bath', 'PricePerSqft']
[['Bath', 'PricePerSqft'], 'KNN', 0.24472573839662448]
['Footage', 'PricePerSqft']
[['Footage', 'PricePerSqft'], 'Ran

## Predicting Price Segment Based on Other Features

In [None]:
# Price column, summarised in the next cell.
price = df['Price']

In [None]:
# Summary statistics of the price column; shown because it is the cell's last expression.
price.describe()

In [None]:
# Histogram of prices in 10 bins, drawn through the explicit figure/axes
# interface and labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df['Price'], bins=10)
ax.set(title='Price distribution', xlabel='Price', ylabel='Count')
plt.show()

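### Splitting into 5 groups
    1. 0 - 199,999
    2. 200,000 - 399,999
    3. 400,000 - 599,999
    4. 600,000 - 799,999
    5. 800,000 and Above

Note: the cell below actually calls `getPriceCategory(10)`, which produces ten 100K-wide bins plus a "1 Mil and above" bin rather than the five groups listed here.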

In [None]:
def getPriceCategory(numBins, prices=None):
    """Bucket prices into ``numBins`` equal-width bins plus one open-ended top bin.

    The range from 0 up to 1,000,000 is divided into ``numBins`` bins of width
    ``1000000 // numBins``; one further bin reaches from the last cut-off up to
    1,000,000,000 and is labelled ``"1 Mil and above"``. Bins are closed on the
    right (``pd.cut`` default), so a price equal to a cut-off falls into the
    lower bin. Prices that are <= 0 or above 1,000,000,000 fall outside every
    bin and come back as NaN.

    Each label names the upper edge of its bin, e.g. ``"200K and under"``.
    Upper edges that are not a whole number of thousands are written out in
    full (``"333,333 and under"``). For ``numBins=10`` the labels are
    ``"100K and under"`` ... ``"1000K and under"``.

    Parameters
    ----------
    numBins : int
        Number of equal-width bins below the open-ended top bin
        (between 1 and 1,000,000).
    prices : Series or array-like, optional
        Values to bin. Defaults to the ``Price`` column of the notebook-level
        ``df``.

    Returns
    -------
    The categorical result of ``pd.cut`` over ``prices``.

    Raises
    ------
    ValueError
        If ``numBins`` is outside ``1..1000000``.
    """
    if not 1 <= numBins <= 1000000:
        raise ValueError("numBins must be between 1 and 1000000")
    if prices is None:
        prices = df['Price']

    # Build exactly numBins + 1 edges so edges and labels always line up.
    step = 1000000 // numBins
    binCutOffs = [step * i for i in range(numBins + 1)]

    # Derive each label from the bin's real upper edge rather than assuming
    # 100K-wide bins.
    labs = [
        ('%dK and under' % (upper // 1000)) if upper % 1000 == 0
        else ('%s and under' % format(upper, ','))
        for upper in binCutOffs[1:]
    ]

    # Open-ended top bin.
    labs.append("1 Mil and above")
    binCutOffs.append(1000000000)

    return pd.cut(prices,
                  bins=binCutOffs,
                  labels=labs)

In [None]:
# Bin the prices with numBins=10; used as the classification target further down.
price_cat = getPriceCategory(10)

In [None]:
# One indicator column per city (column names prefixed with "City") so the
# categorical city can be fed to the classifiers as numeric input.
dummies = pd.get_dummies(df['City'], prefix="City")

In [None]:
# Numeric features for the price-segment model. Price and Footage are left out,
# presumably because the target is derived from Price and Footage together with
# PricePerSqft would reveal it -- TODO confirm the intent.
cityDf = df[['BR', 'Bath', 'PricePerSqft']]

In [None]:
# Build the feature frame straight from `df` rather than from `cityDf` itself,
# so re-running this cell cannot append the city indicator columns twice.
cityDf = pd.concat([df[['BR', 'Bath', 'PricePerSqft']], dummies], axis = 1)

In [None]:
# Target: the binned price category. A single feature set is evaluated: every
# column of cityDf (numeric features plus the city indicator columns).
y = price_cat
x_list = [list(cityDf.columns)]
import itertools

# Fixed seed for the stochastic estimators so repeated runs give the same
# scores (same value as the train/test split inside fitClassifiers).
RANDOM_STATE = 42

cityClassifiers = [
    KNeighborsClassifier(len(df['City'].unique())),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_features='log2', max_depth=10, random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=10, max_depth=10, random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB()
]

cityClassifiersName = [
    "KNN",
    "SVM",
    "Decision Tree",
    "Random Forest",
    "Multi-layer Perception",
    "Ada Boost",
    "Gaussian"
]

fitClassifiers(cityDf, x_list, y, cityClassifiers, cityClassifiersName)

## Predict Zipcode

In [None]:
# Target: the zipcode as a 1-D Series (estimators expect a 1-D target).
y = df['Zipcode']
x_vars = ['BR', 'Bath', 'Footage', 'Price', 'PricePerSqft']

# Every non-empty subset of the candidate features.
import itertools
x_list = list(itertools.chain.from_iterable(
    itertools.combinations(x_vars, n) for n in range(1, len(x_vars) + 1)))

# Fixed seed for the stochastic estimators so repeated runs give the same
# scores (same value as the train/test split inside fitClassifiers).
RANDOM_STATE = 42

cityClassifiers = [
    KNeighborsClassifier(len(df['Zipcode'].unique())),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_features='log2', max_depth=10, random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=10, max_depth=10, random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB()
]

cityClassifiersName = [
    "KNN",
    "SVM",
    "Decision Tree",
    "Random Forest",
    "Multi-layer Perception",
    "Ada Boost",
    "Gaussian"
]

fitClassifiers(df, x_list, y, cityClassifiers, cityClassifiersName)

In [None]:
# have a list of classifiers we will use
# write model functions that will return score of model and the error, and computation time
# Logistic Regression - multiclass
# kNN
# AdaBoostClassifier
# Random Forest
# DecisionTreeClassifier
# SVM
# ensemble
    # Voting
    # Bagging
    # Boosting

# Voting

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Target: the zipcode as a 1-D Series (estimators expect a 1-D target).
y = df['Zipcode']
x_vars = ['BR', 'Bath', 'Footage', 'Price', 'PricePerSqft']

# Every non-empty subset of the candidate features.
import itertools
x_list = list(itertools.chain.from_iterable(
    itertools.combinations(x_vars, n) for n in range(1, len(x_vars) + 1)))

# Fixed seed for the stochastic estimators so repeated runs give the same
# scores (same value as the train/test split inside fitClassifiers).
RANDOM_STATE = 42

cityClassifiers = [
    KNeighborsClassifier(len(df['Zipcode'].unique())),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_features='log2', max_depth=10, random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=10, max_depth=10, random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB()
]

cityClassifiersName = [
    "KNN",
    "SVM",
    "Decision Tree",
    "Random Forest",
    "Multi-layer Perception",
    "Ada Boost",
    "Gaussian"
]

# Hard voting: the ensemble predicts the class chosen by the majority of the
# individual estimators.
voter = VotingClassifier(list(zip(cityClassifiersName, cityClassifiers)), voting="hard")

In [None]:
# Evaluate the voting ensemble on every feature subset, through the same
# helper used for the individual classifiers.
fitClassifiers(df, x_list, y, [voter], ["voter"])

From this we see that the voting ensemble does not outperform the best individual classifier.