# Build basic supervised ML pipelines to test performance on clusters

In [7]:
# import libraries

import os
import pandas as pd
import numpy as np
from scipy import stats
import math
from sklearn import metrics # for calculating Silhouette score

In [8]:

# make sure to navigate to project folder
# NOTE(review): hard-coded absolute path -- this cell only runs as-is on the
# original author's machine; point it at your own checkout of the project.
os.chdir('/Users/trevor.mattos/Desktop/nycdsa/finalproject/haystacks.ai_unsupervised_ml')
# import functions from scripts
# NOTE(review): wildcard import; df_cleaner (used when the data is loaded) is
# presumably defined in this module -- TODO confirm and import it by name.
from so_fresh_so_clean import *

In [9]:
# move into the data folder, then read GA_LISTINGS_SALES.csv and pass it
# straight through the project's cleaning helper
os.chdir('/Users/trevor.mattos/Desktop/nycdsa/finalproject/haystacks.ai_unsupervised_ml/data')
df = df_cleaner(pd.read_csv('GA_LISTINGS_SALES.csv'))


In [10]:
# names of the columns whose dtype is float64 or int64
numers = [col for col in df.columns if df[col].dtype in ('float64', 'int64')]

In [11]:
# restrict the dataframe to the numerical columns for the time being
df = df.loc[:, numers]

## Set up the ML pipeline


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# the regression target is the 'price' column
y = df.loc[:, 'price']

In [None]:
# drop target from features
# Drop the target by name: a positional slice such as iloc[:, :-1] only removes
# 'price' if it happens to be the last column; otherwise the target would stay
# among the features and an unrelated column would be lost instead.
X = df.drop(columns=['price'])

In [None]:
# hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, test_size=0.2)

In [45]:
# Linear Regression pipeline: standardise the features, then fit the linear model
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('lm', LinearRegression())])

## Fit linear model on training data, evaluate performance on training and test data

In [None]:
# fit the model
# Fits the pipeline (scaler, then linear regression) on the training split only.
pipeline.fit(X_train,y_train)

In [None]:
# score the basic model on the rows it was trained on and report it
train_score = pipeline.score(X_train, y_train)
print('The training score on the basic linear model is %f.' % train_score)

In [None]:
# score the basic model on the held-out test rows and report it
test_score = pipeline.score(X_test, y_test)
print('The test score on the basic linear model is %f.' % test_score)

## Fit linear model on clusters (TEST DRIVE)
#### Be sure to add a train/test split once we have optimal clusters

In [13]:
from sklearn.cluster import KMeans
# Seed the estimator so that cluster assignments (and the per-cluster scores
# computed from them) are reproducible between runs; KMeans initialisation is
# random otherwise. 11 matches the seed used for the train/test split.
kmeans = KMeans(random_state=11)

In [42]:
#from sklearn.ensemble import RandomForestRegressor

In [15]:
# leave out lat long for now
# NOTE(review): positional slice -- assumes the first two columns are latitude
# and longitude; TODO confirm against df.columns.
# This rebinds df to a slice of itself, so re-running the cell drops two more
# columns each time.
df=df.iloc[:,2:]

In [16]:
#set kmeans params
# request three clusters on the existing KMeans instance
kmeans.set_params(n_clusters=3)

KMeans(n_clusters=3)

In [17]:
# fit kmeans
# NOTE(review): the clustering is fitted on df as-is -- no scaling step is
# applied in this cell, and df still holds the 'price' column that is used as
# the regression target further down.
kmeans.fit(df)

KMeans(n_clusters=3)

In [18]:
# create a clusters column
# Give the labels df's own index. A fresh 0..n-1 index only lines up with df
# when df's index is also 0..n-1, and pd.concat(axis=1) aligns rows by index
# label, so a mismatch would pair labels with the wrong rows and add NaN rows.
clusters = pd.DataFrame({'cluster': kmeans.labels_}, index=df.index)

In [19]:
# attach the cluster labels to the dataframe as an extra column
mydf = pd.concat(objs=[df, clusters], axis='columns')


In [25]:
# one sub-frame per cluster label (0, 1 and 2)
mydf1, mydf2, mydf3 = (mydf.loc[mydf['cluster'] == label] for label in range(3))

In [37]:
# split target and features for cluster 1
# Drop 'price' and 'cluster' by name rather than slicing off the last two
# columns, so the target cannot end up in X1 if the column order differs.
y1 = mydf1['price']
X1 = mydf1.drop(columns=['price', 'cluster'])

In [40]:
# split target and features for cluster 2
# Drop 'price' and 'cluster' by name rather than slicing off the last two
# columns, so the target cannot end up in X2 if the column order differs.
y2 = mydf2['price']
X2 = mydf2.drop(columns=['price', 'cluster'])

In [49]:
# split target and features for cluster 3
# Drop 'price' and 'cluster' by name rather than slicing off the last two
# columns, so the target cannot end up in X3 if the column order differs.
y3 = mydf3['price']
X3 = mydf3.drop(columns=['price', 'cluster'])

In [46]:
# fit model on cluster 1
# Refits the shared pipeline on cluster 1 only, then scores it on the same
# rows it was fitted on -- an in-sample score, not a held-out one.
pipeline.fit(X1,y1)
pipeline.score(X1,y1)

0.44086977981133957

In [47]:
# fit model on cluster 2
# Refits the shared pipeline on cluster 2 only, then scores it on the same
# rows it was fitted on -- an in-sample score, not a held-out one.
pipeline.fit(X2,y2)
pipeline.score(X2,y2)

0.04245433467832127

In [50]:
# fit model on cluster 3
# Refits the shared pipeline on cluster 3 only, then scores it on the same
# rows it was fitted on -- an in-sample score, not a held-out one.
pipeline.fit(X3,y3)
pipeline.score(X3,y3)

0.10661210651932929

In [54]:
# fit model on overall dataframe
# df here is the frame with the first two columns already removed.
# The target is dropped by name so that it cannot remain among the features
# when 'price' is not the last column.
y_overall = df['price']
X_overall = df.drop(columns=['price'])

In [55]:
# fit model on overall
# Refits the shared pipeline on all rows and scores it on those same rows, so
# this is an in-sample score comparable to the per-cluster scores.
pipeline.fit(X_overall,y_overall)
pipeline.score(X_overall,y_overall)

0.4992023157704445