# Cluster data and feed to AVM prediction models

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
import os

In [2]:
# navigate to local repo
# NOTE(review): absolute, machine-specific path. Later cells read and write
# './data/...' relative to this directory, so adjust it before running the
# notebook on another machine.
os.chdir('/users/trevor.mattos/desktop/nycdsa/finalproject/cleancode')

### Read the clean merged data

In [3]:
# from data_compiler import *
# df=mycompiler()

In [4]:
# produce the original clean merged df with the code cell above
# then save as csv to reduce computation time using this cell

# df.to_csv(
#     '/users/trevor.mattos/desktop/nycdsa/finalproject/cleancode/data/compiled.csv',
# index=False)

In [5]:
# read data from file
# (the relative path is resolved against the directory set with os.chdir)
df=pd.read_csv('./data/compiled.csv')

In [6]:
# Recast the categorical columns to object dtype in a single astype call,
# so the dtypes inferred while reading the CSV don't affect the analysis.
categorical_cols = [
    'special_features',
    'transaction_type',
    'listing_status',
    'listing_special_features',
    'zip',
]
df = df.astype(dict.fromkeys(categorical_cols, object))


### Reduce dimensionality with PCA

In [7]:
# mypca is presumably provided by the star import from the pca_optimized
# module (not visible in this notebook; verify).
# Its three return values are used below as: the explained variance of each
# component, the loading vectors, and the component scores per row.
from pca_optimized import *
df_var, pca_loadings, components=mypca(df, n_components=3)

In [8]:
# check the components' explained variance
df_var

array([0.38979099, 0.10758053, 0.08565046])

In [31]:
# view loading vectors
pca_loadings

Unnamed: 0,pc1,pc2,pc3
beds,-0.080432,0.437244,0.260843
baths_full,-0.095839,0.441851,0.32204
square_footage,-0.092703,0.407998,0.305276
year_built,-0.146979,0.133285,0.282095
grocer_dist,-0.006645,-0.336528,0.269479
bank_dist,-0.065362,-0.252643,0.396339
school_dist,-0.058896,-0.262502,0.435359
walkscore,0.160847,0.222933,-0.184259
violent_crime_total_rate,0.296688,-0.014451,0.033821
violent_crime_assault_rate,0.262411,-0.065645,0.065679


In [10]:
# save components into a dataframe, one column per principal component
df_pca=pd.DataFrame(components, columns=['pc1','pc2','pc3'])
# bare expression: displays the frame as the cell output
df_pca

Unnamed: 0,pc1,pc2,pc3
0,-0.772738,-2.637952,2.006469
1,2.983391,-1.683634,0.195053
2,2.964482,-1.690154,0.201208
3,2.889888,-0.707172,0.404453
4,2.542239,-0.061526,1.401968
...,...,...,...
11486,-1.219903,-0.640894,2.171864
11487,0.218796,-1.531675,-1.332892
11488,-0.222348,-1.677375,0.296606
11489,-0.035737,-1.398951,-0.899497


### Cluster data using principal components

In [11]:
from sklearn.cluster import KMeans
# created with default parameters; the number of clusters is set with
# set_params right before fitting
kmeans = KMeans()

In [12]:
# Seed k-means so that cluster ids (and every per-cluster result derived
# from them) are reproducible from run to run.
kmeans.set_params(n_clusters=3, random_state=0)
# Fit on the principal-component columns only: later cells add 'clusters'
# and 'color' columns to df_pca, and selecting the features explicitly keeps
# this cell re-runnable after that.
kmeans.fit(df_pca[['pc1', 'pc2', 'pc3']])

KMeans(n_clusters=3)

### Visualize clusters in 3d space

In [20]:
# labels_ is already one label per row, so it can be assigned as-is
df_pca['clusters'] = kmeans.labels_

In [26]:
# create function
def coloring(clusters):
    """Return the hex colour assigned to a cluster label.

    Labels 0, 1 and 2 map to '#3498DB', '#17A589' and '#C39BD3'
    respectively; any other value yields None.
    """
    palette = ('#3498DB', '#17A589', '#C39BD3')
    # the position in the palette doubles as the cluster label
    for label, hex_code in enumerate(palette):
        if clusters == label:
            return hex_code
    return None
    
# add a column holding the hex colour of each row's cluster
df_pca['color'] = df_pca['clusters'].map(coloring)

In [13]:

import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization

In [38]:

# Create a 3D scatter plot of the three principal components.
# The 'color' column already stores one hex code per cluster. Mapping every
# code to itself guarantees each point is drawn in exactly that colour; with
# only color_discrete_sequence, plotly hands out the colours in order of first
# appearance, so a cluster could end up drawn in another cluster's colour.
hex_codes = df_pca['color'].dropna().unique()
fig = px.scatter_3d(df_pca, x='pc1', y='pc2', z='pc3',
                    opacity=0.3,
                    color='color',
                    color_discrete_map={code: code for code in hex_codes},
                    height=700, width=700)


# Update marker size
fig.update_traces(marker=dict(size=2))

fig.show()

#### Add cluster column to original dataframe to export for EDA

In [68]:
# labels_ is already one label per row, so it can be assigned as-is
df['clusters'] = kmeans.labels_

In [69]:
# write the frame, including the 'clusters' column, without the index
df.to_csv('./data/clustered.csv',index=False)

# Predict prices using clusters

#### Create a separate object with target 

In [70]:
# keep the 'price' column aside as the regression target
target=df['price']

#### Drop price from dataframe

In [71]:
# NOTE: df is rebound without 'price', so the cell that reads df['price']
# cannot be re-run after this one without reloading the data.
df=df.drop(['price'], axis=1)

#### Drop non-numeric  features

In [72]:
# Keep only the numeric feature columns (float64 / int64 / int32).
# NOTE(review): the first two columns are skipped regardless of dtype;
# presumably they are identifiers rather than features - confirm.
numeric_dtypes = ('float64', 'int64', 'int32')
numers = [col for col in df.columns[2:] if df[col].dtype in numeric_dtypes]
# .copy() makes the subset an independent frame, so assigning the cluster
# column to it afterwards does not raise SettingWithCopyWarning.
df = df[numers].copy()

#### Add cluster label

In [73]:
# labels_ is already one label per row, so it can be assigned as-is
df['clusters'] = kmeans.labels_

#### Use clusterTrainTestSplit to obtain:
* dictionary of train test split pairs for each cluster
* list of names of train test split per cluster dataframes
* X_train for full dataframe
* y_train for full dataframe
* X_test for full dataframe
* y_test for full dataframe

In [75]:
from clusterTrainTestSplit import *

In [76]:
# assign output and specify arguments: dataframe, target, number of clusters
dataframes, dlist, X_train, y_train, X_test,y_test=clusterTrainTestSplit(df, target, c=3)

# Fit Random Forest Regression model to assess performance of clustered prediction 

#### Create function to fit Random Forest Regression models for each cluster:

In [77]:
def ClusterRF(dataframes,dlist):
    """Fit one random forest regressor per cluster and report its scores.

    Parameters
    ----------
    dataframes : mapping
        Looked up with the names found in ``dlist``; the values are passed
        straight to ``fit`` and ``score``.
    dlist : iterable of sequences of str
        One entry per cluster. Positions 0-3 of an entry are the keys in
        ``dataframes`` of the training features, training target, test
        features and test target, in that order. The cluster number is read
        from the digits at the end of the first key.

    Returns
    -------
    list of tuple of str
        One ``(cluster label, training score message, test score message)``
        tuple per entry of ``dlist``, in the same order.

    Raises
    ------
    ValueError
        If the first key of an entry does not end in a digit.
    """
    from sklearn.ensemble import RandomForestRegressor
    results=[]
    for names in dlist:
        x_train_key, y_train_key = names[0], names[1]
        x_test_key, y_test_key = names[2], names[3]
        # same hyper-parameters and seed for every cluster
        rf = RandomForestRegressor(n_estimators = 100, random_state = 0,max_depth=4)
        rf.fit(dataframes[x_train_key], dataframes[y_train_key])
        train_msg='The training score is: %f'\
                      %rf.score(dataframes[x_train_key], dataframes[y_train_key])
        test_msg='The test score is: %f'\
                      %rf.score(dataframes[x_test_key], dataframes[y_test_key])
        # Read ALL trailing digits of the key, not just the last character,
        # so that clusters numbered 10 and above are labelled correctly.
        stem=x_train_key.rstrip('0123456789')
        cluster_id=int(x_train_key[len(stem):])
        results.append(('Cluster: %d' %cluster_id, train_msg, test_msg))
    return(results)
    

#### Use ClusterRF function and save results

In [78]:
myresults=ClusterRF(dataframes,dlist)
myresults

[('Cluster: 0',
  'The training score is: 0.607415',
  'The test score is: 0.552414'),
 ('Cluster: 1',
  'The training score is: 0.538973',
  'The test score is: 0.517075'),
 ('Cluster: 2',
  'The training score is: 0.906605',
  'The test score is: 0.637252')]

#### Obtain mean training score and test score for random forest cluster models

In [79]:
# Each entry of myresults is (cluster label, training message, test message),
# and both messages end with the score, e.g. 'The test score is: 0.552414'.
# Parse the token after the last space instead of slicing a fixed 8
# characters: a negative score such as '-0.123456' is 9 characters long and
# the fixed slice would silently drop its minus sign.
avgtrnscores=[float(i[1].rsplit(' ', 1)[-1]) for i in myresults]
avgtstscores=[float(i[2].rsplit(' ', 1)[-1]) for i in myresults]
# NOTE: unweighted mean over clusters; cluster sizes are not taken into account
print('The average cluster training score is %f' %np.mean(avgtrnscores))
print('The average cluster test score is %f' %np.mean(avgtstscores))

The average cluster training score is 0.684331
The average cluster test score is 0.568914


### Fit Random Forest Regression model on overall training and test data to compare against performance of clustered prediction models

In [80]:
# Import explicitly: the only other import of RandomForestRegressor visible
# in this notebook is local to ClusterRF, so without this line the cell
# depends on a star import or on leftover kernel state.
from sklearn.ensemble import RandomForestRegressor

# same hyper-parameters as the per-cluster models, fitted on the full split
rf = RandomForestRegressor(n_estimators = 100, random_state = 0, max_depth=4)
rf.fit(X_train,y_train)
print('The overall training score is %f' %rf.score(X_train,y_train))
print('The overall test score is %f' %rf.score(X_test,y_test))

The overall training score is 0.544914
The overall test score is 0.539609
