In [None]:
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from scipy.spatial.distance import cdist
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

# sklearn.cross_validation was removed in scikit-learn 0.20; use its
# replacement and fall back to the old module only on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

%matplotlib inline


In [None]:
# Load the raw listings and preview the first rows only, rather than
# rendering the whole frame in the notebook output.
all_df = pd.read_csv('all_types.csv')
all_df.head()

In [None]:
# Restrict the listings to the columns used for clustering.
feature_columns = ['bedrooms', 'bathrooms', 'finished_SqFt', 'total_rooms', 'neighborhood']
df = all_df.loc[:, feature_columns]

In [None]:
# Create dummies for categorical data: one indicator column per neighborhood.
neighborhood = pd.get_dummies(df['neighborhood'])
# This cell rebinds `df`, so running it twice used to append a second,
# duplicate set of indicator columns. Drop any left over from an earlier run
# before concatenating (a no-op on the first run).
df = df.drop(neighborhood.columns, axis=1, errors='ignore')
df = pd.concat([df, neighborhood], axis=1)

In [None]:
# Summarize the feature frame (columns, dtypes, non-null counts) after the
# dummy columns were added.
df.info()

In [None]:
# Feature matrix: every column of df except the raw 'neighborhood' column.
X = df.drop('neighborhood', axis=1)

# Persist the unscaled feature matrix.
X.to_csv('cluster_df.csv')

# Standardize: fit the scaler on X and transform X in a single call; `scale`
# stays fitted for later use.
scale = StandardScaler()
scaled_df = scale.fit_transform(X)
print(scaled_df)

In [None]:
# Elbow method: for each candidate k, record the mean Euclidean distance from
# every sample to its nearest cluster centre.
distortions = []
K = [2, 4, 6, 10, 20, 30, 50, 100]
for k in K:
    # Fit once per k (the model used to be fitted twice, doubling the runtime).
    # A fixed random_state keeps the curve reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(scaled_df)
    nearest_center_dist = cdist(scaled_df, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_center_dist.mean())

In [None]:
# Plot distortion against k using the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Find the best k for clustering: silhouette score for each candidate k
# (higher is better). A fixed random_state makes the scores reproducible, so
# the k chosen from them can be justified on a re-run.
candidate_ks = [2, 4, 6, 10, 20, 30, 50, 100]

scores = []
for k in candidate_ks:
    labels_k = KMeans(n_clusters=k, random_state=0).fit_predict(scaled_df)
    scores.append(metrics.silhouette_score(scaled_df, labels_k))

scores

In [None]:
# Cluster the data into 50 groups. A fixed random_state keeps the cluster ids
# stable between runs; without it the labels attached to the listings change
# every time the notebook is executed.
kmeans = KMeans(n_clusters=50, random_state=0).fit(scaled_df)

# Cluster id for every row of scaled_df
labels = kmeans.predict(scaled_df)
# Getting the cluster centers
C = kmeans.cluster_centers_

In [None]:
# Display the fitted cluster centres (coordinates are in the standardized
# feature space the model was trained on).
C

In [None]:
# Model score on the training data; for KMeans this is the negative of the
# k-means objective, so values closer to zero indicate tighter clusters.
kmeans.score(scaled_df)

In [None]:
# Glue the cluster labels back onto the original data as a 'cluster' column.
all_df = all_df.assign(cluster=labels)

In [None]:
# Preview the first listings to check the data after the 'cluster' column was added.
all_df.head()

In [None]:
# Sample user input: the numeric features first, followed by one 0/1 flag per
# neighborhood with only the chosen neighborhood set to 1.
neighborhood_names = [
    'Allston', 'Back Bay', 'Bay Village', 'Beacon Hill', 'Brighton',
    'Charlestown', 'Chinatown', 'Downtown', 'Downtown Crossing',
    'East Boston', 'Fenway', 'Hyde Park', 'Jamaica Plain', 'Kenmore',
    'Leather District', 'Mattapan', 'Mission Hill', 'North Dorchester',
    'North End', 'Roslindale', 'Roxbury', 'South Boston',
    'South Dorchester', 'South End', 'West End', 'West Roxbury', 'Winthrop',
]

user_input = {'bedrooms': 5, 'bathrooms': 3, 'finished_SqFt': 4500, 'total_rooms': 12}
user_input.update({name: int(name == 'South End') for name in neighborhood_names})

# Single-row frame holding the user's query
user_df = pd.DataFrame(user_input, index=[0])
print(user_df)

In [None]:
# Re-use the scaler fitted on the training features (never re-fit it on the
# single user row). The user's columns must line up one-to-one with the
# training matrix X, so reorder them to X.columns first; any training column
# missing from the user input is filled with 0.
unknown_cols = user_df.columns.difference(X.columns)
if (user_df[unknown_cols] != 0).any().any():
    # A non-zero value in a column the model never saw cannot be encoded.
    raise ValueError('user input sets features the model was not trained on: %s'
                     % list(unknown_cols))
user_features = user_df.reindex(columns=X.columns, fill_value=0)
scaled_user_df = scale.transform(user_features)
print(scaled_user_df)

In [None]:
# Get cluster for user input: predict returns an array with one cluster id per
# input row, so the user's cluster is user_cluster[0].
user_cluster = kmeans.predict(scaled_user_df)

In [None]:
# Show the cluster id(s) predicted for the user input.
print(user_cluster)

In [None]:
# Distance from every listing to every cluster centre: kmeans.transform maps
# scaled_df into cluster-distance space (one column per cluster). The user's
# input row is not part of this computation.
trans = kmeans.transform(scaled_df)

In [None]:
# Display the cluster-distance matrix computed by kmeans.transform.
trans

In [None]:
# Pick the listings most similar to the user's request.
# The previous version ranked listings by distance to the cluster *centroid*,
# which returns the same three rows for every user who lands in that cluster.
# Instead, take the listings assigned to the user's cluster and rank them by
# Euclidean distance to the user's own scaled feature vector.
N_RECOMMENDATIONS = 3

# Row positions (in scaled_df / all_df) of the listings in the user's cluster
cluster_members = np.flatnonzero(labels == user_cluster[0])
dist_to_user = cdist(scaled_df[cluster_members], scaled_user_df, 'euclidean').ravel()

# Positions of the nearest listings, closest first (fewer if the cluster is small)
closest_points = cluster_members[np.argsort(dist_to_user)[:N_RECOMMENDATIONS]].tolist()

print(closest_points)

In [None]:
# Show each recommended listing followed by its zpid.
# closest_points holds row *positions*, so both lookups are positional; the
# earlier mix of iloc (position) and loc (label) only agreed while all_df kept
# its default 0..n-1 index.
for i in closest_points:
    print(all_df.iloc[i])
    print(all_df['zpid'].iloc[i])