#### Explore and Cluster User-Level Data

In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans, DBSCAN

warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
#Function to Print Percentiles (for Cleaning)
def percentiles(df, variable):
    """Print a fixed ladder of percentiles for one column, one per line.

    Args:
        df: DataFrame holding the column to summarise.
        variable: Name of the column in ``df``.

    Returns:
        None. Output goes to stdout as ``<label><value>`` lines, from the
        minimum (q=0.00) through the maximum (q=1.00).
    """
    # (label, quantile) pairs in print order; labels are padded so the
    # values line up in a column.
    checkpoints = (
        ("Min:  ", 0.00),
        ("1st:  ", 0.01),
        ("5th:  ", 0.05),
        ("10th: ", 0.10),
        ("25th: ", 0.25),
        ("50th: ", 0.50),
        ("75th: ", 0.75),
        ("90th: ", 0.90),
        ("95th: ", 0.95),
        ("99th: ", 0.99),
        ("Max:  ", 1.00),
    )
    column = df[variable]
    for label, q in checkpoints:
        # Single-argument print(...) behaves the same as a Python 2
        # statement and as a Python 3 function call.
        print(label + str(column.quantile(q=q)))

In [None]:
# Load the pickled user-level DataFrame.
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# this file should only ever come from a trusted source.
user = pd.read_pickle("data/yelp/dataframes/yelp_review_user.pkl")
# Preview the first rows (rendered as the cell output)
user.head(5)

In [None]:
##Look At Distribution of Total # Relevant
# Single-argument print(...) is used so the cell parses both as a Python 2
# print statement and as a Python 3 function call.
print("Total Sentences...")
percentiles(user, "total")

print("Relevant Sentences...")
percentiles(user, "relevant")

#### Let's Focus on users in the 75th - 99th percentile range

In [None]:
# Keep users whose relevant-sentence count lies in [12, 219] (both ends
# inclusive). NOTE(review): the bounds are hard-coded; presumably they were
# read off the 75th/99th percentile printout for "relevant" -- re-check them
# if the input data changes.
in_range = (user.relevant >= 12) & (user.relevant <= 219)

# Non-inplace reset_index returns a fresh frame labelled 0..n-1. Resetting the
# filtered slice in place would keep pandas' copy flag and make later column
# assignments on `user` emit SettingWithCopyWarning.
user = user[in_range].reset_index(drop=True)
print(user.location.value_counts())

#### Calculate % Of Each Topic Discussed

In [None]:
# For every topic, store its share of the user's relevant sentences
# (topic_<name> / relevant) in a new pct_<name> column.
for topic in ("food", "service", "ambience", "value"):
    user["pct_" + topic] = user["topic_" + topic] / user["relevant"]

# Preview the first rows with the new columns (rendered as the cell output)
user.head(10)

#### Look at % Across Data

In [None]:
# Cumulative, normalised histogram of each topic share on a 2x2 grid.
plt.figure(figsize=(20,10))

# (panel title, column) in subplot order: left-to-right, top-to-bottom
panels = [
    ("Food", "pct_food"),
    ("Service", "pct_service"),
    ("Ambience", "pct_ambience"),
    ("Value", "pct_value"),
]
for position, (panel_title, column) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    # `density=True` replaces the `normed=True` keyword, which newer
    # matplotlib releases no longer accept.
    plt.hist(user[column], bins=50, density=True, cumulative=True)
    plt.title(panel_title, fontsize=22)

#### Apply PCA for Dimensionality Reduction

In [None]:
# Project the four topic shares onto two whitened principal components and
# wrap the result in a DataFrame with named columns.
reducer = PCA(n_components=2, whiten=True)
reduced = pd.DataFrame(
    reducer.fit_transform(
        user[["pct_food", "pct_service", "pct_ambience", "pct_value"]]
    ),
    columns=["PCA_1", "PCA_2"],
)
reduced.head(5)

In [None]:
# Attach the two PCA coordinates to the user table column-wise.
# concat(axis=1) aligns rows on index labels, so this relies on both frames
# carrying the same 0..n-1 index: `user` is re-indexed after filtering and
# `reduced` is built with a default index.
user = pd.concat([user, reduced], axis=1)
user.head(5)

In [None]:
# Row count per distinct `location` value (rendered as the cell output)
user.location.value_counts()

In [None]:
# Scatter every user in the two-component PCA space
plt.figure(figsize=(20, 10))
plt.scatter(user["PCA_1"], user["PCA_2"])
plt.title("User Topics PCA", fontsize=22)

#### Cluster Yelp Users

In [None]:
##Function to Run DBSCAN Clustering
def dbscan(df, eps, min_samples):
    """Cluster the rows of ``df`` with DBSCAN on its PCA_1/PCA_2 columns.

    Args:
        df: DataFrame containing "PCA_1" and "PCA_2" columns.
        eps: Passed through to ``DBSCAN(eps=...)``.
        min_samples: Passed through to ``DBSCAN(min_samples=...)``.

    Returns:
        A new DataFrame: ``df`` plus a "cluster" column holding the label
        assigned to each row (scikit-learn labels noise points -1).
    """
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(df[["PCA_1", "PCA_2"]])

    # fit_predict returns one label per row in row order. Build the Series on
    # df's own index so concat(axis=1), which aligns on index labels, pairs
    # each label with its row even when df was filtered and not re-indexed.
    clusters = pd.Series(labels, index=df.index, name="cluster")

    return pd.concat([df, clusters], axis=1)

##Function to Run Mini-Batch K-Means Clustering
def kmeans(df, n_clusters):
    """Cluster the rows of ``df`` with Mini-Batch K-Means on PCA_1/PCA_2.

    Args:
        df: DataFrame containing "PCA_1" and "PCA_2" columns.
        n_clusters: Passed through to ``MiniBatchKMeans(n_clusters=...)``.

    Returns:
        A new DataFrame: ``df`` plus a "cluster" column holding the label
        assigned to each row.
    """
    model = MiniBatchKMeans(n_clusters=n_clusters)
    labels = model.fit_predict(df[["PCA_1", "PCA_2"]])

    # fit_predict returns one label per row in row order. Build the Series on
    # df's own index so concat(axis=1), which aligns on index labels, pairs
    # each label with its row even when df was filtered and not re-indexed.
    clusters = pd.Series(labels, index=df.index, name="cluster")

    return pd.concat([df, clusters], axis=1)

In [None]:
# First clustering pass over all users in PCA space.
plt.figure(figsize=(20,10))
clustered = dbscan(user, eps=0.17, min_samples=35)
#clustered = kmeans(user, n_clusters=3)

# Summary: number of distinct labels, rows per label, mean topic share per label.
# Single-argument print(...) parses on both Python 2 and Python 3.
print(clustered["cluster"].nunique())
print(clustered["cluster"].value_counts())
print(clustered.groupby(by="cluster")[["pct_food","pct_service","pct_ambience","pct_value"]].mean())

# Colour each point by its cluster label
plt.scatter(clustered.PCA_1, clustered.PCA_2, c=clustered.cluster)

Money! - with parameters eps=0.17 and min_samples=35 we've managed to split people out into outliers and core

#### Let's Look at the characteristics of our two clusters

In [None]:
def plot_by_cluster(df):
    """Draw side-by-side PCA scatter plots for cluster 0 and cluster -1.

    Args:
        df: DataFrame with "cluster", "PCA_1" and "PCA_2" columns.
    """
    # Select both subsets up front, paired with their panel titles
    panels = [
        (df[df.cluster == label], heading)
        for label, heading in ((0, "Cluster 0"), (-1, "Cluster -1"))
    ]

    plt.figure(figsize=(20, 10))
    for slot, (members, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.scatter(members.PCA_1, members.PCA_2)
        plt.title(heading, fontsize=20)

In [None]:
# Side-by-side scatter plots of the rows labelled 0 and -1
plot_by_cluster(clustered)

In [None]:
# Rows per cluster label, then per-cluster means of the relevant-sentence
# count and each topic share (the last expression renders as the cell output).
# Single-argument print(...) parses on both Python 2 and Python 3.
print(clustered.cluster.value_counts())
clustered.groupby(by="cluster")[["relevant","pct_food","pct_service","pct_ambience","pct_value"]].mean()

In [None]:
# Per-cluster standard deviation of each topic share
clustered.groupby(by="cluster")[["pct_food","pct_service","pct_ambience","pct_value"]].std()

#### Split Out into "Core" and "Outliers" and Cluster Further
* Core - KMeans
* Outliers - More DBSCAN to Parse out "Outliers" from "OMG So Much Outlier WHO ARE YOU?" (aka a sort of hierarchical-DBSCAN)

In [None]:
#Split into Core and Outliers
core = clustered[clustered.cluster==0]
outliers = clustered[clustered.cluster==-1]

core.drop("cluster", axis=1, inplace=True)
outliers.drop("cluster", axis=1, inplace=True)

print core.columns
print outliers.columns

In [None]:
# Second clustering pass, restricted to the core group.
clustered2 = dbscan(core, eps=0.15, min_samples=60)
#clustered2 = kmeans(core, n_clusters=2)

plt.figure(figsize=(20,10))
# Single-argument print(...) parses on both Python 2 and Python 3.
print(clustered2["cluster"].nunique())
# NOTE(review): `label=` receives the whole cluster Series, not a legend
# string; it has no visible effect unless a legend is drawn -- confirm intent.
plt.scatter(clustered2.PCA_1, clustered2.PCA_2, c=clustered2.cluster, label=clustered2.cluster)