In [74]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.express as px
from dash.dependencies import Input, Output
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Read the survey responses into a DataFrame.
with open("responses.csv") as f:
    df = pd.read_csv(f)

# Read the column catalogue and turn it into a lookup that maps each short
# column header to the full survey question it stands for.
with open("columns.csv") as f:
    x = pd.read_csv(f)
headerDict = x.set_index("short")["original"].to_dict()

# Print row/column counts and the dtype breakdown of the loaded responses.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Columns: 150 entries, Music to House - block of flats
dtypes: float64(134), int64(5), object(11)
memory usage: 1.2+ MB


In [44]:
# Isolate the non-numerical (object dtype) columns to see which parts of the
# data the clustering cannot handle directly.
df_other = df.select_dtypes(include=['object'])
df_other.describe()

# QUESTION: Each of these categorical columns has only a small number of unique values. Does it make sense to translate them to numerical data so that their information is not lost for the clustering?

Unnamed: 0,Smoking,Alcohol,Punctuality,Lying,Internet usage,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
count,1002,1005,1008,1008,1010,1004,1007,1009,1008,1006,1006
unique,4,3,3,4,4,2,2,6,2,2,2
top,tried smoking,social drinker,i am always on time,sometimes,few hours a day,female,right handed,secondary school,no,city,block of flats
freq,430,659,399,549,744,593,906,621,754,707,595


In [61]:
# Keep only the numerical columns and, for now, discard every row that
# contains a NaN; the index is rebuilt so it is contiguous again.
df_num = (
    df.select_dtypes(include=['number'])
    .dropna()
    .reset_index(drop=True)
)
df_num.info()

# QUESTION: Dropping the NaN values costs us more than 300 rows, which is probably too many. Should the missing values be replaced with the column average instead? Or with the middle value of the possible range of values?

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686 entries, 0 to 685
Columns: 139 entries, Music to Number of siblings
dtypes: float64(134), int64(5)
memory usage: 745.1 KB


In [68]:
# QUESTION: since we have a high-dimensionality data set, should we normalize the data set?
def dummy_cluster_norm(data, numCluster, random_state=None):
    """Cluster the rows of ``data`` with k-means and attach a 3-D t-SNE embedding.

    The features are normalized with ``preprocessing.normalize`` and that same
    normalized matrix is used both for the k-means fit and for the t-SNE
    projection, so the plotted coordinates describe the space the clusters
    were actually found in.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature table without missing values. It is NOT modified.
    numCluster : int
        Number of k-means clusters to fit.
    random_state : int or None, optional
        Seed forwarded to both ``KMeans`` and ``TSNE``. The default ``None``
        keeps the previous unseeded behaviour; pass an integer to get
        reproducible labels and coordinates.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with four extra columns: ``clusterNo`` (k-means
        label) and ``xcoord`` / ``ycoord`` / ``zcoord`` (t-SNE coordinates).
    """
    # Normalize once and reuse the result for both estimators.
    features = preprocessing.normalize(data)

    # instantiate kmeans object and perform clustering
    kmeans = KMeans(n_clusters=numCluster, random_state=random_state)
    kmeans.fit(features)

    # dimensionality reduction via TSNE
    embedding = TSNE(n_components=3, random_state=random_state).fit_transform(features)
    tsne_3d_df = pd.DataFrame(embedding)

    # Work on a copy: writing into the caller's frame would make the new
    # columns part of the feature set the next time the function is run on it.
    result = data.copy()
    result['clusterNo'] = kmeans.labels_
    # Assign coordinates positionally (plain arrays) so they line up with the
    # rows even when ``data`` does not carry a default 0..n-1 index.
    result['xcoord'] = tsne_3d_df[0].to_numpy()
    result['ycoord'] = tsne_3d_df[1].to_numpy()
    result['zcoord'] = tsne_3d_df[2].to_numpy()

    return result

In [69]:
# Cluster the NaN-free numeric responses into three groups and attach the
# cluster label and 3-D coordinates for each row.
df_num_clusters = dummy_cluster_norm(data=df_num, numCluster=3)

In [71]:
# Plot every row at its t-SNE coordinates, coloured by k-means cluster.
# `px` (plotly.express) is imported in the import cell at the top of the notebook.
#
# Cluster ids are nominal labels, so they are cast to strings for plotting:
# Plotly Express draws a numeric colour column on a continuous colour scale,
# whereas string values get one discrete colour and legend entry per cluster.
# The cast is done on a derived frame so df_num_clusters itself is unchanged.
plot_df = df_num_clusters.assign(clusterNo=df_num_clusters['clusterNo'].astype(str))

fig = px.scatter_3d(
    plot_df,
    x='xcoord',
    y='ycoord',
    z='zcoord',
    color='clusterNo',
    title='k-means clusters in a 3-D t-SNE embedding',
)

fig.show()