In [1]:
import sys
sys.path.append('..')

from bag_of_words import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

In [2]:
topic_num = 20
df = get_data(path='../data/iphone6.csv')

In [3]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                   max_features=200000,
                                   min_df=1,
                                   use_idf=True,
                                   ngram_range=(1,1),
                                   stop_words = 'english',
                                   norm='l2')
tfidf_matrix = tfidf_vectorizer.fit_transform(df.Reviews_bw)
vocab = tfidf_vectorizer.get_feature_names()

In [4]:
km = KMeans(n_clusters=topic_num)
km.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=20, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [5]:
topics = km.labels_.tolist()
df['topics'] = topics
print df['topics'].value_counts()

17    2632
2     1974
19     485
18     463
13     450
0      444
4      384
12     363
9      323
5      315
6      307
15     228
3      228
7      213
16     188
1      176
10     148
14     133
8      111
11      91
Name: topics, dtype: int64


In [6]:
grouped = df['Rating'].groupby(df['topics'])
print grouped.mean()

topics
0     4.108108
1     4.829545
2     2.949341
3     3.280702
4     4.666667
5     4.879365
6     4.345277
7     4.957746
8     4.477477
9     2.857585
10    4.844595
11    4.901099
12    1.997245
13    4.771111
14    4.699248
15    4.960526
16    4.936170
17    3.542933
18    2.105832
19    4.703093
Name: Rating, dtype: float64


In [7]:
print "Top terms per cluster:"
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
review_group = df['Reviews'].groupby(df['topics'])

for i in range(topic_num):
    print "\nCluster %d words:" % i
    for ind in order_centroids[i, :5]: #replace 5 with n words per cluster
        print vocab[ind],

Top terms per cluster:

Cluster 0 words:
came phone condition charger new 
Cluster 1 words:
described exactly phone product item 
Cluster 2 words:
phone great new screen got 
Cluster 3 words:
sim card phone unlocked work 
Cluster 4 words:
good far phone condition really 
Cluster 5 words:
great phone works price condition 
Cluster 6 words:
product great good excelent excellent 
Cluster 7 words:
love phone iphone new great 
Cluster 8 words:
ok good phone product perfect 
Cluster 9 words:
working stopped phone good months 
Cluster 10 words:
thanks awesome phone great good 
Cluster 11 words:
excelente producto recomendable product recomendado 
Cluster 12 words:
work phone screen charger properly 
Cluster 13 words:
works great perfectly phone fine 
Cluster 14 words:
nice phone love good works 
Cluster 15 words:
excellent product condition recommended phone 
Cluster 16 words:
perfect condition works phone new 
Cluster 17 words:
iphone excelent good expected great 
Cluster 18 words:
battery p

In [None]:
dist = 1 - cosine_similarity(tfidf_matrix)
MDS()
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)
xs, ys = pos[:, 0], pos[:, 1]

In [None]:
%matplotlib inline 

#create data frame that has the result of the MDS plus the cluster numbers and titles
dfplot = pd.DataFrame(dict(x=xs, y=ys, label=topics, review=df["Reviews"])) 

#group by cluster
groups = dfplot.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
plt.show() #show the plot
