In [1]:
import sys
# Make the parent directory importable; presumably that is where the
# bag_of_words / topic_model_svd modules live (verify against repo layout).
sys.path.append('..')

# NOTE(review): get_data and get_new_base used further down are not imported
# by name anywhere, so they come from one of these star imports. The star
# imports are kept first so that the explicit imports below still win on any
# name clash, exactly as before.
from bag_of_words import *
from topic_model_svd import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Import the stop-word list from the public `text` module: the
# `sklearn.feature_extraction.stop_words` module was deprecated and later
# removed, while `text.ENGLISH_STOP_WORDS` is available in old and new
# scikit-learn releases alike.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy.sparse.linalg import svds
from sklearn.cluster import KMeans
import numpy as np

In [2]:
# Load the review table. Later cells read its Reviews, Reviews_bw and Rating
# columns. get_data is a project helper pulled in by the star imports.
df = get_data(path='../data/iphone6.csv')

# Number of k-means clusters ("topics") used by the clustering and reporting
# cells below.
topic_num = 20

In [3]:
# Build the stop-word list: the whitespace-separated words on the FIRST line
# of stop_words.txt, followed by scikit-learn's built-in English list.
# The context manager guarantees the file is closed even if reading fails,
# and readline() avoids loading the whole file just to use one line. An
# empty file now yields no custom words instead of raising IndexError.
with open('stop_words.txt', 'r') as f:
    mystop = f.readline().split()
mystop.extend(ENGLISH_STOP_WORDS)

# Bag-of-words vectorizer over single words (unigrams) with the combined
# stop-word list filtered out. max_df=1.0 and min_df=1 disable
# document-frequency pruning; the vocabulary is capped at 200000 terms.
vectorizer = CountVectorizer(
    stop_words=mystop,
    ngram_range=(1, 1),
    min_df=1,
    max_df=1.0,
    max_features=200000,
)


In [4]:
# Raw term counts for every review in the Reviews_bw column.
term_counts = vectorizer.fit_transform(df['Reviews_bw'])

# With use_idf=False no IDF re-weighting is applied, so this step only
# L2-normalises each review's count vector.
row_normalizer = TfidfTransformer(use_idf=False, norm='l2')
bw_matrix = row_normalizer.fit_transform(term_counts)

# Vocabulary terms, indexed consistently with the columns of bw_matrix.
vocab = vectorizer.get_feature_names()

In [5]:
# Truncated SVD of the review-term matrix, keeping svd_rank components.
# Per the scipy docs, svds returns (u, s, vt): `t` is indexed by review and
# `d` has one row per component over the vocabulary. Later cells cluster on
# `t` and read rows of `d`.
svd_rank = 50
t, s, d = svds(bw_matrix, k=svd_rank)

In [6]:
# Cluster the reviews by their rows of `t` (the first svds factor) into
# topic_num groups. k-means starts from random centroids, so a fixed
# random_state is required for the cluster ids -- and every per-cluster
# table printed below -- to be reproducible across kernel restarts.
km = KMeans(n_clusters=topic_num, random_state=0)
km.fit(t)

# One cluster id per review, stored as a new `topic` column on df.
topics = km.labels_.tolist()
df['topic'] = topics

In [7]:
# Mean Rating of the reviews assigned to each topic cluster.
grouped = df['Rating'].groupby(df['topic'])
mean_rating_by_topic = grouped.mean()
# Single-argument print(...) gives the same output as the print statement.
print(mean_rating_by_topic)

topics
0     3.848055
1     4.324675
2     4.750000
3     4.090000
4     4.686275
5     3.212851
6     3.403509
7     1.972678
8     2.440476
9     4.279412
10    4.479452
11    3.666667
12    3.850877
13    4.590909
14    3.533333
15    1.871681
16    4.515337
17    3.353488
18    2.122302
19    2.914027
Name: Rating, dtype: float64


In [8]:
print("Top terms per cluster:")

# For every centroid, rank the SVD component indices from the largest to the
# smallest centroid coordinate (ascending argsort, then reversed per row).
order_centroids = np.argsort(km.cluster_centers_, axis=1)[:, ::-1]

# Raw review text grouped by topic; not used in this cell.
review_group = df['Reviews'].groupby(df['topic'])

for cluster_id in range(topic_num):
    print("\nCluster %d words:" % cluster_id)
    # Report only the two highest-ranked components of this centroid.
    top_components = order_centroids[cluster_id, :2]
    for component in top_components:
        print(component)
        # d[component] is that component's row of the third svds factor.
        # get_new_base is a project helper; only element [1] of each item it
        # returns is displayed (presumably the vocabulary term -- confirm).
        base = get_new_base(d[component], vocab, cutoff=0.3)
        print([entry[1] for entry in base])

Top terms per cluster:

Cluster 0 words:
34
[u'apple', u'price', u'refurbished']
44
[u'battery']

Cluster 1 words:
17
[u'delivery', u'money', u'shipping']
21
[u'quality', u'service']

Cluster 2 words:
13
[u'gb', u'gift']
12
[u'gb', u'gift', u'scratch']

Cluster 3 words:
41
[u'problems', u'time']
42
[u'problems', u'screen', u'time']

Cluster 4 words:
36
[u'fast', u'price', u'shipping']
39
[u'price', u'seller', u'unlocked']

Cluster 5 words:
28
[u'charge', u'issues']
44
[u'battery']

Cluster 6 words:
23
[u'box']
8
[u'device', u'headphones', u'month']

Cluster 7 words:
17
[u'delivery', u'money', u'shipping']
18
[u'delivery', u'money', u'shipping']

Cluster 8 words:
5
[u'camera', u'sound']
4
[u'ordered', u'promised']

Cluster 9 words:
46
[u'iphone', u'new']
35
[u'apple', u'scratches', u'seller']

Cluster 10 words:
19
[u'day', u'return', u'son']
13
[u'gb', u'gift']

Cluster 11 words:
13
[u'gb', u'gift']
14
[u'amazon', u'days', u'scratch']

Cluster 12 words:
9
[u'device', u'headphones', u'mo

array([  1.69945909e-04,  -5.93240090e-04,  -3.99738359e-04,
        -8.95285354e-05,  -3.99145409e-04,  -7.40520222e-04,
         4.78044051e-04,   1.02598884e-03,  -8.80437724e-04,
        -5.01014912e-04,  -1.09262103e-03,   5.07685840e-04,
         5.94397964e-04,  -1.00587012e-03,   4.01072075e-04,
         9.03458147e-04,  -1.63337658e-04,  -3.26205630e-04,
        -1.87449276e-04,  -9.57664975e-04,   2.66633980e-04,
        -5.34081766e-04,  -7.97285115e-05,  -2.46913469e-05,
        -7.59944483e-04,   3.61855191e-03,  -3.88150944e-04,
        -9.24614831e-04,   5.85867612e-04,   1.55562089e-04,
         2.60206511e-03,  -1.46019128e-03,   3.04842914e-03,
        -1.01541611e-04,  -1.33805188e-03,  -1.62883003e-04,
        -7.15323544e-04,   2.48970704e-03,  -7.86977235e-04,
        -4.22414904e-03,   4.42944844e-03,   5.31612029e-02,
         1.98514255e-02,  -1.33135583e-02,   2.74797042e-03,
         1.80362365e-03,  -1.08003406e-03,   1.00339802e-03,
        -4.82729044e-03,

array([ -3.37484693e-03,  -1.50527349e-03,   2.29907887e-03,
        -5.44669507e-03,  -6.31039142e-03,   5.39822108e-03,
         2.13027027e-03,   1.61112399e-03,   7.56479888e-04,
         1.52490489e-03,  -2.51088835e-03,  -1.08319868e-02,
         1.33103998e-02,   2.20639359e-02,   6.51843033e-03,
         8.26955615e-03,   3.47539382e-03,  -2.43682074e-03,
        -1.28661606e-03,  -2.50885802e-03,   1.27146268e-02,
        -6.43036917e-04,   5.77540541e-03,  -4.83990164e-03,
        -6.97631823e-04,  -1.52697268e-01,  -4.16409317e-03,
        -1.18200172e-02,   8.22938398e-04,   3.50022820e-03,
        -9.09768414e-03,   7.29260867e-03,  -1.77993627e-03,
        -1.45248825e-03,   1.74994514e-03,  -1.65307451e-03,
         2.05514834e-03,  -2.99227227e-03,  -1.27114507e-03,
         1.51172578e-03,  -1.51273062e-04,   3.61372967e-03,
         1.55987241e-03,  -1.38119161e-03,  -1.55617362e-04,
        -1.00708960e-04,  -1.67071773e-04,   2.08655832e-04,
        -3.77481438e-04,