In [1]:
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pickle
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import math
import operator
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#Only run if raw dataset has changed
# DISABLED: everything between the triple quotes is a bare string literal, so
# running this cell only executes the print() at the bottom. Remove the quotes
# to re-run the cleaning step.
# When enabled, the code loads the stopword list, then for each line of
# reviews.txt removes '...more', replaces periods with spaces, lowercases,
# drops stopwords, Porter-stems the remaining tokens and writes the result as
# one line of ../cleaned_reviews.txt.
# NOTE(review): `cleaned` is opened without a `with` block, so the handle stays
# open if an exception occurs before cleaned.close().
# NOTE(review): this reads 'reviews.txt' relative to the working directory,
# while the sample-review cell further down reads '../Misc/reviews.txt' --
# confirm which path is intended.
"""
##
#clean raw text
##

#Initialize the porter stemmer and stopwords
stemmer = PorterStemmer()
with open('../Misc/stopwords.txt', 'r') as f:
    stops = []
    for line in f:
        stops.append(line.rstrip())
    stops = set(stops)

#clean the data and save to disk
cleaned = open('../cleaned_reviews.txt', 'w')

with open('reviews.txt', 'r') as f:
    for line in f:
        line = line.replace('...more', '')
        line = line.replace('.', ' ')
        tokens = [ stemmer.stem(word) for word in line.lower().rstrip().split(' ') if word not in stops ]
        cleaned.write('{}\n'.format(' '.join(tokens)))

cleaned.close()
"""
print('Comments')

In [None]:
#Only run if cleaned dataset has changed or you want a different word2vec model
# DISABLED: the training code below is wrapped in a bare string literal, so
# running this cell only executes the print() at the bottom.
# When enabled, it streams ../cleaned_reviews.txt one whitespace-split line at
# a time through MySentences, trains Word2Vec with size = 600 and workers = 4,
# saves the full model to ../Models/w2v_600d and the word vectors alone to
# ../Word_Vectors/wv_600d.
# NOTE(review): the loading cell further down reads the *_1500d artifacts, not
# the *_600d files written here -- confirm which dimensionality is current.
# NOTE(review): MySentences.__iter__ opens the file without closing it
# explicitly.
"""
##
# Code for creating the model. After created, then just load the saved model
##

#Helpful iterator, credit here: https://rare-technologies.com/word2vec-tutorial/
class MySentences(object):
    
    def __init__(self, filename):
        self.filename = filename
 
    def __iter__(self):
        for line in open(self.filename):
            yield line.rstrip().split()

#train word2vec on cleaned data, but do so using the memory saving trick from the link above
sentences = MySentences('../cleaned_reviews.txt')
model = Word2Vec(sentences, size = 600, workers = 4)
model.save('../Models/w2v_600d')

#Get vectors only from the model and save to disk
wv = model.wv
wv.save('../Word_Vectors/wv_600d')
"""
print('Comments')

In [None]:
#Only run if word2vec model has changed
# DISABLED: the export code below is wrapped in a bare string literal, so
# running this cell only executes the print() at the bottom.
# When enabled, it needs `wv` to be defined already (the training cell and the
# loading cell both assign it). For every vocabulary entry it writes one CSV
# row to ../Pre_Clustering/wvmaster_600d.csv -- the word's index followed by
# its vector components -- and records index -> word in `word_ids`, keyed by
# the index as a string. The dictionary is then pickled to
# ../Pre_Clustering/wordids_600d.pkl.
# NOTE(review): np.append presumably promotes the integer index to float, so
# the id column is likely written as e.g. '12.0'; the word-bag cell converts
# it back with int() before the lookup -- confirm.
"""
#dict mapping ids to words
word_ids = {}

#write master matrix to file
with open('../Pre_Clustering/wvmaster_600d.csv', 'w') as f:

    #create matrix for clustering, 1st column is id, save to disk
    for word, data in wv.vocab.items():
        word_ids[ str(data.index) ] = word
        temp = np.array([ data.index ])
        temp = np.append(temp, wv[word])
        temp = ','.join([ str(element) for element in temp ])
        f.write('{}\n'.format(temp))

#Save id dictionary to disk
with open('../Pre_Clustering/wordids_600d.pkl', 'wb') as f:
    pickle.dump(word_ids, f, pickle.HIGHEST_PROTOCOL)
"""
print('Comments')

In [2]:
# Load the pre-computed artifacts used by the rest of the notebook. Re-run this
# cell whenever the word2vec model or the pre-clustering files are regenerated.
# NOTE(review): this cell reads the *_1500d files, while the disabled build
# cells write *_600d files -- confirm the two are kept in sync.

#load numpy data: one row per word, first column is the word id
X = np.loadtxt('../Pre_Clustering/wvmaster_1500d.csv', delimiter = ',')

#load dictionary data (id -> word)
#SECURITY: pickle.load can execute arbitrary code; only load files you created
with open('../Pre_Clustering/wordids_1500d.pkl', 'rb') as f:
    word_ids = pickle.load(f)

#load the word2vec model
w2v = Word2Vec.load('../Models/w2v_1500d')

#load word vectors from initial word2vec training
wv = KeyedVectors.load('../Word_Vectors/wv_1500d')

In [None]:
#Only run if word2vec model has changed
# DISABLED: the clustering code below is wrapped in a bare string literal, so
# running this cell only executes the print() at the bottom.
# When enabled, it fits KMeans (n_clusters = 100, random_state = 0) on every
# column of X except the first (the id column) and pickles the fitted
# estimator to ../KMeans/kmeans_600d.pkl.
"""
#run k means
kmeans = KMeans(n_clusters = 100, random_state = 0).fit(X[:, 1:X.shape[1] ])

#save kmeans
with open('../KMeans/kmeans_600d.pkl', 'wb') as f:
    pickle.dump(kmeans, f, pickle.HIGHEST_PROTOCOL)
"""
print('Comments')

In [None]:
#Only run if kmeans has changed
# DISABLED: the code below is wrapped in a bare string literal, so running
# this cell only executes the print() at the bottom.
# When enabled, it needs X, word_ids and wv from the loading cell. It:
#   1. unpickles the fitted k-means model,
#   2. maps each word (looked up through the id in column 0 of X) to its
#      cluster label,
#   3. groups the words into 100 bags -- the hard-coded 100 has to match the
#      n_clusters used when fitting k-means,
#   4. sorts each bag by wv.vocab[word].count in descending order,
#   5. stores the full sorted bags in `fullbags` and the first 20 words of each
#      bag in `wordbags`, and pickles both under ../KMeans/.
"""
#load kmeans
with open('../KMeans/kmeans_600d.pkl', 'rb') as f:
    kmeans = pickle.load(f)

#attach words to labels
clustered_words = {}
for i, label in enumerate(kmeans.labels_):
    clustered_words[ word_ids[str(int(X[i, 0]))] ] = label

#group words by their labels
wordbags = [ [] for i in range(100) ]
for k, v in clustered_words.items():
    wordbags[int(v)].append( (k, wv.vocab[k].count) )
    
#Sort each cluster and trim to top 20.
fullbags = [''] * 100
for i, bag in enumerate(wordbags):
    wordbags[i] = [ item[0] for item in sorted(bag, key = operator.itemgetter(1), reverse = True) ]
    fullbags[i] = wordbags[i]
    wordbags[i] = wordbags[i][0:20]
    
with open('../KMeans/fullbags_600d.pkl' , 'wb') as f:
    pickle.dump(fullbags, f, pickle.HIGHEST_PROTOCOL)

#save trimmed clusters so we don't have to do anything above again
with open('../KMeans/wordbags_600d.pkl', 'wb') as f:
    pickle.dump(wordbags, f, pickle.HIGHEST_PROTOCOL)
"""
print('Comments')

In [3]:
# Restore the pickled per-cluster word lists from disk.
# (Presumably `wordbags` holds the trimmed top-20 lists and `fullbags` the
# complete lists, as produced by the disabled word-bag cell -- confirm for the
# 1500d files.)
with open('../KMeans/wordbags_1500d.pkl', 'rb') as trimmed_file:
    wordbags = pickle.load(trimmed_file)

with open('../KMeans/fullbags_1500d.pkl', 'rb') as full_file:
    fullbags = pickle.load(full_file)

In [None]:
# DISABLED: the export below is wrapped in a bare string literal, so running
# this cell only executes the print() at the bottom.
# When enabled, it writes every word of every bag in `wordbags` to
# ../Pre_Paircounting/Top20_600d.txt, one word per line, with a '###' line
# after each cluster.
"""
#save the top 20 words from each cluster to text file
with open('../Pre_Paircounting/Top20_600d.txt', 'w') as f:
    for bag in wordbags:
        for word in bag:
            f.write('{}\n'.format(word))
        f.write('###\n')
"""
print('Comments')

/////////////////////////////////////////////////////////
// Run up to this point before running the C++ program //
/////////////////////////////////////////////////////////

In [4]:
# Read the pair counts for the word-vector clusters: one integer per line.
# The scoring cell pairs paircounts[i] with wordbags[i], so the line order has
# to follow the cluster order.
with open('../Cluster_Metrics/paircounts_1500d.txt', 'r') as count_file:
    paircounts = [ int(row.rstrip()) for row in count_file ]

In [5]:
#combination formula
def nCr(n, r):
    """Return the number of ways to choose ``r`` items out of ``n``.

    The value is computed from integer factorials, so the result is an exact
    ``int``.

    Note: for ``n < r`` this returns 1, not the mathematical value 0. The
    cluster-scoring cell divides by ``nCr(len(bag), 2)``, so this keeps the
    denominator non-zero for bags with fewer than two words.
    """
    if n < r:
        return 1
    factorial = math.factorial
    return factorial(n) // (factorial(r) * factorial(n - r))

In [6]:
# Score every cluster as paircounts[i] divided by the number of distinct word
# pairs its bag can form, nCr(len(bag), 2), then rank the clusters from the
# highest score to the lowest. `v` holds just the scores, in ranked order.
clusters = sorted(
    ((idx, float(paircounts[idx]) / nCr(len(bag), 2)) for idx, bag in enumerate(wordbags)),
    key = operator.itemgetter(1),
    reverse = True
)
v = np.array([ ratio for _, ratio in clusters ])

# Optional: append the summary statistics to the shared stats file.
#with open('../Cluster_Metrics/Stats.txt', 'a') as f:
#    f.write('Word2Vec with 600 dim: Avg = {:.3f}; Std Dev = {:.3f}\n'.format(np.mean(v), np.std(v)))

////////////////////////////////////////
// END "CLUSTER GOODNESS" MEASUREMENT //
////////////////////////////////////////

In [7]:
#Trying seed word stuff
# Build the stemmed seed vocabularies for the two review aspects (amenities and
# services) from the hand-written keyword files.

#Initialize the porter stemmer and stopwords
stemmer = PorterStemmer()
with open('../Misc/stopwords.txt', 'r') as f:
    stops = set(line.rstrip() for line in f)


def _load_seed_terms(path):
    """Return the stemmed seed keywords listed one per line in ``path``.

    Each keyword is lowercased *before* the stopword check, matching the
    review-cleaning step, which lowercases a line before filtering it. (This
    assumes the stopword file is lowercase.) Blank lines are skipped so that
    an empty string never becomes a seed word.
    """
    with open(path, 'r') as f:
        words = [ line.rstrip().lower() for line in f ]
    return [ stemmer.stem(word) for word in words if word and word not in stops ]


amenities = _load_seed_terms('../Misc/amenities_keywords.txt')
services = _load_seed_terms('../Misc/service_keywords.txt')

In [12]:
#find the clusters that are most similar to the seed words
def _matching_clusters(seed_words, bags, threshold = 0.5, show_overlap = False):
    """Return the indices of the bags that contain most of the seed words.

    A bag matches when the share of distinct seed words found in it is
    strictly greater than ``threshold``.

    seed_words   -- iterable of (stemmed) seed words
    bags         -- sequence of word lists, one per cluster
    threshold    -- minimum share of seed words, exclusive
    show_overlap -- if True, print the shared words of every matching bag

    An empty seed list matches nothing (instead of dividing by zero).
    """
    seeds = set(seed_words)
    if not seeds:
        return set()

    matches = set()
    for i, bag in enumerate(bags):
        overlap = seeds & set(bag)
        if len(overlap) / float(len(seeds)) > threshold:
            if show_overlap:
                print(overlap)
            matches.add(i)
    return matches


amenity_similars = _matching_clusters(amenities, fullbags)
service_similars = _matching_clusters(services, fullbags, show_overlap = True)

{'feedback', 'appear', 'effici', 'prioriti', 'teamwork', 'care', 'peopl', 'empathi', 'flexibl', 'posit', 'qualiti', 'custom', 'listen', 'humor', 'respons', 'patient', 'valu', 'timeli', 'friendli', 'tact'}


In [14]:
# Display the cluster indices that were selected for the service seed words.
service_similars

{21}

In [None]:
# Load up to the first 25 raw reviews; each review becomes a list of
# lowercased, whitespace-trimmed sentences (split on '.').
with open('../Misc/reviews.txt', 'r') as f:
    testreviews = []
    for _ in range(25):
        line = f.readline()
        if not line:
            #readline() returns '' at end of file; stop instead of appending
            #phantom empty reviews when the file has fewer than 25 lines
            break
        line = line.lower().rstrip().replace('...more', '')
        #trim every piece first and only then drop the empty ones, so that
        #consecutive or leading periods cannot leave '' sentences behind
        pieces = [ piece.strip() for piece in line.split('.') ]
        testreviews.append([ piece for piece in pieces if piece ])

In [None]:
# Tag every sentence of one sample review: a sentence is reported for an aspect
# when more than half of its stemmed, non-stopword tokens occur in one of that
# aspect's clusters. A sentence is printed once per matching cluster.

#build each cluster's word set once instead of once per sentence
bag_sets = { i: set(fullbags[i]) for i in set(service_similars) | set(amenity_similars) }

for sentence in testreviews[9]:
    tokens = [ stemmer.stem(word) for word in sentence.split(' ') if word not in stops ]
    if not tokens:
        #every word was a stopword; the ratio below would divide by zero
        continue

    token_set = set(tokens)
    text = ' '.join(tokens)
    for label, cluster_ids in (('Service', service_similars), ('Amenity', amenity_similars)):
        for i in cluster_ids:
            if len(token_set & bag_sets[i]) / float(len(tokens)) > 0.5:
                print('{}: {}'.format(label, text))