In [None]:
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
#from gensim.parsing.porter import PorterStemmer
#from nltk.corpus import stopwords
import numpy as np
import pickle
from sklearn.cluster import KMeans
#from sklearn.metrics import pairwise_distances
import math
import operator
import matplotlib.pyplot as plt
from os import system
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
%matplotlib inline

#aspect categories; each name is a key of the per-cluster confidence dicts and
#also names a seed-word file ../../Misc/<aspect>.txt that is read further down
aspects = ['amenities', 'service', 'price', 'location']

In [None]:
#DISABLED CELL: the whole body below is a string literal, so nothing in it runs.
#If re-enabled it reads the stopword list from ../../Misc/stopwords.txt and
#rewrites ../../Misc/reviews.txt to ../../cleaned_reviews.txt, one review per
#line, with '...more' removed, periods turned into spaces, text lower-cased
#and stopwords dropped.
#NOTE(review): the stopword test is applied to the token before .strip() and
#'...more' is removed before lower-casing - confirm both are intended.
"""
##
#clean raw text
##

#Initialize the porter stemmer and stopwords
#stemmer = PorterStemmer()
with open('../../Misc/stopwords.txt', 'r') as f:
    stops = set()
    for line in f:
        stops.add(line.rstrip())

#clean the data and save to disk
with open('../../Misc/reviews.txt', 'r') as f:
    with open('../../cleaned_reviews.txt', 'w') as cleaned:
        for line in f:
            line = line.replace('...more', '')
            line = line.replace('.', ' ')
            #tokens = [ stemmer.stem(word) for word in line.lower().rstrip().split(' ') if word not in stops ]
            tokens = [ word.strip() for word in line.lower().rstrip().split(' ') if word not in stops ]
            cleaned.write('{}\n'.format(' '.join(tokens)))
"""

In [None]:
#DISABLED CELL: the whole body below is a string literal, so nothing in it runs.
#If re-enabled it streams ../../cleaned_reviews.txt line by line through the
#MySentences iterator, trains a 300-dimensional Word2Vec model with 4 workers,
#saves the model to ../../Models/w2v and its vectors to ../../Word_Vectors/wv,
#then deletes the model object.
#NOTE(review): MySentences.__iter__ opens the file without closing it, and
#newer gensim releases renamed the `size` argument - confirm against the
#installed gensim version before re-enabling.
"""
##
# Code for creating the model. After created, then just load the saved model
##

#Helpful iterator, credit here: https://rare-technologies.com/word2vec-tutorial/
class MySentences(object):
    
    def __init__(self, filename):
        self.filename = filename
 
    def __iter__(self):
        for line in open(self.filename):
            yield line.rstrip().split()

#train word2vec on cleaned data, but do so using the memory saving trick from the link above
sentences = MySentences('../../cleaned_reviews.txt')
model = Word2Vec(sentences, size = 300, workers = 4)
model.save('../../Models/w2v')

#Get vectors only from the model and save to disk
wv = model.wv
wv.save('../../Word_Vectors/wv')

#remove model from memory, we only need the wv's
del model
"""

In [None]:
#DISABLED CELL (string literal): would load the KeyedVectors saved at
#../../Word_Vectors/wv, i.e. the vectors written by the word2vec training cell.
"""
#load word vectors from initial word2vec training
wv = KeyedVectors.load('../../Word_Vectors/wv')
"""

In [None]:

#DISABLED CELL (string literal): would write one line per vocabulary word to
#../../Pre_Clustering/wvmaster.csv - the word followed by its vector elements.
#NOTE(review): despite the .csv extension the fields are separated by spaces.
"""#write master matrix to file
with open('../../Pre_Clustering/wvmaster.csv', 'w') as f:
    
    #save word and its vector disk
    for word, data in wv.vocab.items():
        f.write('{} '.format(word))
        f.write('{}\n'.format(' '.join([ str(element) for element in wv[word] ])))    
"""

In [None]:
#DISABLED CELL (string literal): would shell out via os.system to
#../../retrofitting/retrofit.py, passing wvmaster.csv as -i, seeds.txt as -l,
#10 as -n and retrofitted_dirty.txt as -o, then print the exit code.
#NOTE(review): the meaning of each flag is defined by retrofit.py, which is
#not visible here - presumably -n is the iteration count; confirm.
"""
#Run the retrofit program (runs in a separate subprocess)
print('Exit code: {}'.format(system(
    'python \
    ../../retrofitting/retrofit.py \
    -i ../../Pre_Clustering/wvmaster.csv \
    -l ../../Misc/seeds.txt \
    -n 10 \
    -o ../../Pre_Clustering/retrofitted_dirty.txt'
)))
"""

In [None]:
#DISABLED CELL (string literal): would read all of retrofitted_dirty.txt into
#memory and rewrite it in place with a '<vocab size> 300' header line first.
#NOTE(review): the header is written unconditionally, so running this twice
#would stack a second header; the hard-coded 300 has to match the vector size
#used when the model was trained.
"""
#output of retrofit doesn't have a header which we need for loading into gensim
with open('../../Pre_Clustering/retrofitted_dirty.txt', 'r') as f:
    lines = f.readlines()

with open('../../Pre_Clustering/retrofitted_dirty.txt', 'w') as f:
    
    f.write('{} {}\n'.format(len(wv.vocab), 300))
    
    for line in lines:
        f.write(line)

del lines
"""

In [None]:
#DISABLED CELL (string literal): would parse retrofitted_dirty.txt as a
#text-format word2vec file and re-save it as ../../Word_Vectors/retrofitted_wv,
#the file that the active loading cell reads.
"""
#load the retrofitted vectors in as a gensim object
wv = KeyedVectors.load_word2vec_format('../../Pre_Clustering/retrofitted_dirty.txt', binary = False)
wv.save('../../Word_Vectors/retrofitted_wv')
"""

In [None]:
#DISABLED CELL (string literal): would load ../../Word_Vectors/retrofitted_wv
#into wv; the same load is performed by the active cell that sets numk.
"""
#load the retrofitted wordvectors from file
wv = KeyedVectors.load('../../Word_Vectors/retrofitted_wv')
"""

In [None]:
#DISABLED CELL: the whole body below is a string literal, so nothing in it runs.
#If re-enabled it walks retrofitted_dirty.txt (skipping the header line), maps
#every word to wv.vocab[word].index and back, writes '<index> <vector...>'
#lines to retrofitted_clean.txt, and pickles both lookup dicts under
#../../Vector_Tracking/.
#NOTE(review): wv.vocab is the older gensim vocabulary API - confirm it exists
#in the installed gensim version before re-enabling.
"""
#create the vocab->index, index->vocab dictionary, and indexed word vectors and save all to disk
vocab2index = {}
index2vocab = {}

with open('../../Pre_Clustering/retrofitted_dirty.txt', 'r') as f: 
    
    lines = f.readlines()
    
    with open('../../Pre_Clustering/retrofitted_clean.txt', 'w') as o:
        
        for line in lines[1:]:
            
            #get the word and its vector separately
            splits = line.rstrip().split(' ')
            word = splits[0]
            vector = splits[1:]
            
            #build the vocab dictionaries
            vocab2index[word] = wv.vocab[word].index
            index2vocab[ vocab2index[word] ] = word
            
            #save the indexed vectors to file for loading later
            o.write('{} '.format(vocab2index[word]))
            o.write('{}\n'.format(' '.join(vector)))
    
    del lines

with open('../../Vector_Tracking/vocab2index.pkl', 'wb') as f:
    pickle.dump(vocab2index, f, pickle.HIGHEST_PROTOCOL)
    
with open('../../Vector_Tracking/index2vocab.pkl', 'wb') as f:
    pickle.dump(index2vocab, f, pickle.HIGHEST_PROTOCOL)
"""

In [None]:
#number of k-means clusters; also selects which cached files later cells open
numk = 1000

def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

#retrofitted word vectors, stored as a gensim KeyedVectors object
wv = KeyedVectors.load('../../Word_Vectors/retrofitted_wv')

#lookup tables in both directions: vocab -> index and index -> vocab
vocab2index = _unpickle('../../Vector_Tracking/vocab2index.pkl')
index2vocab = _unpickle('../../Vector_Tracking/index2vocab.pkl')

#master numpy matrix: column 0 holds the word index, the remaining columns the word vector
X = np.loadtxt('../../Pre_Clustering/retrofitted_clean.txt', delimiter = ' ')

In [None]:
#DISABLED CELL (string literal): would fit KMeans with numk clusters and
#random_state 0 on X[:, 1:] (every column except the leading word index) and
#pickle the fitted estimator to ../../KMeans/kmeans_<numk>.pkl.
#NOTE(review): newer scikit-learn releases no longer accept n_jobs on KMeans -
#confirm against the installed version before re-enabling.
"""
#run k means
kmeans = KMeans(n_clusters = numk, random_state = 0, n_jobs = 4).fit(X[:, 1:])

#save kmeans
with open('../../KMeans/kmeans_{}.pkl'.format(numk), 'wb') as f:
    pickle.dump(kmeans, f, pickle.HIGHEST_PROTOCOL)
"""

In [None]:
#DISABLED CELL: the whole body below is a string literal, so nothing in it runs.
#If re-enabled it loads the pickled KMeans for this numk, pairs row i of X with
#kmeans.labels_[i] (the word is looked up through the index in X[i, 0]), groups
#the words by cluster label, orders each group by wv.vocab[word].count with the
#highest count first, and pickles the result as fullbags_<numk>.pkl - the file
#the next active cell loads.
"""
#load kmeans
with open('../../KMeans/kmeans_{}.pkl'.format(numk), 'rb') as f:
    kmeans = pickle.load(f)

#attach words to labels
clustered_words = {}
for i, label in enumerate(kmeans.labels_):
    clustered_words[ index2vocab[int(X[i, 0])] ] = label

#group words by their labels
fullbags = [ [] for i in range(numk) ]
for k, v in clustered_words.items():
    fullbags[int(v)].append( (k, wv.vocab[k].count) )

#Sort each cluster
for i, bag in enumerate(fullbags):
    fullbags[i] = [ item[0] for item in sorted(bag, key = operator.itemgetter(1), reverse = True) ]

with open('../../KMeans/fullbags_{}.pkl'.format(numk), 'wb') as f:
    pickle.dump(fullbags, f, pickle.HIGHEST_PROTOCOL)
"""

In [None]:
#read the pickled clusters for this numk; each entry of fullbags is one cluster's word list
bags_path = '../../KMeans/fullbags_{}.pkl'.format(numk)
with open(bags_path, 'rb') as handle:
    fullbags = pickle.load(handle)

#wordbags keeps only the first 20 words of every cluster
wordbags = []
for words in fullbags:
    wordbags.append(words[:20])

#reverse lookup: word -> id of the cluster that holds it
inversefull = { word: cluster_id for cluster_id, words in enumerate(fullbags) for word in words }

In [None]:
#using gensim's n_similarity method, score every cluster against every aspect:
#the score is the similarity between the aspect seed words that occur in the
#cluster and all of the words in the cluster (0.0 when no seed word occurs)
confidence = [ { aspect: 0.0 for aspect in aspects } for i in range(numk) ]

#the clusters do not depend on the aspect, so build each cluster's word set
#(for the overlap test) and sorted word list (for n_similarity) only once
clusterwords = []
for bag in fullbags:
    unique = set(bag)
    clusterwords.append( (unique, sorted(unique)) )

for aspect in aspects:

    #read the seed words first so the file is closed before the slow similarity pass
    with open('../../Misc/{}.txt'.format(aspect), 'r') as f:
        seeds = set(line.rstrip().lower() for line in f)

    for i, (setbag, listbag) in enumerate(clusterwords):

        overlap = seeds & setbag

        if overlap:
            #n_similarity is documented for lists of words; sorted lists also keep
            #the averaging order (and so the float result) identical between runs
            confidence[i][aspect] = wv.n_similarity(sorted(overlap), listbag)

In [None]:
#identify the aspect clusters
aspectclusters = [ 'other' for i in range(numk) ]
for i, c in enumerate(confidence):
    for aspect in aspects:
        if c[aspect] > 0:
            aspectclusters[i] = aspect

In [None]:
#DISABLED CELL: the whole body below is a string literal, so nothing in it runs.
#If re-enabled it reads 200000 lines from ../../Misc/reviews.txt, splits each
#review into sentences on '.', drops stopwords and words missing from wv, and
#appends one list per review to docs; every sentence becomes a dict with keys
#'docid', 'sentence', 'score' (VADER polarity_scores output), 'cluster' and
#'aspect' (the last two start empty).
#NOTE(review): readline() is called exactly 200000 times regardless of file
#length, so a shorter file would append empty lists for the missing reviews;
#the stopword test is also applied before .strip() - confirm both.
"""
#Master dictionary that contains every review broken down by sentence, the cluster(s) that
#sentence belongs to, the aspect(s) present in the sentence, and the sentiment score of the sentence
docs = []
analyzer = SentimentIntensityAnalyzer()

with open('../../Misc/stopwords.txt', 'r') as f:
    stops = set()
    for line in f:
        stops.add(line.rstrip())

with open('../../Misc/reviews.txt') as f:
    
    for i in range(200000):
        line = f.readline().rstrip().lower()
        line = line.replace('...more', '')
        
        #reviews are split on periods
        sentences = [ sentence.strip() for sentence in line.split('.') ]
        
        #remove stopwords and non-vocab words from the sentence
        for j, sentence in enumerate(sentences):
            sentences[j] = ' '.join([ word.strip() for word in sentence.split(' ') if word not in stops and wv.__contains__(word.strip()) ])
        
        #remove empty sentences (ie: someone used an isolated period or an ellipsis)
        sentences = [ sentence for sentence in sentences if sentence != '' ]
        
        #each element of the docs list is a sentence
        docs.append([ { 'docid': i, 'sentence': sentence, 'score': analyzer.polarity_scores(sentence), 'cluster': [], 'aspect': [] } for sentence in sentences ])
"""

In [None]:
#DISABLED CELL: the whole body below is a string literal, so nothing in it runs.
#If re-enabled it fills in 'cluster' and 'aspect' for every sentence in docs:
#the words most similar to the sentence are looked up in inversefull, each
#cluster hit is stored as (cluster id, share of the similar words), and those
#shares are summed per aspect through aspectclusters. The finished docs list
#is pickled to aspect_dictionary_<numk>.pkl, which the next active cell loads.
#NOTE(review): inversefull[word[0]] assumes every returned word belongs to
#some cluster - confirm, otherwise this raises KeyError.
"""
#here, we identify which clusters and aspects that correlate to the sentence
for doc in docs:
    for sentdata in doc:
        
        #returns top 10 most similar words from the vocab to each word in the sentence
        similars = wv.most_similar(positive = sentdata['sentence'].split(' '))
        
        #this will track which clusters the sentence likely belongs
        rank = [ 0 for i in range(numk) ]
        for word in similars:
            rank[ inversefull[word[0]] ] += 1
        
        #the cluster data is a list of tuples each containing the cluster
        #and the ratio of words from the similar words in that cluster with
        #the number of similar words
        for i, c in enumerate(rank):
            if c > 0:
                sentdata['cluster'].append( (i, c / float(len(similars))) )
        
        #the aspects, like the clusters, are a list of tuples
        asptotals = { aspect: 0.0 for aspect in aspects }
        for c in sentdata['cluster']:
            if aspectclusters[ c[0] ] != 'other':
                asptotals[ aspectclusters[ c[0] ] ] += c[1]

        sentdata['aspect'] = [ (k, v) for k, v in asptotals.items() ]

#save to disk
with open('../../Aspect_Dictionary/aspect_dictionary_{}.pkl'.format(numk), 'wb') as f:
    pickle.dump(docs, f, pickle.HIGHEST_PROTOCOL)
"""

In [None]:
#load the dictionary from file
#read back the pickled per-review sentence records saved for this value of numk
docs_path = '../../Aspect_Dictionary/aspect_dictionary_{}.pkl'.format(numk)
with open(docs_path, 'rb') as handle:
    docs = pickle.load(handle)

In [None]:
#print the sentences and their aspects if they were classified as having one
#show every sentence that carries at least one aspect with a weight above zero,
#along with those (aspect, weight) pairs and the sentence's 'score' entry
for review in docs:
    for sentdata in review:
        found = [ pair for pair in sentdata['aspect'] if pair[1] > 0 ]
        if found:
            print('Aspect(s): {}\nScore: {}\n{}\n'.format(found, sentdata['score'], sentdata['sentence']))

// Begin gross k scoring //

In [None]:
#DISABLED CELL (string literal): would read ../../Cluster_Metrics/overlaps.txt,
#skip its first line, and split the remaining lines into groups, starting a
#new group at every line that begins with '###'. The groups end up in
#`clusters` as lists of right-stripped lines.
#NOTE(review): presumably each group holds the metrics for one value of k -
#the file format is not visible here; confirm.
"""
#read each cluster's data into memory
with open('../../Cluster_Metrics/overlaps.txt', 'r') as f:
    
    lines = f.readlines()
    clusters = []
    temp = []
    for i in range(1, len(lines)):
        
        if lines[i].rstrip()[0:3] == '###':
            clusters.append(temp)
            temp = []
        else:
            temp.append(lines[i].rstrip())
    
    clusters.append(temp)
"""

In [None]:
#DISABLED CELL (string literal): would turn `clusters` into the nested dict
#kdata[group index][first comma field][name] = float value. Every later comma
#field is split on a single space; the name is the first piece with its last
#character removed and the value is the second piece.
#NOTE(review): presumably the removed character is a trailing ':' - confirm
#against the file format.
"""
kdata = {}
for i, c in enumerate(clusters):
    
    kdata[i] = {}
    
    for line in c:
        data = line.split(',')
        kdata[i][ data[0] ] = {}
        for entry in data[1:]:
            apct = entry.split(' ')
            kdata[i][ data[0] ][ apct[0][0:-1] ] = float(apct[1])
"""

In [None]:
#DISABLED CELL (string literal): would compute one score per group in kdata -
#the mean, over that group's entries, of the mean of each entry's values.
#NOTE(review): a group with no entries, or an entry with no values, would
#divide by zero - confirm the input never contains either.
"""
scores = [ 0.0 for i in range(len(clusters)) ]

for k, v in kdata.items():
    
    score = 0.0
    length = 0
    for a, b in v.items():
        nums = [ float(d) for c, d in b.items() ]
        score += sum(nums) / float(len(nums))
        length += 1
    score = score / length
    
    scores[k] = score
"""

In [None]:
#DISABLED CELL (string literal): would print every score labelled with
#(position + 1) * 100.
#NOTE(review): presumably the label is the k value the group was computed
#for (100, 200, ...) - confirm.
"""
for i, score in enumerate(scores):
    print('{}: {}'.format((i + 1) * 100, score))
"""

// end gross k scoring //

// begin seed list creation helpers //

In [None]:
#DISABLED CELL (string literal): would unpickle ../../Thesaurus/thesaurus.pkl
#into `thesaurus`.
"""
with open('../../Thesaurus/thesaurus.pkl', 'rb') as f:
    thesaurus = pickle.load(f)
"""

In [None]:
#DISABLED CELL (string literal): would build intSet, the lower-cased entries
#obtained by iterating `thesaurus` that also appear (lower-cased) in wv.vocab.
"""
synSet = set()
for k in thesaurus:
    synSet.add(k.lower())
    
vocabSet = set()
for k in wv.vocab:
    vocabSet.add(k.lower())
    
intSet = synSet & vocabSet
"""

In [None]:
#DISABLED CELL (string literal): would build thesausets, one set per cluster
#in fullbags, holding only the cluster words that are also in intSet.
"""
thesausets = [ set(ws) for ws in fullbags ]
for i in range(len(thesausets)):
    thesausets[i] = thesausets[i] & intSet
"""

// end seed list creation helpers //