In [1]:
import math
import operator
import pickle
import subprocess
from os import system

import matplotlib.pyplot as plt
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.word2vec import Word2Vec
#from gensim.parsing.porter import PorterStemmer
#from nltk.corpus import stopwords
from sklearn.cluster import KMeans
#from sklearn.metrics import pairwise_distances

%matplotlib inline

In [None]:
"""
##
#clean raw text
##

#Initialize the porter stemmer and stopwords
#stemmer = PorterStemmer()
with open('../../Misc/stopwords.txt', 'r') as f:
    stops = []
    for line in f:
        stops.append(line.rstrip())
    stops = set(stops)

#clean the data and save to disk
with open('../../Misc/reviews.txt', 'r') as f:
    with open('../../cleaned_reviews.txt', 'w') as cleaned:
        for line in f:
            line = line.replace('...more', '')
            line = line.replace('.', ' ')
            #tokens = [ stemmer.stem(word) for word in line.lower().rstrip().split(' ') if word not in stops ]
            tokens = [ word.strip() for word in line.lower().rstrip().split(' ') if word not in stops ]
            cleaned.write('{}\n'.format(' '.join(tokens)))
"""
print('Comments')

In [None]:
"""
##
# Code for creating the model. After created, then just load the saved model
##

#Helpful iterator, credit here: https://rare-technologies.com/word2vec-tutorial/
class MySentences(object):
    
    def __init__(self, filename):
        self.filename = filename
 
    def __iter__(self):
        for line in open(self.filename):
            yield line.rstrip().split()

#train word2vec on cleaned data, but do so using the memory saving trick from the link above
sentences = MySentences('../../cleaned_reviews.txt')
model = Word2Vec(sentences, size = 500, workers = 4)
model.save('../../Models/w2v')

#Get vectors only from the model and save to disk
wv = model.wv
wv.save('../../Word_Vectors/wv')
"""
print('Comments')

In [2]:
# Restore the word vectors that the initial word2vec training run saved to disk.
wv = KeyedVectors.load(
    '../../Word_Vectors/wv'
)

In [None]:
"""
#write master matrix to file
with open('../../Pre_Clustering/wvmaster.csv', 'w') as f:

    #save word and its vector disk
    for word, data in wv.vocab.items():
        f.write('{} '.format(word))
        f.write('{}\n'.format(' '.join([ str(element) for element in wv[word] ])))
"""        
print('Comments')

In [3]:
#Run the retrofit program (runs in a separate subprocess).
#
#The command is passed as an argument list, so no shell is involved and the
#paths are handed to retrofit.py exactly as written here.
#
#subprocess.call returns the child's real exit code. os.system, which this cell
#used before, returns the encoded wait status on POSIX (e.g. 256 for exit
#code 1), which made the "Exit code" message misleading on failure.
#
#Note: if no `python` executable can be found, subprocess.call raises OSError
#instead of reporting a non-zero status.
retrofit_command = [
    'python',
    '../../retrofitting/retrofit.py',
    '-i', '../../Pre_Clustering/wvmaster.csv',
    '-l', '../../Misc/seeds.txt',
    '-n', '10',
    '-o', '../../Pre_Clustering/retrofitted_dirty.txt',
]
print('Exit code: {}'.format(subprocess.call(retrofit_command)))

Exit code: 0


In [4]:
# Walk the retrofitted output once and derive three artefacts from it:
#   * vocab2index: word -> index of that word in the loaded word vectors
#   * index2vocab: the reverse lookup
#   * retrofitted_clean.txt: the same vectors with the word replaced by its index
vocab2index = {}
index2vocab = {}

with open('../../Pre_Clustering/retrofitted_dirty.txt', 'r') as dirty, \
     open('../../Pre_Clustering/retrofitted_clean.txt', 'w') as clean:

    for row in dirty:

        # first field is the word, everything after it is the vector
        fields = row.rstrip().split(' ')
        word, vector = fields[0], fields[1:]

        # record the word under the index the word-vector vocabulary assigns to it
        vocab2index[word] = wv.vocab[word].index
        index2vocab[ vocab2index[word] ] = word

        # "<index> <v1> <v2> ..." so the file can be loaded as a numeric matrix later
        clean.write('{} {}\n'.format(vocab2index[word], ' '.join(vector)))

# persist both lookup tables for later cells
for pkl_path, mapping in (
    ('../../Vector_Tracking/vocab2index.pkl', vocab2index),
    ('../../Vector_Tracking/index2vocab.pkl', index2vocab),
):
    with open(pkl_path, 'wb') as out:
        pickle.dump(mapping, out, pickle.HIGHEST_PROTOCOL)

In [5]:
# Reload the lookup tables written to ../../Vector_Tracking/ so this cell can
# be the entry point when the files already exist.

# word -> index
with open('../../Vector_Tracking/vocab2index.pkl', 'rb') as vocab_file:
    vocab2index = pickle.load(vocab_file)

# index -> word
with open('../../Vector_Tracking/index2vocab.pkl', 'rb') as index_file:
    index2vocab = pickle.load(index_file)

# Master matrix: column 0 holds the word index, the remaining columns the vector.
X = np.loadtxt(
    '../../Pre_Clustering/retrofitted_clean.txt',
    delimiter = ' ',
)

In [6]:
# Fit one k-means model per cluster count (10, 20, ..., 100) on the vector
# columns of X (column 0 is the word index and is left out), pickling each
# fitted model under ../../KMeans/.
for k in range(10, 110, 10):

    # fixed random_state keeps the clustering repeatable between runs
    kmeans = KMeans(n_clusters = k, random_state = 0, n_jobs = 4)
    kmeans.fit(X[:, 1:])

    model_path = '../../KMeans/kmeans_{}.pkl'.format(k)
    with open(model_path, 'wb') as model_file:
        pickle.dump(kmeans, model_file, pickle.HIGHEST_PROTOCOL)

#print('Comments')

In [8]:
#For every saved k-means model (k = 10, 20, ..., 100): load it, group the
#vocabulary by cluster label, order each cluster by corpus frequency, and save
#  * fullbags_<k>.pkl - every word of every cluster, most frequent first
#  * wordbags_<k>.pkl - the same clusters trimmed to their top 20 words
#
#The number of bags comes from the model itself (kmeans.n_clusters). It used to
#be hard-coded to 100, which padded smaller clusterings with empty bags and
#would raise IndexError for any model with more than 100 clusters.

for j in range(10, 110, 10):

    #load kmeans
    with open('../../KMeans/kmeans_{}.pkl'.format(j), 'rb') as f:
        kmeans = pickle.load(f)

    #attach words to labels (row i of X carries the word index in column 0)
    clustered_words = {}
    for i, label in enumerate(kmeans.labels_):
        clustered_words[ index2vocab[int(X[i, 0])] ] = label

    #group words by their labels, keeping each word's corpus count for sorting
    fullbags = [ [] for _ in range(kmeans.n_clusters) ]
    for k, v in clustered_words.items():
        fullbags[int(v)].append( (k, wv.vocab[k].count) )

    #Sort each cluster by descending count and keep only the words
    fullbags = [
        [ item[0] for item in sorted(bag, key = operator.itemgetter(1), reverse = True) ]
        for bag in fullbags
    ]

    #trim to top 20.
    wordbags = [ bag[0:20] for bag in fullbags ]

    with open('../../KMeans/fullbags_{}.pkl'.format(j), 'wb') as f:
        pickle.dump(fullbags, f, pickle.HIGHEST_PROTOCOL)

    #save trimmed clusters so we don't have to do anything above again
    with open('../../KMeans/wordbags_{}.pkl'.format(j), 'wb') as f:
        pickle.dump(wordbags, f, pickle.HIGHEST_PROTOCOL)

#print('Comments')

In [None]:
#load the wordbags and fullbags from file
#
#The cell that builds the bags saves one pair of files per cluster count
#(wordbags_<k>.pkl / fullbags_<k>.pkl); no cell in this notebook writes the
#un-suffixed wordbags.pkl / fullbags.pkl this cell used to read, so it failed
#on a fresh run. k = 100 is the last clustering that cell produces, i.e. the
#same bags that are left in memory after running it.
bags_k = 100

with open('../../KMeans/wordbags_{}.pkl'.format(bags_k), 'rb') as f:
    wordbags = pickle.load(f)

with open('../../KMeans/fullbags_{}.pkl'.format(bags_k), 'rb') as f:
    fullbags = pickle.load(f)

In [None]:
# Load the pickled thesaurus; later cells iterate its keys and index it by word.
with open('../../Thesaurus/thesaurus.pkl', 'rb') as thesaurus_file:
    thesaurus = pickle.load(thesaurus_file)

In [None]:
# Lower-cased thesaurus keys.
synSet = { key.lower() for key in thesaurus }

# Lower-cased word-vector vocabulary.
vocabSet = { token.lower() for token in wv.vocab }

# Words present in both the thesaurus and the vocabulary.
intSet = synSet.intersection(vocabSet)

In [None]:
# One set per cluster: the cluster's words that are also in intSet.
thesausets = [ set(ws) & intSet for ws in fullbags ]

In [None]:
def clusters_containing(seed_path, bags):
    """Return the indices of the bags that contain at least one seed word.

    Parameters
    ----------
    seed_path : str
        Text file with one seed word per line; each line is right-stripped
        and lower-cased before matching.
    bags : list of list of str
        Word clusters; the position of a bag in this list is the index
        reported for it.

    Returns
    -------
    set of int
        Index of every bag holding one or more of the seed words.
    """
    with open(seed_path, 'r') as seed_file:
        # normalise each seed once instead of once per bag
        seeds = { line.rstrip().lower() for line in seed_file }

    # set-based overlap test replaces a linear `in` scan of every bag per seed
    return { idx for idx, bag in enumerate(bags) if not seeds.isdisjoint(bag) }


# Cluster indices touched by each aspect's seed words.
amenities = clusters_containing('../../Misc/amenities.txt', fullbags)
services = clusters_containing('../../Misc/services.txt', fullbags)
price = clusters_containing('../../Misc/price.txt', fullbags)
location = clusters_containing('../../Misc/location.txt', fullbags)

In [None]:
# Print the trimmed word bag of every cluster index collected in `price`,
# each followed by a blank line.
for cluster_id in price:
    print(wordbags[cluster_id], '\n')

// helpers for creating seed lists //

In [None]:
# Seed words for the amenities aspect: every word listed in amenities.txt plus,
# where the thesaurus has an entry for it, that entry's synonyms.
amenities = set()
with open('../../Misc/amenities.txt', 'r') as f:

    for line in f:

        word = line.rstrip().lower()

        # skip blank lines so the empty string never becomes a seed word
        if not word:
            continue

        amenities.add(word)

        # Test the thesaurus itself rather than synSet: synSet holds lower-cased
        # keys, so a key stored with capitals would pass a synSet check and then
        # raise KeyError on thesaurus[word]. Such keys are now skipped.
        if word in thesaurus:
            amenities = amenities | thesaurus[word]

In [None]:
# Seed words for the services aspect: every word listed in services.txt plus,
# where the thesaurus has an entry for it, that entry's synonyms.
services = set()
with open('../../Misc/services.txt', 'r') as f:

    for line in f:

        word = line.rstrip().lower()

        # skip blank lines so the empty string never becomes a seed word
        if not word:
            continue

        services.add(word)

        # Test the thesaurus itself rather than synSet: synSet holds lower-cased
        # keys, so a key stored with capitals would pass a synSet check and then
        # raise KeyError on thesaurus[word]. Such keys are now skipped.
        if word in thesaurus:
            services = services | thesaurus[word]

In [None]:
# Seed words for the price aspect: every word listed in price.txt plus, where
# the thesaurus has an entry for it, that entry's synonyms.
price = set()
with open('../../Misc/price.txt', 'r') as f:

    for line in f:

        word = line.rstrip().lower()

        # skip blank lines so the empty string never becomes a seed word
        if not word:
            continue

        price.add(word)

        # Test the thesaurus itself rather than synSet: synSet holds lower-cased
        # keys, so a key stored with capitals would pass a synSet check and then
        # raise KeyError on thesaurus[word]. Such keys are now skipped.
        if word in thesaurus:
            price = price | thesaurus[word]

In [None]:
# Seed words for the location aspect: every word listed in location.txt plus,
# where the thesaurus has an entry for it, that entry's synonyms.
location = set()
with open('../../Misc/location.txt', 'r') as f:

    for line in f:

        word = line.rstrip().lower()

        # skip blank lines so the empty string never becomes a seed word
        if not word:
            continue

        location.add(word)

        # Test the thesaurus itself rather than synSet: synSet holds lower-cased
        # keys, so a key stored with capitals would pass a synSet check and then
        # raise KeyError on thesaurus[word]. Such keys are now skipped.
        if word in thesaurus:
            location = location | thesaurus[word]

In [None]:
# Write the seed lexicon to ../../Misc/seeds.txt.
#
# The groups are written in the order amenities, services, price, location.
# Each word of a group gets its own line: the word itself first, then every
# other word of the same group. Every token is followed by a single space, so
# each line ends with a trailing space before the newline.
with open('../../Misc/seeds.txt', 'w') as f:

    for group in (amenities, services, price, location):

        for w1 in group:

            neighbours = [ w2 for w2 in group if w2 != w1 ]
            tokens = [ w1 ] + neighbours

            f.write(''.join('{} '.format(token) for token in tokens))
            f.write('\n')