In [1]:
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pickle
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import math
import operator
from os import system
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import codecs

#review aspects; each name is used for the per-aspect seed file ('../../Misc/<aspect>.txt')
#and as a key of the per-aspect dictionaries built in later cells
aspects = [
    'amenities',
    'service',
    'price',
    'location',
]

In [None]:
"""
##
#clean raw text
##

#Initialize stopwords and regex
stops = set(stopwords.words('english'))
regex = re.compile(r'[^a-zA-Z0-9\s]|[\_\^\`\[\]\\]', re.IGNORECASE)

#clean the data and save to disk
with codecs.open('../../Misc/reviews.txt', 'r', encoding = 'utf-8', errors = 'ignore') as f:
    with open('../../cleaned_reviews.txt', 'w') as cleaned:
        
        for line in f:

            #remove non-alphanumeric symbols
            line = regex.sub(' ', line.lower().rstrip())

            #split into tokens and ignore stopwords
            tokens = [ word.strip() for word in line.split(' ') if word not in stops ]

            #remove empty elements from the list and stem
            tokens = [ word for word in tokens if word != '' ]

            #ignore first two elements because they're just identifiers
            tokens = tokens[2:]

            #write cleaned data to file
            if len(tokens) > 0:
                cleaned.write('{}\n'.format(' '.join(tokens)))
"""

In [None]:
"""
##
# Code for creating the model. After created, then just load the saved model
##

#Helpful iterator, credit here: https://rare-technologies.com/word2vec-tutorial/
class MySentences(object):
    
    def __init__(self, filename):
        self.filename = filename
 
    def __iter__(self):
        for line in open(self.filename):
            yield line.rstrip().split()

#train word2vec on cleaned data, but do so using the memory saving trick from the link above
sentences = MySentences('../../cleaned_reviews.txt')
model = Word2Vec(sentences, size = 300, workers = 4)
model.save('../../Models/w2v')

#Get vectors only from the model and save to disk
wv = model.wv
wv.save('../../Word_Vectors/wv')

#remove model from memory, we only need the wv's
del model
"""

In [2]:
#load word vectors from initial word2vec training
#(a KeyedVectors object saved earlier; the path is relative to the notebook's working directory)
#NOTE(review): a later cell loads this same file into wv again -- confirm which load is meant to be authoritative
wv = KeyedVectors.load('../../Word_Vectors/wv')


In [None]:
"""
#write master matrix to file
with open('../../Pre_Clustering/wvmaster.csv', 'w') as f:
    
    #save each word and its vector to disk
    for word, data in wv.vocab.items():
        f.write('{} '.format(word))
        f.write('{}\n'.format(' '.join([ str(element) for element in wv[word] ])))    

#set up the index <-> vocab maps
index2vocab = {}
vocab2index = {}

for word in wv.vocab:
    vocab2index[word] = wv.vocab[word].index
    index2vocab[ vocab2index[word] ] = word
"""

Begin default clustering (for seed word creation)

In [None]:
"""
#write the pre_retrofitted matrix to file
with open('../../Pre_Clustering/default_matrix.csv', 'w') as f:
    
    for word in wv.vocab:
        f.write('{} {}\n'.format(vocab2index[word], ' '.join([ str(element) for element in wv[word] ])))
"""

In [None]:
"""
#load the default matrix from file
X = np.loadtxt('../../Pre_Clustering/default_matrix.csv', delimiter = ' ')
"""

In [None]:
"""
#use clusters to help find seed words
kmeans = KMeans(n_clusters = 100, random_state = 0, n_jobs = 4).fit(X[:, 1:])

#save kmeans
with open('../../KMeans/kmeans_default.pkl', 'wb') as f:
    pickle.dump(kmeans, f, pickle.HIGHEST_PROTOCOL)
"""

In [None]:
"""
#load kmeans
with open('../../KMeans/kmeans_default.pkl', 'rb') as f:
    kmeans = pickle.load(f)

#attach words to labels
clustered_words = {}
for i, label in enumerate(kmeans.labels_):
    clustered_words[ index2vocab[int(X[i, 0])] ] = label

#group words by their labels
fullbags = [ [] for i in range(100) ]
for k, v in clustered_words.items():
    fullbags[int(v)].append( (k, wv.vocab[k].count) )

#Sort each cluster
for i, bag in enumerate(fullbags):
    fullbags[i] = [ item[0] for item in sorted(bag, key = operator.itemgetter(1), reverse = True) ]

with open('../../KMeans/fullbags_default.pkl', 'wb') as f:
    pickle.dump(fullbags, f, pickle.HIGHEST_PROTOCOL)
"""

In [None]:
"""
#help finding seed words by using default clusters
aspect = 'price'
with open('../../Misc/{}.txt'.format(aspect), 'r') as f:
    seeds = set([ word.strip() for word in f.readlines() ])

with open('../../Misc/{}.txt'.format(aspect), 'w') as f:
    
    for bag in fullbags:
        
        if 'dollar' in bag:
            seeds |= set(bag)
            
    for word in sorted(seeds):
        f.write('{}\n'.format(word))
"""

In [None]:
"""
#trim seed word list to top length by frequency count
length = 100
with open('../../Misc/{}.txt'.format(aspect), 'r') as f:
    seeds = [ word.strip() for word in f.readlines() if wv.__contains__(word.strip()) ]
    seeds = sorted(([ item[0] for item in sorted([ (word, wv.vocab[word].count) for word in seeds ], key = operator.itemgetter(1), reverse = True) ])[:length])

with open('../../Misc/{}.txt'.format(aspect), 'w') as f:
    for word in seeds:
        f.write('{}\n'.format(word))
"""

In [None]:
"""
with open('../../Misc/seeds.txt', 'w') as s:
    
    for aspect in aspects:
        with open('../../Misc/{}.txt'.format(aspect), 'r') as f:
            
            aspectseeds = [ word.rstrip() for word in f.readlines() ]
            for i in range(len(aspectseeds)):
                
                s.write('{} '.format(aspectseeds[i]))
                
                for j in range(len(aspectseeds)):
                    
                    if i != j:
                        s.write('{} '.format(aspectseeds[j]))
                
                s.write('\n')
"""

End default clustering

In [None]:
"""
#Run the retrofit program (runs in a separate subprocess)
print('Exit code: {}'.format(system(
    'python \
    ../../retrofitting/retrofit.py \
    -i ../../Pre_Clustering/wvmaster.csv \
    -l ../../Misc/seeds.txt \
    -n 10 \
    -o ../../Pre_Clustering/retrofitted_dirty.txt'
)))
"""

In [None]:
"""
#output of retrofit doesn't have a header which we need for loading into gensim
with open('../../Pre_Clustering/retrofitted_dirty.txt', 'r') as f:
    lines = f.readlines()

with open('../../Pre_Clustering/retrofitted_dirty.txt', 'w') as f:
    
    f.write('{} {}\n'.format(len(wv.vocab), 300))
    
    for line in lines:
        f.write(line)

del lines
"""

In [None]:
"""
#load the retrofitted vectors in as a gensim object
wv = KeyedVectors.load_word2vec_format('../../Pre_Clustering/retrofitted_dirty.txt', binary = False)
wv.save('../../Word_Vectors/retrofitted_wv')
"""

In [None]:
"""
#load the retrofitted wordvectors from file
wv = KeyedVectors.load('../../Word_Vectors/retrofitted_wv')
"""

In [None]:
"""
#create the vocab->index, index->vocab dictionary, and indexed word vectors and save all to disk
vocab2index = {}
index2vocab = {}

with open('../../Pre_Clustering/retrofitted_dirty.txt', 'r') as f: 
    
    lines = f.readlines()
    
    with open('../../Pre_Clustering/retrofitted_clean.txt', 'w') as o:
        
        for line in lines[1:]:
            
            #get the word and its vector separately
            splits = line.rstrip().split(' ')
            word = splits[0]
            vector = splits[1:]
            
            #build the vocab dictionaries
            vocab2index[word] = wv.vocab[word].index
            index2vocab[ vocab2index[word] ] = word
            
            #save the indexed vectors to file for loading later
            o.write('{} '.format(vocab2index[word]))
            o.write('{}\n'.format(' '.join(vector)))
    
    del lines

with open('../../Vector_Tracking/vocab2index.pkl', 'wb') as f:
    pickle.dump(vocab2index, f, pickle.HIGHEST_PROTOCOL)
    
with open('../../Vector_Tracking/index2vocab.pkl', 'wb') as f:
    pickle.dump(index2vocab, f, pickle.HIGHEST_PROTOCOL)
"""

In [None]:
"""
#Build dictionaries from defaults
vocab2index = {}
index2vocab = {}

for word in wv.vocab:
    vocab2index[word] = wv.vocab[word].index
    index2vocab[ vocab2index[word] ] = word
    
with open('../../Vector_Tracking/vocab2index.pkl', 'wb') as f:
    pickle.dump(vocab2index, f, pickle.HIGHEST_PROTOCOL)
    
with open('../../Vector_Tracking/index2vocab.pkl', 'wb') as f:
    pickle.dump(index2vocab, f, pickle.HIGHEST_PROTOCOL)
"""

In [None]:
#how many k (number of k-means clusters; also used to name the kmeans_/fullbags_ pickle files)
numk = 900

#load the retrofitted wordvectors from file
#(the retrofitted load is commented out; the plain word2vec vectors are the ones actually loaded)
#wv = KeyedVectors.load('../../Word_Vectors/retrofitted_wv')
wv = KeyedVectors.load('../../Word_Vectors/wv')

#vocab -> index
#NOTE: pickle.load can execute arbitrary code; only load files produced by this project
with open('../../Vector_Tracking/vocab2index.pkl', 'rb') as f:
    vocab2index = pickle.load(f)

#index -> vocab
with open('../../Vector_Tracking/index2vocab.pkl', 'rb') as f:
    index2vocab = pickle.load(f)

#master numpy matrix with index as first column and word vector as the rest
#(as above, the retrofitted matrix is commented out and the default matrix is loaded)
#X = np.loadtxt('../../Pre_Clustering/retrofitted_clean.txt', delimiter = ' ')
X = np.loadtxt('../../Pre_Clustering/default_matrix.csv', delimiter = ' ')

/// For gross scoring ///

In [None]:
"""
#find the best scoring k from 100 - 1000 (every 100)
for numk in range(100, 1100, 100):
    
    #kmeans model
    kmeans = KMeans(n_clusters = numk, random_state = 0, n_jobs = 4).fit(X[:, 1:])
    
    #save kmeans
    with open('../../KMeans/kmeans_{}.pkl'.format(numk), 'wb') as f:
        pickle.dump(kmeans, f, pickle.HIGHEST_PROTOCOL)

    #attach words to labels
    clustered_words = {}
    for i, label in enumerate(kmeans.labels_):
        clustered_words[ index2vocab[int(X[i, 0])] ] = label

    #group words by their labels
    fullbags = [ [] for i in range(numk) ]
    for k, v in clustered_words.items():
        fullbags[int(v)].append( (k, wv.vocab[k].count) )

    #Sort each cluster
    for i, bag in enumerate(fullbags):
        fullbags[i] = [ item[0] for item in sorted(bag, key = operator.itemgetter(1), reverse = True) ]
    
    #save clustered words
    with open('../../KMeans/fullbags_{}.pkl'.format(numk), 'wb') as f:
        pickle.dump(fullbags, f, pickle.HIGHEST_PROTOCOL)

    #score for this k
    kscore = 0.0
    numclusters = 0

    #score the clustering
    for i, bag in enumerate(fullbags):
        setbag = set(bag)
        clusterscore = 0.0
        numaspects = 0

        #go through every aspect
        for aspect in aspects:
            with open('../../Misc/{}.txt'.format(aspect), 'r') as f:

                #seed words
                temp = set([ line.rstrip() for line in f ])

                #don't worry about clusters that don't have any aspect seed words
                if len(temp & setbag) > 0:
                    clusterscore += wv.n_similarity(temp & setbag, setbag)
                    numaspects += 1

        #only care about clusters with aspects
        if numaspects > 0:
            numclusters += 1
            kscore += clusterscore / numaspects

    #average score
    kscore /= numclusters

    with open('../../Cluster_Metrics/scores.txt', 'a') as f:
        f.write('k = {} {}\n'.format(numk, kscore))
"""

/// end gross scoring ///

In [None]:
"""
#run k means
kmeans = KMeans(n_clusters = numk, random_state = 0, n_jobs = 4).fit(X[:, 1:])

#save kmeans
with open('../../KMeans/kmeans_{}.pkl'.format(numk), 'wb') as f:
    pickle.dump(kmeans, f, pickle.HIGHEST_PROTOCOL)
"""

In [None]:
"""
#load kmeans
with open('../../KMeans/kmeans_{}.pkl'.format(numk), 'rb') as f:
    kmeans = pickle.load(f)

#attach words to labels
clustered_words = {}
for i, label in enumerate(kmeans.labels_):
    clustered_words[ index2vocab[int(X[i, 0])] ] = label

#group words by their labels
fullbags = [ [] for i in range(numk) ]
for k, v in clustered_words.items():
    fullbags[int(v)].append( (k, wv.vocab[k].count) )

#Sort each cluster
for i, bag in enumerate(fullbags):
    fullbags[i] = [ item[0] for item in sorted(bag, key = operator.itemgetter(1), reverse = True) ]

with open('../../KMeans/fullbags_{}.pkl'.format(numk), 'wb') as f:
    pickle.dump(fullbags, f, pickle.HIGHEST_PROTOCOL)
"""

In [3]:
#number of clusters; selects which fullbags_<numk>.pkl file is loaded below
numk = 900

#load the wordbags and fullbags from file
#(the KMeans/ variant is commented out; the Validation/ variant is the one actually read)
#NOTE: pickle.load can execute arbitrary code; only load files produced by this project
#with open('../../KMeans/fullbags_{}.pkl'.format(numk), 'rb') as f:
with open('../../Validation/fullbags_{}.pkl'.format(numk), 'rb') as f:
    fullbags = pickle.load(f)

#wordbags is top 20 words from each cluster
#(i.e. the first 20 entries of each list; presumably the lists are stored sorted by
# descending word count -- verify against the cell that writes this pickle)
wordbags = [ bag[:20] for bag in fullbags ]

In [None]:
"""
#for scoring
#using gensim's similarity method, find the similarity between the seed words
#in a cluster and all of the words in the cluster
confidence = [ { aspect: 0.0 for aspect in aspects } for i in range(numk) ]

for aspect in aspects:
    with open('../../Misc/{}.txt'.format(aspect), 'r') as f:

        temp = set()
        for line in f:
            temp.add(line.rstrip().lower())

        for i, bag in enumerate(fullbags):

            setbag = set(bag)

            if len(temp & setbag) > 0:
                confidence[i][aspect] = wv.n_similarity(temp & setbag, setbag)
"""

In [4]:
#Group all aspect words into their respective sets and find mean vectors.
#After this cell, aspectsets[aspect] holds one mean word vector for every cluster
#(bag) that contains at least one seed word of that aspect.
aspectsets = { aspect: [] for aspect in aspects }

#a cluster's word set does not depend on the aspect, so build each set once
#instead of once per (aspect, cluster) pair
bagsets = [ set(bag) for bag in fullbags ]

for aspect in aspects:

    #seed words of this aspect: one per line, lower-cased; the file is closed
    #before the cluster scan starts
    with open('../../Misc/{}.txt'.format(aspect), 'r') as f:
        seedwords = { line.rstrip().lower() for line in f }

    for setbag in bagsets:

        #only clusters sharing at least one word with the seed list contribute
        if not seedwords.isdisjoint(setbag):

            #mean of the vectors of every word in the cluster
            aspectsets[aspect].append(np.mean([ wv[word] for word in setbag ], axis = 0))

In [None]:
"""
#Master dictionary that contains every review broken down by sentence,
#the aspect(s) present in the sentence, and the sentiment score of the sentence
docs = []
analyzer = SentimentIntensityAnalyzer()
stops = set(stopwords.words('english'))
regex = re.compile(r'[^a-zA-Z0-9\s.!?]|[\_\^\`\[\]\\]', re.IGNORECASE)
splitregex = re.compile(r'[.|?|!]', re.IGNORECASE)

#clean the data and save to disk
with codecs.open('../../Misc/reviews.txt', 'r', encoding = 'utf-8', errors = 'ignore') as f:
    
    filenum = 1
    
    for i, line in enumerate(f):

        #remove non-alphanumeric symbols
        line = regex.sub(' ', line.lower().rstrip())
        
        #splitup by word
        words = line.split(' ')
        
        #remove first two words, but save in own variable
        review_number, line = '_'.join(words[:2]), ' '.join(words[2:])

        #split into tokens and ignore stopwords
        sentences = [ sentence.strip() for sentence in splitregex.split(line) ]
        sentences = [ ' '.join([ word.strip() for word in sentence.split(' ') if word not in stops and wv.__contains__(word.strip()) ]) for sentence in sentences ]
        
        #remove empty sentences
        sentences = [ sentence for sentence in sentences if sentence != '' ]

        #write cleaned data to file
        if len(sentences) > 0:
            
            #add doc to list
            docs.append({'database_id': review_number, 'array_id': i, 'score': 0.0, 'data': [ { 'doc_id': i, 'sentence': sentence, 'sentiment': analyzer.polarity_scores(sentence), 'aspect': [] } for sentence in sentences ] })
            
        #save in 5 pieces
        if i % (1373102 // 5) == 0 and i != 0:
            
            #save to disk
            with open('../../Aspect_Dictionary/aspect_dictionary_{}.pkl'.format(filenum), 'wb') as f:
                pickle.dump(docs, f, pickle.HIGHEST_PROTOCOL)
            
            #updates
            filenum += 1
            del docs
            docs = []
            
if len(docs) > 0:
    
    #save to disk
    with open('../../Aspect_Dictionary/aspect_dictionary_{}.pkl'.format(filenum + 1), 'wb') as f:
        pickle.dump(docs, f, pickle.HIGHEST_PROTOCOL)
"""

In [5]:
def similarity(a, b):
    """Return the cosine similarity of the vectors ``a`` and ``b``.

    The value is ``a.T.dot(b)`` divided by the product of the Euclidean norms
    of ``a`` and ``b``. 1-D arrays give a scalar; 2-D inputs need to be
    oriented vertically (column vectors), in which case the result is a
    1x1 array.
    """
    dot_ab = a.T.dot(b)

    #Euclidean norm of each vector, taken as the square root of its self dot product
    norm_a = np.sqrt(a.T.dot(a))
    norm_b = np.sqrt(b.T.dot(b))

    return dot_ab / (norm_a * norm_b)

In [None]:
#Annotate every sentence of every review with its per-aspect similarity scores,
#working through the pickled document chunks one at a time.
#NOTE(review): the range starts at 2, so aspect_dictionary_1.pkl is not processed
#by this loop -- confirm that this is intended.
for i in range(2, 7):

    #we can only work with parts of the overall document set at a time
    #NOTE: pickle.load can execute arbitrary code; only load files produced by this project
    with open('../../Aspect_Dictionary/aspect_dictionary_{}.pkl'.format(i), 'rb') as f:

        #load the documents being considered into memory
        docs = pickle.load(f)

    #go through every review
    for doc in docs:

        #go through every sentence in the review
        for sentdata in doc['data']:

            #find mean of the sentence (average of the vectors of its space-separated words)
            mean = np.mean([ wv[word] for word in sentdata['sentence'].split(' ') ], axis = 0)

            #find the aspect scores: for each aspect, the best similarity between the
            #sentence mean and any of that aspect's cluster means
            #(max raises ValueError if an aspect has no cluster means)
            sentdata['aspect'] = [ (aspect, max(similarity(mean, bagmean) for bagmean in aspectsets[aspect])) for aspect in aspects ]

    #save to disk, back into the chunk that was loaded; writing every chunk to
    #aspect_dictionary_1.pkl would overwrite the previous chunk's result each time
    with open('../../Aspect_Dictionary/aspect_dictionary_{}.pkl'.format(i), 'wb') as f:
        pickle.dump(docs, f, pickle.HIGHEST_PROTOCOL)

    #clear memory up
    del docs

In [None]:
#Score every review in every pickled chunk and write the chunk back in place.
for i in range(1, 7):

    #only one chunk of the whole document set is held in memory at a time
    with open('../../Aspect_Dictionary/aspect_dictionary_{}.pkl'.format(i), 'rb') as f:
        docs = pickle.load(f)

    for doc in docs:

        #number of sentences in which each aspect is considered present
        aspcnt = dict.fromkeys(aspects, 0)

        for data in doc['data']:
            for asp in data['aspect']:

                #asp is an (aspect name, similarity) pair; the aspect counts as
                #present when the similarity is >= 0.5
                if not asp[1] < 0.5:
                    aspcnt[ asp[0] ] += 1

        #score = total absolute compound sentiment over the sentences
        #      + total aspect count over the whole review
        sentiment_total = sum(abs(snt['sentiment']['compound']) for snt in doc['data'])
        aspect_total = sum(aspcnt[aspect] for aspect in aspects)
        doc['score'] = sentiment_total + aspect_total

    #write the scored chunk back to the file it came from
    with open('../../Aspect_Dictionary/aspect_dictionary_{}.pkl'.format(i), 'wb') as f:
        pickle.dump(docs, f, pickle.HIGHEST_PROTOCOL)

    #release the chunk before loading the next one
    del docs

In [None]:
#Write the raw text of the five highest-scoring reviews to a results file.
#NOTE(review): `docs` is deleted at the end of the scoring loop in the previous cell and
#`numlines` is not defined anywhere in the visible notebook, so on a fresh
#Restart & Run All this cell raises NameError -- confirm where both are meant to come from.

#(array_id, score) pairs ordered from highest to lowest score
ids = sorted([ (doc['array_id'], doc['score']) for doc in docs ], key = operator.itemgetter(1), reverse  = True)

#read the first numlines raw reviews so that array_id can be used as a line index below
with codecs.open('../../Misc/reviews.txt', 'r', encoding = 'utf-8', errors = 'ignore') as f:
    lines = []
    for i in range(numlines):
        lines.append(f.readline().rstrip())

#write the top 5 reviews, each followed by a blank line
#NOTE(review): presumably every array_id in the top 5 is < numlines; otherwise the lookup raises IndexError -- verify
with open('../../Results/results_{}.txt'.format(numlines), 'w') as f:
    for i in range(5):
        f.write('{}\n\n'.format(lines[ ids[i][0] ]))

// Begin gross k scoring //

In [None]:
"""
for numk in range(100, 1100, 100):
    
    #kmeans model
    kmeans = KMeans(n_clusters = numk, random_state = 0, n_jobs = 4).fit(X[:, 1:])

    #attach words to labels
    clustered_words = {}
    for i, label in enumerate(kmeans.labels_):
        clustered_words[ index2vocab[int(X[i, 0])] ] = label

    #group words by their labels
    fullbags = [ [] for i in range(numk) ]
    for k, v in clustered_words.items():
        fullbags[int(v)].append( (k, wv.vocab[k].count) )

    #Sort each cluster
    for i, bag in enumerate(fullbags):
        fullbags[i] = [ item[0] for item in sorted(bag, key = operator.itemgetter(1), reverse = True) ]

    #score for this k
    kscore = 0.0
    numclusters = 0

    #score the clustering
    for i, bag in enumerate(fullbags):
        setbag = set(bag)
        clusterscore = 0.0
        numaspects = 0

        #go through every aspect
        for aspect in aspects:
            with open('../../Misc/{}.txt'.format(aspect), 'r') as f:

                #seed words
                temp = set([ line.rstrip() for line in f ])

                #don't worry about clusters that don't have any aspect seed words
                if len(temp & setbag) > 0:
                    clusterscore += wv.n_similarity(temp & setbag, setbag)
                    numaspects += 1

        #only care about clusters with aspects
        if numaspects > 0:
            numclusters += 1
            kscore += clusterscore / numaspects

    #average score
    kscore /= numclusters

    with open('../../Cluster_Metrics/scores.txt', 'a') as f:
        f.write('k = {} {}\n'.format(numk, kscore))
"""

// end gross k scoring //

// begin seed list creation helpers //

In [None]:
"""
with open('../../Thesaurus/thesaurus.pkl', 'rb') as f:
    thesaurus = pickle.load(f)
"""

In [None]:
"""
synSet = set()
for k in thesaurus:
    synSet.add(k.lower())
    
vocabSet = set()
for k in wv.vocab:
    vocabSet.add(k.lower())
    
intSet = synSet & vocabSet
"""

In [None]:
"""
thesausets = [ set(ws) for ws in fullbags ]
for i in range(len(thesausets)):
    thesausets[i] = thesausets[i] & intSet
"""

In [None]:
"""
aspect = 'service'
with open('../../Misc/{}.txt'.format(aspect), 'r') as f:
    words  = set([ line.rstrip() for line in f.readlines() ])

with open('../../Misc/{}.txt'.format(aspect), 'w') as f:
    
    for bag in fullbags:
        if 'customer' in bag:
            words |= set(bag)
            
    for word in sorted(words):
        f.write('{}\n'.format(word))
"""

In [None]:
"""
with open('../../Misc/{}.txt'.format(aspect), 'r') as f:
    words  = set([ line.rstrip() for line in f.readlines() ])

with open('../../Misc/{}.txt'.format(aspect), 'w') as f:
    for word in sorted([ item[0] for item in sorted([ (word, wv.vocab[word].count) for word in words], key = operator.itemgetter(1), reverse = True)[:100] ]):
        f.write('{}\n'.format(word))
"""

In [None]:
"""
#write seeds as adjacency list
with open('../../Misc/seeds.txt', 'w') as f:
    for aspect in aspects:
        with open('../../Misc/{}.txt'.format(aspect), 'r') as a:
            words = [ line.rstrip() for line in a ]
            for i in range(len(words)):
                f.write('{} '.format(words[i]))
                for j in range(len(words)):
                    if i != j:
                        f.write('{} '.format(words[j]))
                f.write('\n')
"""

// end seed list creation helpers //

// default vector scoring //

In [None]:
"""
#load word vectors from initial word2vec training
wv = KeyedVectors.load('../../Word_Vectors/wv')

vocab2index = {}
index2vocab = {}

with open('../../Validation/wvmaster.csv', 'w') as f:
    
    #save each word and its vector to disk
    for word, data in wv.vocab.items():
        vocab2index[word] = wv.vocab[word].index
        index2vocab[ vocab2index[word] ] = word
        
        f.write('{} '.format(wv.vocab[word].index))
        f.write('{}\n'.format(' '.join([ str(element) for element in wv[word] ])))

X = np.loadtxt('../../Validation/wvmaster.csv', delimiter = ' ')
"""

In [None]:
"""
for numk in range(100, 1100, 100):

    kmeans = KMeans(n_clusters = numk, random_state = 0, n_jobs = 4).fit(X[:, 1:])

    #attach words to labels
    clustered_words = {}
    for i, label in enumerate(kmeans.labels_):
        clustered_words[ index2vocab[int(X[i, 0])] ] = label

    #group words by their labels
    fullbags = [ [] for i in range(numk) ]
    for k, v in clustered_words.items():
        fullbags[int(v)].append( (k, wv.vocab[k].count) )

    #Sort each cluster
    for i, bag in enumerate(fullbags):
        fullbags[i] = [ item[0] for item in sorted(bag, key = operator.itemgetter(1), reverse = True) ]
    
    #save fullbags
    with open('../../Validation/fullbags_{}.pkl'.format(numk), 'wb') as f:
        pickle.dump(fullbags, f, pickle.HIGHEST_PROTOCOL)

    #score for this k
    kscore = 0.0
    numclusters = 0

    #score the clustering
    for i, bag in enumerate(fullbags):
        setbag = set(bag)
        clusterscore = 0.0
        numaspects = 0

        #go through every aspect
        for aspect in aspects:
            with open('../../Misc/{}.txt'.format(aspect), 'r') as f:

                #seed words
                temp = set([ line.rstrip() for line in f ])

                #don't worry about clusters that don't have any aspect seed words
                if len(temp & setbag) > 0:
                    clusterscore += wv.n_similarity(temp & setbag, setbag)
                    numaspects += 1

        #only care about clusters with aspects
        if numaspects > 0:
            numclusters += 1
            kscore += clusterscore / numaspects

    #average score
    kscore /= numclusters

    with open('../../Validation/scores.txt', 'a') as f:
        f.write('k = {} {}\n'.format(numk, kscore))
"""

// end default scoring //