In [0]:
# Mount Google Drive at /content/drive; the later cells read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def calculate_coherence( w2v_model, term_rankings ):
    """Average pairwise term similarity per topic, averaged again over all topics.

    Parameters
    ----------
    w2v_model : object
        Anything exposing ``similarity(term_a, term_b)`` returning a number.
    term_rankings : sequence of sequences
        One sequence of terms per topic.

    Returns
    -------
    float
        Mean over topics of the mean similarity of every unordered term pair
        in that topic. Returns 0.0 when ``term_rankings`` is empty; a topic
        with fewer than two terms has no pairs and contributes 0.0.
    """
    # Imported here because the visible notebook cells never bring
    # `combinations` into scope, which made the original raise NameError.
    from itertools import combinations

    topic_count = len(term_rankings)
    if topic_count == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    overall_coherence = 0.0
    for topic_index in range(topic_count):
        # score each unordered pair of terms in this topic
        pair_scores = [
            w2v_model.similarity(first, second)
            for first, second in combinations( term_rankings[topic_index], 2 )
        ]
        if pair_scores:
            # mean over all pairs in this topic
            overall_coherence += sum(pair_scores) / len(pair_scores)
    # mean score across all topics
    return overall_coherence / topic_count

import numpy as np
def get_descriptor( all_terms, H, topic_index, top ):
    """Return the `top` highest-weighted terms of one topic, best first.

    Row `topic_index` of `H` is argsorted and reversed to rank its columns
    from largest to smallest value; the first `top` column indices are then
    looked up in `all_terms`.
    """
    # descending order of the row's values, expressed as column indices
    ranked_columns = np.argsort( H[topic_index,:] )[::-1]
    # translate the leading column indices into their terms
    return [ all_terms[column] for column in ranked_columns[0:top] ]

def rank_terms( A, terms ):
    """Rank terms by their total weight over all rows of `A`.

    Parameters
    ----------
    A : 2-D array-like with ``.sum(axis=0)``
        Document-term matrix (rows are documents, columns are terms). Works
        for scipy sparse matrices, ``np.matrix`` and plain ndarrays.
    terms : sequence
        Term for each column of `A`, in column order.

    Returns
    -------
    list of (term, weight) tuples
        Sorted by weight, largest first. If a term occurs more than once in
        `terms`, the weight of its last column is the one kept.
    """
    # Column sums come back as a 1 x n matrix for sparse matrices / np.matrix
    # and as a 1-D array for ndarrays; flatten so both index the same way.
    sums = np.asarray( A.sum(axis=0) ).ravel()
    # map weights to the terms
    weights = {}
    for col, term in enumerate(terms):
        weights[term] = sums[col]
    # A lambda key keeps this function independent of an `operator` import
    # that only happens later, inside another cell.
    return sorted(weights.items(), key=lambda item: item[1], reverse=True)


In [0]:
import pandas as pd
import string

# Custom stop words: one entry per line of the file, surrounding whitespace removed.
custom_stop_words = []
with open( '/content/drive/My Drive/STAT628_Module3/Data Preprocessing/feature extraction/stopwords.txt', "r" ) as stopword_file:
    custom_stop_words.extend( entry.strip() for entry in stopword_file )
print("Stopword list has %d entries" % len(custom_stop_words) )

# Imports used by the per-city pipeline below, hoisted out of the loop body.
import collections
import operator  # rank_terms() resolves `operator` at module scope when it is called

import numpy as np
from sklearn import decomposition
from sklearn.feature_extraction.text import CountVectorizer
try:
    import joblib
except ImportError:
    # Older scikit-learn releases only shipped joblib as sklearn.externals.joblib.
    from sklearn.externals import joblib

# Number of NMF topics used for each region code (hard-coded; how these were
# chosen is not shown in this notebook section).
best_k = {'AB':5,'AZ':5,'IL':4,'NC':4,'NV':5,'OH':4,'ON':5,'PA':4,'QC':3,'SC':5,'WI':4}

city_list = ['ON','AZ','IL','NC','NV','OH','AB','PA','QC','SC','WI']
for city in city_list:
    print('\n======================= Analysis for',city,'==============================')
    path = '/content/drive/My Drive/STAT628_Module3/Score Model/bars_review_' + city +'.csv'

    df = pd.read_csv(path)
    df = df.loc[df['text']!='']

    # Negative reviews only: two stars or fewer.
    df_neg = df.loc[df['stars']<=2.0]

    # ---- document-term matrix over the negative reviews ----
    vectorizer = CountVectorizer(stop_words = custom_stop_words, min_df = 20)
    A_neg = vectorizer.fit_transform(df_neg['text'].apply(lambda x: np.str_(x)))
    # get_feature_names() no longer exists in current scikit-learn; prefer its
    # replacement when available and fall back for older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms_neg = vectorizer.get_feature_names_out().tolist()
    else:
        terms_neg = vectorizer.get_feature_names()
    print("negative vocabulary has %d distinct terms" % len(terms_neg))
    print( "Created %d X %d negative document-term matrix" % (A_neg.shape[0], A_neg.shape[1]) )

    raw_documents_neg = list(df_neg['text'])
    # The file name is the same for every city, so each iteration overwrites the previous dump.
    joblib.dump((A_neg,terms_neg,raw_documents_neg), "articles-raw_neg.pkl")

    print('\nrank for negative words in',city,':')
    ranking = rank_terms( A_neg, terms_neg )
    for i, pair in enumerate( ranking[0:10] ):
        print( "%02d. %s (%.2f)" % ( i+1, pair[0], pair[1] ) )

    # ---- fit NMF once per city on all negative reviews ----
    k = best_k[city]
    model = decomposition.NMF( init="nndsvd", n_components=k )
    # apply the model and extract the two factor matrices
    W_neg = model.fit_transform( A_neg )
    H_neg = model.components_

    print('\ndescriptors for negative texts in',city,':')
    descriptors = []
    for topic_index in range(k):
        descriptors.append( get_descriptor( terms_neg, H_neg, topic_index, 7 ) )
        str_descriptor = ", ".join( descriptors[topic_index] )
        print("Topic %02d: %s" % ( topic_index+1, str_descriptor ) )

    # ---- group review texts and weights by business ----
    ids = list(df_neg['business_id'])
    weights = list(df_neg['weight'])
    texts = list(df_neg['text'])
    busi_weight = collections.defaultdict(list)
    busi_text = collections.defaultdict(list)

    for i in range(len(ids)):
        # pd.isna catches every NaN/None, not only the np.nan singleton that
        # an identity test (`is np.nan`) would match.
        if pd.isna(texts[i]):
            continue
        busi = ids[i]
        busi_weight[busi].append(weights[i])
        busi_text[busi].append(texts[i])

    # ---- weighted topic scores per business ----
    scores = []
    ids = []

    for busi in busi_text:
        ids.append(busi)
        if len(busi_text[busi]) < k:
            # Threshold kept from the original code: businesses with fewer than
            # k reviews get an all-zero score row. NOTE(review): this was needed
            # when the model was re-fit per business; confirm whether it is
            # still wanted now that reviews are only projected.
            score = np.array([[0]*k])
            scores.append(score)
            continue
        A = vectorizer.transform(busi_text[busi])
        # Project onto the topics already learned from all of the city's
        # negative reviews. Re-fitting here (fit_transform) would learn new,
        # unrelated topics per business, so the topic_<i> columns would not
        # match the H_neg topics written to the word-weight file.
        W = model.transform( A )
        w = np.array([busi_weight[busi]])
        # weighted average of the per-review topic loadings
        score = np.dot(w,W)
        score = score/sum(busi_weight[busi])
        scores.append(score)
    # each entry has shape (1, k), so the stacked array is (n_businesses, 1, k)
    scores = np.array(scores)
    result = pd.DataFrame({'business_id':ids})
    word_weight = pd.DataFrame()

    # ---- one score column per topic, plus the topic's top-7 terms and weights ----
    top_terms = []
    top_weights = []
    for i in range(k):
        top_terms.append([])
        top_weights.append([])

        col_name = 'topic_'+str(i+1)
        result[col_name] = scores[:,0,i]

        topic_index = i
        top_indices = np.argsort( H_neg[topic_index,:] )[::-1]

        for term_index in top_indices[0:7]:
            top_terms[i].append( terms_neg[term_index] )
            top_weights[i].append( H_neg[topic_index,term_index] )
        print(top_terms[i])
        print(top_weights[i])

        col_name = 'topic_'+str(i+1)+'words'
        word_weight[col_name] = top_terms[i]
        col_name = 'topic_'+str(i+1)+'weight'
        word_weight[col_name] = top_weights[i]

    # ---- persist per-business scores and per-topic word weights ----
    path = '/content/drive/My Drive/STAT628_Module3/score/'+city+'.csv'
    result.to_csv(path,index=False,sep=',')

    path = '/content/drive/My Drive/STAT628_Module3/word weight/'+city+'_weight.csv'
    word_weight.to_csv(path,index=False,sep=',')

Stopword list has 403 entries

negative vocabulary has 3880 distinct terms
Created 20549 X 3880 negative document-term matrix

rank for negative words in ON :
01. service (13072.00)
02. time (11074.00)
03. drink (9501.00)
04. table (9343.00)
05. ask (8690.00)
06. wait (7510.00)
07. bar (6267.00)
08. server (6119.00)
09. tell (5750.00)
10. look (5577.00)

descriptors for negative texts in ON :
Topic 01: table, wait, seat, minutes, sit, host, arrive
Topic 02: fry, taste, chicken, dish, menu, wing, sauce
Topic 03: drink, bar, beer, night, look, friends, bartender
Topic 04: time, service, wait, experience, staff, slow, long
Topic 05: ask, server, tell, bill, manager, leave, waitress
['table', 'wait', 'seat', 'minutes', 'sit', 'host', 'arrive']
[12.029219988805197, 6.56423725191533, 3.4299963071125026, 3.103288971586179, 2.0644827897548956, 1.8424517120324186, 1.40855295820922]
['fry', 'taste', 'chicken', 'dish', 'menu', 'wing', 'sauce']
[3.886273234508964, 3.7615946757712337, 3.68163714180

  u = x_n / x_n_nrm
  W[W < eps] = 0


['table', 'wait', 'minutes', 'ask', 'waitress', 'tell', 'seat']
[7.318160212777609, 6.457447135749204, 4.810949325220429, 3.8054618949453496, 2.82873058848509, 2.7932379102160283, 2.506140967935145]
['fry', 'sandwich', 'taste', 'burger', 'cheese', 'menu', 'chicken']
[3.849334795635559, 3.550248626824093, 2.7956578427793786, 2.5608384101037633, 2.5045824082052612, 2.0796452834139636, 1.9337875757559435]
['bar', 'drink', 'bartender', 'beer', 'service', 'sit', 'night']
[7.581956893037111, 6.441183898360244, 2.048673854720156, 1.6886663407989444, 1.3173827302846384, 1.3015766517479403, 1.057199939604537]
['time', 'service', 'experience', 'manager', 'server', 'tell', 'know']
[8.758037909324889, 3.2959364118089276, 1.1040897096292457, 0.9857368495385314, 0.9106587688215748, 0.8308251892007342, 0.7823895271651884]

negative vocabulary has 1105 distinct terms
Created 3519 X 1105 negative document-term matrix

rank for negative words in QC :
01. service (1841.00)
02. drink (1696.00)
03. time (1

In [0]:
  # Read the AB review file, then keep only the rows rated 1.0 stars or lower.
  path = '/content/drive/My Drive/STAT628_Module3/Bars Review by States/bars_review_AB.csv'
  df = pd.read_csv(path)

  has_text = df['text'] != ''
  df = df.loc[has_text]

  is_lowest_rating = df['stars'] <= 1.0
  df_neg = df.loc[is_lowest_rating]

In [0]:
# Row count of df_neg, displayed as the cell's output.
len(df_neg)

1438

In [0]:
# Review texts of df_neg as a plain Python list, scanned by the loop in the next cell.
l = list(df_neg['text'])

In [0]:
# Print the first review whose text contains 'wings', then stop.
for review in l:
  # A missing text is read by pandas as NaN (a float) and passes the `!= ''`
  # filter; `'wings' in <float>` would raise TypeError, so skip non-strings.
  if isinstance(review, str) and 'wings' in review:
    print(review)
    break

long long time customer location obvious issuesowners r nice despite request do wingsnot forget wifi place include nyc la wifi owners paranoid
