In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd
import string

custom_stop_words = []
with open( '/content/drive/My Drive/STAT628_Module3/bars/feature extraction/stopwords.txt', "r" ) as fin:
    for line in fin.readlines():
        custom_stop_words.append( line.strip() )
# note that we need to make it hashable
print("Stopword list has %d entries" % len(custom_stop_words) )

city_list = ['AB','AZ','IL','NC','NV','OH','ON','PA','QC','SC','VA','WA','WI']
for city in city_list:
  print('\n======================= Analysis for',city,'==============================')
  path = '/content/drive/My Drive/STAT628_Module3/states/bars_review_' + city +'.csv'

  df = pd.read_csv('/content/drive/My Drive/STAT628_Module3/states/bars_review_NV.csv')
  df = df.loc[df['text']!='']

  df_pos = df.loc[df['stars']>=5.0]
  df_neg = df.loc[df['stars']<=1.0]

  import numpy as np
  from sklearn.feature_extraction.text import CountVectorizer
  vectorizer = CountVectorizer(stop_words = custom_stop_words, min_df = 20)
  # use a custom stopwords list, set the minimum term-document frequency to 20
  A_pos = vectorizer.fit_transform(df_pos['text'].apply(lambda x: np.str_(x)))
  terms_pos = vectorizer.get_feature_names()
  print("\npositive vocabulary has %d distinct terms" % len(terms_pos))
  A_neg = vectorizer.fit_transform(df_neg['text'].apply(lambda x: np.str_(x)))
  terms_neg = vectorizer.get_feature_names()
  print("negative vocabulary has %d distinct terms" % len(terms_neg))
  print( "\nCreated %d X %d positive document-term matrix" % (A_pos.shape[0], A_pos.shape[1]) )
  print( "Created %d X %d negative document-term matrix" % (A_neg.shape[0], A_neg.shape[1]) )

  raw_documents_pos = list(df_pos['text'])
  raw_documents_neg = list(df_pos['text'])
  from sklearn.externals import joblib
  joblib.dump((A_pos,terms_pos,raw_documents_pos), "articles-raw_pos.pkl") 
  joblib.dump((A_neg,terms_neg,raw_documents_neg), "articles-raw_neg.pkl") 

  import operator
  def rank_terms( A, terms ):
      # get the sums over each column
      sums = A.sum(axis=0)
      # map weights to the terms
      weights = {}
      for col, term in enumerate(terms):
          weights[term] = sums[0,col]
      # rank the terms by their weight over all documents
      return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)
  print('\nrank for positive words in',city,':')
  ranking = rank_terms( A_pos, terms_pos )
  for i, pair in enumerate( ranking[0:10] ):
      print( "%02d. %s (%.2f)" % ( i+1, pair[0], pair[1] ) )
  print('\nrank for negative words in',city,':')
  ranking = rank_terms( A_neg, terms_neg )
  for i, pair in enumerate( ranking[0:10] ):
      print( "%02d. %s (%.2f)" % ( i+1, pair[0], pair[1] ) )

  k = 10
  # create the model
  from sklearn import decomposition
  model = decomposition.NMF( init="nndsvd", n_components=k ) 
  # apply the model and extract the two factor matrices
  W_pos = model.fit_transform( A_pos )
  H_pos = model.components_
  model = decomposition.NMF( init="nndsvd", n_components=k ) 
  W_neg = model.fit_transform( A_neg )
  H_neg = model.components_
  import numpy as np
  def get_descriptor( terms, H, topic_index, top ):
      # reverse sort the values to sort the indices
      top_indices = np.argsort( H[topic_index,:] )[::-1]
      # now get the terms corresponding to the top-ranked indices
      top_terms = []
      for term_index in top_indices[0:top]:
          top_terms.append( terms[term_index] )
      return top_terms
  print('\ndescriptors for positive texts in',city,':')
  descriptors = []
  for topic_index in range(k):
      descriptors.append( get_descriptor( terms_pos, H_pos, topic_index, 10 ) )
      str_descriptor = ", ".join( descriptors[topic_index] )
      print("Topic %02d: %s" % ( topic_index+1, str_descriptor ) )
      
  print('\ndescriptors for negative texts in',city,':')
  descriptors = []
  for topic_index in range(k):
      descriptors.append( get_descriptor( terms_neg, H_neg, topic_index, 10 ) )
      str_descriptor = ", ".join( descriptors[topic_index] )
      print("Topic %02d: %s" % ( topic_index+1, str_descriptor ) )

Stopword list has 370 entries


positive vocabulary has 9014 distinct terms
negative vocabulary has 4985 distinct terms

Created 133322 X 9014 positive document-term matrix
Created 37080 X 4985 negative document-term matrix

rank for positive words in AB :
01. great (91594.00)
02. place (76626.00)
03. good (57950.00)
04. service (54753.00)
05. time (51468.00)
06. vegas (48381.00)
07. love (40543.00)
08. drink (40229.00)
09. amaze (39929.00)
10. best (38779.00)

rank for negative words in AB :
01. place (23027.00)
02. service (20916.00)
03. time (19709.00)
04. drink (17652.00)
05. ask (17086.00)
06. back (16573.00)
07. tell (15001.00)
08. wait (14123.00)
09. table (13374.00)
10. good (11848.00)

descriptors for positive texts in AB :
Topic 01: restaurant, menu, dish, delicious, sauce, taste, eat, meal, flavor, dinner
Topic 02: great, service, atmosphere, staff, awesome, friendly, beer, price, nice, selection
Topic 03: vegas, room, club, stay, hotel, pool, strip, night, best, people
Topi