In [24]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import json
import io
import time
import re
import sys
from pyspark import SparkConf, SparkContext
import findspark
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [25]:
def train_test_split(n = 0.7):
    time1 = time.time()
    # import files
    review_json_file = '/Users/wenxianfei/Desktop/lda_cf/review.json'

    review = []
    for line in open(review_json_file, 'r'):
        review.append(json.loads(line))

    # convert to dataframe
    review_df = pd.DataFrame.from_records(review)
    # extract the userful column
    review_df = review_df.loc[:,['business_id','user_id','stars','text']]
    # split the test and training dataset
    length = int(len(review_df) * n)

    review_df_training = review_df.iloc[:length,]
    review_df_test = review_df.iloc[length:,]
    review_df_training.to_csv('training.csv')
    review_df_test.to_csv('test.csv')
    time2 = time.time()
    print('SUCCESS!!!  train_test_split')
    print('The training set has ', length, 'rows data')
    print('The testing set has ', len(review_df) - length, 'rows data')
    print('Time: ', time2 - time1)

In [32]:
def textProcessing(text):
    # lower words  
    text = text.lower()
    # remove punctuation
    for c in string.punctuation:
        text = text.replace(c, ' ')
    # tokenize
    wordLst = nltk.word_tokenize(text)
    # stop word
    filtered = [w for w in wordLst if w not in stopwords.words('english')]
    # keep noun  
    refiltered =nltk.pos_tag(filtered)
    filtered = [w for w, pos in refiltered if pos.startswith('NN')]
    # xtract the stem
    ps = PorterStemmer()
    filtered = [ps.stem(w) for w in filtered]

    return " ".join(filtered) 

def rating_proportion(text,rate):
    return text * int(rate)

In [27]:
def lda(review,n_topic = 10,n_top_words=20):
# vectorization
# generate the word-docu matrix
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,stop_words='english')
    tf = tf_vectorizer.fit_transform(review)
# train the lda model
    lda = LatentDirichletAllocation(n_topics=n_topic, 
                                max_iter=50,
                                learning_method='batch')
    lda.fit(tf)
# print the performance
    print('perplexity is: ',lda.perplexity(tf))

# generate the top word list for every topic
    tf_feature_names = tf_vectorizer.get_feature_names()
    feature_dict = {k: v for v, k in enumerate(tf_feature_names)}

    for topic_idx, topic in enumerate(lda.components_):       
        print ("Topic #%d:" % topic_idx)
        print (" ".join([tf_feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))    

# return the topic*word distribution matrix
    return lda.components_,feature_dict

In [28]:
train_test_split(n = 0.0001)

SUCCESS!!!  train_test_split
The training set has  668 rows data
The testing set has  6685232 rows data
Time:  262.3570120334625


In [29]:
training = pd.read_csv('training.csv')

In [30]:
training.head()

Unnamed: 0.1,Unnamed: 0,business_id,user_id,stars,text
0,0,ujmEBvifdJM6h6RLv4wQIg,hG7b0MtEbXx5QzbzE6C_VA,1.0,Total bill for this horrible service? Over $8G...
1,1,NZnhc2sEQy3RmzKTZnqtwQ,yXQM5uF2jS6es16SJzNHfg,5.0,I *adore* Travis at the Hard Rock's new Kelly ...
2,2,WTqjgwHlXbSFevF32_DJVw,n6-Gk65cPZL6Uz8qRm3NYw,5.0,I have to say that this office really has it t...
3,3,ikCg8xy5JIg_NGPx-MSIDA,dacAIZ6fTM6mqwW5uxkskg,5.0,Went in for a lunch. Steak sandwich was delici...
4,4,b1b1eb3uo-w561D0ZfCEiQ,ssoyf2_x0EQMed6fgHeMyQ,1.0,Today was my second out of three sessions I ha...


In [34]:
training['text'] = training['text'].apply(textProcessing)

In [36]:
training['text'] = training['text'] * training['stars'].apply(int)

In [54]:
review_by_user = training.groupby('user_id').text.sum()
review_by_business = training.groupby('business_id').text.sum()

In [56]:
review_by_user.to_csv('review_by_user.csv')
review_by_business.to_csv('review_by_business')

In [57]:
review = training['text'] 
topic_word_matrix,feature_dict = lda(review,n_topic=10,n_top_words=20)



perplexity is:  453.6783950470701
Topic #0:
food place time burger servic server everyth order breakfast restaur star eat meal portion bit beer price lunch friend menu
Topic #1:
pizza place tea staff time food sandwich order option famili sauc menu fri game day peopl wing flavor coffe drink
Topic #2:
place dish taco menu roll chef servic time tri way flavor restaur friend burrito portion item spot someth patio bit
Topic #3:
food room time staff place hotel peopl experi night kid day servic vega sauc area taco bar soup dog music
Topic #4:
time place order store rice kabob help way salad food hair soup thing tabl price bowl day restaur deliveri star
Topic #5:
chicken food servic time peopl hous cream ice spot home rang place meat tortilla veggi noth shrimp dinner summer spici
Topic #6:
custom locat servic time day staff work place love price car machin thing employe job thank area husband issu compani
Topic #7:
place price food time staff servic drink breakfast menu locat coffe select ma

In [101]:
def topic_probability(document,feature_dict,topic_word_matrix):
    word_list = document.split()
    topic_num = len(topic_word_matrix)
    topic_probability = {k:0 for k in range(topic_num)}
    for topic_idx in range(topic_num):
        for word in word_list:
            if word in feature_dict.keys():
                topic_probability[topic_idx] += topic_word_matrix[topic_idx,feature_dict[word]]
    return topic_probability

In [77]:
review_by_user = pd.read_csv('review_by_user.csv',header=None)
review_by_business  = pd.read_csv('review_by_business',header=None)

In [113]:
review_by_user[2] = review_by_user[1].apply(lambda x: topic_probability(x,feature_dict,topic_word_matrix))
review_by_business[2] = review_by_business[1].apply(lambda x: topic_probability(x,feature_dict,topic_word_matrix))

In [114]:
import math
def normalize(x):
    normalized_list = []
    new_dict = {}
    for key,value in x.items():
        normalized_list.append(value**2)
    for key,value in x.items():
        new_dict[key] = value/math.sqrt(sum(normalized_list))
    return new_dict

In [121]:
review_by_user[2] = review_by_user[2].apply(normalize)
review_by_business[2] = review_by_business[2].apply(normalize)

In [122]:
review_by_business

Unnamed: 0,0,1,2
0,-1xuC540Nycht_iWFeJ-dw,trè bon poulet avec la sauc piquant c succul b...,"{0: 0.4962998299801332, 1: 0.2913037937311138,..."
1,-4TMQnQJW1yd6NqGRDvAeA,fan mission place owner surpris other ambianc ...,"{0: 0.4480124219817073, 1: 0.2476050282509689,..."
2,-DHKF52ALa4B2ls1V6OlxA,vision sourc hill opinion appoint strain staff...,"{0: 0.42497081950850296, 1: 0.2639099177583943..."
3,-RwnEdrn3ZdW1yug_CBmLg,shaunz shirt staff qualiti cloth select shaunz...,"{0: 0.15812163309098687, 1: 0.3744876575516591..."
4,-U7tvCtaraTQ9b0zBhpBMA,bacon guy bacon borderlin crispi salti salti c...,"{0: 0.6185535779777801, 1: 0.13145316838464094..."
5,-iFvYhgysvjkxckCr42NRw,servic food place expect place food time day t...,"{0: 0.5856739361404252, 1: 0.21109967457713877..."
6,-oOKqZbYDt08zaWWyLZNIw,food price amount food payfood price amount fo...,"{0: 0.5203877276199708, 1: 0.15096698040523898..."
7,01o6K5ID_vW8tXZ7QAzPJg,prc tile roof replac commun area prc caught nu...,"{0: 0.32579177726278474, 1: 0.0631368301518875..."
8,0BW6h-igJinzbqc-prYUaQ,hy steakhous steak hous lot space intim experi...,"{0: 0.5592097842709489, 1: 0.1571190966041284,..."
9,0JGMKaKJGVuDus5WcJzvjw,experi start reserv saturday drink drink order...,"{0: 0.7215885720331826, 1: 0.2976709833365778,..."
