In [18]:
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

In [58]:
# load data

los_angeles_df = pd.read_csv('all_reviews_df.csv')
other_cities_df = pd.read_csv('other_cities_reviews.csv')
other_cities_df.rename(columns={"review_text": "text from review"}, inplace = True)

df = pd.concat([los_angeles_df, other_cities_df], ignore_index=True)
df.info

<bound method DataFrame.info of                           id         rating  \
0     kbwKxRdvWTu_Hy5sDN-JWQ  5 star rating   
1     kbwKxRdvWTu_Hy5sDN-JWQ  2 star rating   
2     kbwKxRdvWTu_Hy5sDN-JWQ  5 star rating   
3     kbwKxRdvWTu_Hy5sDN-JWQ  3 star rating   
4     kbwKxRdvWTu_Hy5sDN-JWQ  5 star rating   
...                      ...            ...   
4325  XRE8PgaxgJm5qiIqD_7eBQ  4 star rating   
4326  XRE8PgaxgJm5qiIqD_7eBQ  5 star rating   
4327  XRE8PgaxgJm5qiIqD_7eBQ  4 star rating   
4328  XRE8PgaxgJm5qiIqD_7eBQ  5 star rating   
4329  XRE8PgaxgJm5qiIqD_7eBQ  5 star rating   

                                       text from review  
0     Came here yesterday with my wife. Food is amaz...  
1     This review is from when I tried the food last...  
2     One of my favorite spots for food in Westwood....  
3     I knew that since this place had self-order ki...  
4     Delicious and Fast Dim Sum! It was easy to ord...  
...                                                 ...

In [7]:
# load pretrained tokenizer & model

tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [19]:
# test tokenizer on first review

tester = df.loc[0, 'text from review']

tester_token = tokenizer.encode(tester, return_tensors = 'pt')
tester_result = model(tester_token)

int(torch.argmax(tester_result.logits))+1

4

In [22]:
# define a function to calculate the sentiment score

def calculate_sentiment(review):
    tokens = tokenizer.encode(review, return_tensors = 'pt')
    result = model(tokens)

    return int(torch.argmax(result.logits))+1

In [60]:
# truncate reviews greater than 52 characters (max this model can handle)

for index, row in df.iterrows():
    if len(df.loc[index, 'text from review']) > 512:
        df.loc[index, 'text from review'] = df.loc[index, 'text from review'][:512]

In [61]:
# calculate score for each review & append to df


df['sentiment score'] = df['text from review'].apply(calculate_sentiment)


In [84]:
print(df.head())

                       id  rating  \
0  kbwKxRdvWTu_Hy5sDN-JWQ       5   
1  kbwKxRdvWTu_Hy5sDN-JWQ       2   
2  kbwKxRdvWTu_Hy5sDN-JWQ       5   
3  kbwKxRdvWTu_Hy5sDN-JWQ       3   
4  kbwKxRdvWTu_Hy5sDN-JWQ       5   

                                    text from review  sentiment score  
0  Came here yesterday with my wife. Food is amaz...                4  
1  This review is from when I tried the food last...                3  
2  One of my favorite spots for food in Westwood....                5  
3  I knew that since this place had self-order ki...                3  
4  Delicious and Fast Dim Sum! It was easy to ord...                5  


In [87]:
# find average sentiment score across all restaurants

print(np.mean(df['sentiment score'], axis=0))

4.17459584295612


In [83]:
# find average rating (from yelp) across all restaurants

print(np.mean(df['rating'], axis=0))

4.403695150115474


In [85]:
# save new df

df.to_csv('df_with_sentiment.csv', index = False)