In [7]:
import pandas as pd
from decimal import Decimal
from re import sub
from nltk.corpus import stopwords
import string
from langdetect import detect
from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

from nltk.stem import PorterStemmer

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# Yelp data

In [8]:
# Load the Yelp business and review frames.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
dfb, dfr = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data/YelpBusiness.pkl.gz', 'data/YelpReview.pkl.gz')
)

In [9]:
# Preview the first two business rows to check the columns that were loaded.
dfb.head(2)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state,relevance
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON,1
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,1


In [10]:
def check_montreal(city):
    """Return 1 if `city` names Montreal, otherwise 0.

    The comparison ignores accents, letter case and surrounding whitespace,
    so 'Montreal', 'Montréal' (precomposed or with a combining accent),
    'montreal' and ' MONTREAL ' all match.

    Parameters
    ----------
    city : object
        Value from the business ``city`` column. Anything that is not a
        ``str`` (e.g. NaN / None) is treated as "not Montreal".

    Returns
    -------
    int
        1 for Montreal, 0 otherwise.
    """
    import unicodedata  # local import: keeps this cell runnable on its own

    if not isinstance(city, str):
        return 0

    # 'Montr√©al' is what UTF-8 'Montréal' looks like after being mis-decoded
    # as Mac Roman. The previous version of this check only knew that garbled
    # spelling, so keep accepting it in case the data carries the same damage.
    if city == 'Montr√©al':
        return 1

    # Decompose accented letters, drop the combining marks, then case-fold.
    decomposed = unicodedata.normalize('NFKD', city.strip())
    folded = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).casefold()
    return 1 if folded == 'montreal' else 0

In [11]:
# Flag every business with 1/0 depending on whether check_montreal accepts its city.
# progress_apply is the tqdm-instrumented Series.apply enabled by the .pandas() call in the import cell.
dfb['in_montreal'] = dfb['city'].progress_apply(check_montreal)

HBox(children=(IntProgress(value=0, max=78595), HTML(value='')))




In [12]:
# Keep only the businesses flagged as Montreal, remember their ids for the
# review filter, and report what share of all businesses they represent.
montreal_mask = dfb['in_montreal'] == 1
den, enum = len(dfb), int(montreal_mask.sum())
dfb = dfb.loc[montreal_mask]
b_ids = set(dfb['business_id'])
print('{:.1f}% of relevant Yelp businesses are in Montreal'.format(enum/den*100))

6.3% of relevant Yelp businesses are in Montreal


In [13]:
# Keep only the reviews that belong to a Montreal business and report their share.
den = len(dfr)
# .copy() makes dfr an independent frame rather than a slice of the full review
# table, so adding columns to it afterwards does not raise SettingWithCopyWarning
# (same convention as the other filtered frames in this notebook).
dfr = dfr[dfr.business_id.isin(b_ids)].copy()
enum = len(dfr)
print('{:.1f}% of relevant Yelp reviews are in Montreal'.format( enum/den*100 ) )

2.9% of relevant Yelp reviews are in Montreal


In [14]:
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomised internally; pinning the seed makes repeated runs
# assign the same language to short or ambiguous reviews.
DetectorFactory.seed = 0


def _detect_lang(text):
    """Return the language code langdetect assigns to `text`, or None.

    None is returned for non-string values and for text langdetect cannot
    classify (it raises LangDetectException on e.g. empty or digits-only
    input), so one bad review does not abort the whole apply.
    """
    if not isinstance(text, str):
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


dfr['lang'] = dfr['clean_text'].progress_apply(_detect_lang)

HBox(children=(IntProgress(value=0, max=138638), HTML(value='')))




## Translate French text

In [43]:
# Independent copy of the reviews whose detected language is French.
dfrf = dfr.loc[dfr['lang'] == 'fr'].copy()

In [44]:
dfdict = pd.read_csv('data/fr-en.csv')
# Map each entry of the first CSV column to the entry of the second column on
# the same row; when a key repeats, the last row wins.
D = dict(zip(dfdict.iloc[:, 0], dfdict.iloc[:, 1]))
# Set of known source words, used for membership tests in translate().
Wf = set(D)
def translate(text):
    """Translate `text` word by word using the D lookup table.

    The text is split on whitespace. A token found in Wf is replaced by its
    entry in D, a numeric token is kept unchanged, and every other token is
    dropped. The surviving tokens are joined with single spaces.
    """
    translated = (
        D[token] if token in Wf else token
        for token in text.split()
        if token in Wf or token.isnumeric()
    )
    return ' '.join(translated)

In [45]:
# Word-by-word dictionary translation of every French review into a new text_en column.
dfrf['text_en'] = dfrf['clean_text'].progress_apply(translate)

HBox(children=(IntProgress(value=0, max=17727), HTML(value='')))

In [48]:
# Overwrite clean_text with the translated text (column position unchanged),
# then discard the temporary text_en column.
dfrf = (
    dfrf
    .assign(clean_text=dfrf['text_en'])
    .drop(columns='text_en')
)

In [56]:
# Independent copy of the reviews detected as English, plus their share of
# all Montreal reviews.
english_mask = dfr['lang'] == 'en'
dfre = dfr.loc[english_mask].copy()
den, enum = len(dfr), len(dfre)
print('{:.1f}% of relevant Yelp reviews in Montreal are in english'.format(enum/den*100))

86.5% of relevant Yelp reviews in Montreal are in english


## Stemming

In [58]:
# Stack the English reviews on top of the dictionary-translated French reviews.
# Row labels are kept as they are (no ignore_index).
dfr_new = pd.concat([dfre,dfrf])

In [59]:
# Shared Porter stemmer instance used by stem_cleaned_text.
ps = PorterStemmer()
def stem_cleaned_text(text):
    """Stem every whitespace-separated token of `text` with the Porter stemmer.

    Returns the stemmed tokens joined by single spaces, or None when `text`
    is not a string.
    """
    if not isinstance(text, str):
        return None
    return ' '.join(ps.stem(token) for token in text.split())

In [60]:
# Stem each review's clean_text into a new stemmed_text column (None for non-string values).
dfr_new['stemmed_text'] = dfr_new['clean_text'].progress_apply(stem_cleaned_text)

HBox(children=(IntProgress(value=0, max=137639), HTML(value='')))

In [61]:
# Drop clean_text now that stemmed_text has been derived from it.
dfr_new = dfr_new.drop(columns='clean_text')

In [62]:
# Row count before and after discarding reviews without stemmed text.
print(len(dfr_new))
dfr_new = dfr_new[dfr_new['stemmed_text'].notna()]
print(len(dfr_new))

137639
137639


In [63]:
# Persist the business frame, which by this point holds only rows with in_montreal == 1.
dfb.to_pickle('data/MontrealBusiness.pkl.gz')

In [64]:
# Persist the combined English + translated-French reviews, including the lang and stemmed_text columns.
dfr_new.to_pickle('data/MontrealReview.pkl.gz')

# Airbnb data

In [49]:
# Load the gzip-compressed Airbnb listings table.
# NOTE(review): the saved output of this cell is the tail of a warning; it may be
# pandas' DtypeWarning about mixed-type columns -- confirm, and if so pass explicit
# dtype=... (or low_memory=False) here.
dfal = pd.read_csv('data/MontrealAirbnbListings.csv.gz', compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [50]:
# Turn the price strings into floats: remove every character that is not a
# digit or '.', (currency symbols, thousands separators), then parse.
# Done column-wise instead of with a row-wise DataFrame.apply, and tolerant of
# bad input: missing or unparsable prices become NaN instead of raising.
dfal['price_float'] = pd.to_numeric(
    dfal['price'].astype(str).str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',
)

In [51]:
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Individual punctuation characters to strip from the text.
punct = list(string.punctuation)
# Porter stemmer instance used by clean_text (this name is also assigned in the Yelp stemming cell).
ps = PorterStemmer()

def clean_text(text):
    """Normalise free text into a space-separated string of stemmed tokens.

    Steps, in order: delete every character listed in `punct`, lower-case,
    split on whitespace, drop tokens found in `stop_words`, and stem the
    remaining tokens with `ps`. Returns None when `text` is not a string.
    """
    if not isinstance(text, str):
        return None
    # Deleting all punctuation characters in one pass gives the same result
    # as removing them one after another.
    without_punct = text.translate(str.maketrans('', '', ''.join(punct)))
    stemmed_tokens = (
        ps.stem(token)
        for token in without_punct.lower().split()
        if token not in stop_words
    )
    return ' '.join(stemmed_tokens)
# Clean and stem each listing's neighborhood_overview; non-string entries become None.
dfal['stemmed_neighborhood_overview'] = dfal['neighborhood_overview'].progress_apply(clean_text)

HBox(children=(IntProgress(value=0, max=19787), HTML(value='')))




In [52]:
# Persist the listings together with the added price_float and stemmed_neighborhood_overview columns.
dfal.to_pickle('data/MontrealAirbnbListings.pkl.gz')

In [53]:
# Preview the first five review rows.
# NOTE(review): the saved output of this cell shows a stemmed_text column, but the
# visible code only ever adds stemmed_text to dfr_new, not to dfr -- the output
# looks stale or produced by out-of-order execution. Confirm whether
# dfr_new.head(5) was intended and re-run from a fresh kernel.
dfr.head(5)

Unnamed: 0,business_id,cool,date,funny,stars,useful,sentiment,lang,stemmed_text
59,nDaW2hhQV5KYiGH7HzAOcg,1.0,2016-12-04 02:34:48,0.0,3.0,1.0,0,en,stumbl upon ice cream store see high review ye...
107,luPvNx4XSxaM7pka8EifuA,2.0,2017-04-19 11:38:43,1.0,3.0,0.0,0,en,fresh food tasti amaz servic 3 characterist co...
123,0JGMKaKJGVuDus5WcJzvjw,0.0,2017-04-30 03:37:51,1.0,5.0,2.0,1,en,unbeliev experi start finish reserv 9pm saturd...
191,9KmvrnyjWTr4sly0Dt770g,0.0,2017-08-16 04:41:54,0.0,1.0,0.0,0,en,locat excel restaur beauti food aw server rude...
205,XqJG7Ux_mMfMJnyG2Q4m2Q,0.0,2018-08-04 20:49:20,0.0,5.0,0.0,1,en,incred experi noth particular special neighbor...


# Accuracy of sentiment estimation on translated text

In [67]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; use the standalone joblib package and fall back to the vendored copy
# only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles its input and can execute arbitrary code --
# only load model files from a trusted source.
tfidf  = joblib.load('models/TFIDF_model.joblib')
# svd    = joblib.load('models/SVD_model.joblib')
logreg = joblib.load('models/LogReg_model_noSVD.joblib')

In [76]:
# Vectorise the stemmed reviews with the loaded TF-IDF model and store the
# logistic-regression prediction for each row.
dfr_new['est_sentiment'] = logreg.predict(tfidf.transform(dfr_new['stemmed_text']))

In [93]:
import numpy as np
# Split once per detected language, then pull out the actual (sa*) and
# estimated (se*) sentiment labels as arrays: *f = French, *e = English.
fr_rows = dfr_new[dfr_new.lang == 'fr']
en_rows = dfr_new[dfr_new.lang == 'en']
saf = np.array(fr_rows.sentiment)
sef = np.array(fr_rows.est_sentiment)
sae = np.array(en_rows.sentiment)
see = np.array(en_rows.est_sentiment)

In [94]:
def _accuracy_pct(actual, predicted):
    """Return the percentage of positions where `predicted` equals `actual`.

    Uses an element-wise equality mean, so it stays a valid accuracy for any
    label encoding (the previous 1 - mean(|actual - predicted|) form is only
    an accuracy for 0/1 labels). Returns NaN for empty input instead of
    raising ZeroDivisionError.
    """
    if len(actual) == 0:
        return float('nan')
    return float(np.mean(actual == predicted)) * 100


print('Sentiment prediction is {:.2f}% accurate on translated text'.format(_accuracy_pct(saf, sef)))
print('Sentiment prediction is {:.2f}% accurate on original text'.format(_accuracy_pct(sae, see)))

Sentiment prediction is 76.53% accurate on translated text
Sentiment prediction is 89.26% accurate on original text
