## 0. Load Libraries

In [157]:
import pandas as pd
import re
from classifier import *
import numpy as np

## 1. Obtain Data

In [174]:
# Read the interim reviews table (the preview in the next cell shows it carries a `lang` column).
reviews_csv_path = "../data/interim/reviews_lang.csv"
reviews_df = pd.read_csv(reviews_csv_path)

In [175]:
# Preview the first rows to confirm which columns were loaded.
reviews_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,lang
0,2539,55688172,2015-12-04,25160947,Peter,Great host,ENGLISH
1,2539,97474898,2016-08-27,91513326,Liz,Nice room for the price. Great neighborhood. J...,ENGLISH
2,2539,105340344,2016-10-01,90022459,Евгений,Very nice apt. New remodeled.,ENGLISH
3,2539,133131670,2017-02-20,116165195,George,Great place to stay for a while. John is a gre...,ENGLISH
4,2539,138349776,2017-03-19,118432644,Carlos,.,Unknown


In [176]:
# (rows, columns) of the loaded table.
reviews_df.shape

(1106639, 7)

## 2. Sentiment Analysis

### English Sentiment Analysis

In [161]:
#!conda install -y -c conda-forge twython

In [162]:
# `nltk` is not imported explicitly in any earlier cell (it presumably leaks in through
# `from classifier import *`), so import it here to keep this cell self-contained.
import nltk

# Download the lexicon resource used by NLTK's VADER sentiment analyzer.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/shivanigoel/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [163]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [164]:
# Analyzer for English reviews; run_sentiment_analysis reads the 'compound' entry of
# sid.polarity_scores(text).
sid = SentimentIntensityAnalyzer()

In [101]:
#reviews_df["sentiments"] = reviews_df["comments"].apply(sid.polarity_scores)
#reviews_df = pd.concat([reviews_df.drop(['sentiments'], axis=1), reviews_df['sentiments'].apply(pd.Series)], axis=1)

### Spanish Sentiment Analysis

https://github.com/aylliote/senti-py

The classifier returns a score between 0 (negative) and 1 (positive).

In [165]:
# Classifier for Spanish reviews. `SentimentClassifier` is not imported by name in this
# notebook; presumably it comes from `from classifier import *` (senti-py) — TODO confirm.
clf = SentimentClassifier()

In [168]:
# Two hand-written Spanish probes: one positive, one negative.
x = "Esto es tan bueno me encanta"  # "This is so good. Love it."
y = "Ubicación asquerosa. Nunca volveré."  # "Disgusting location. Will never come back."

# Print the score of the negative probe first, then the positive one.
for spanish_sample in (y, x):
    print(clf.predict(spanish_sample))

0.04990360002581841
0.8815847389709347


### German Sentiment Analysis

In [50]:
!pip install -U textblob-de
!python -m textblob.download_corpora

Collecting textblob-de
[?25l  Downloading https://files.pythonhosted.org/packages/47/61/7a5759c3ac60bf9330a50ce81ebe7f0aac1bc6c674d45e00f7b3e190f5af/textblob_de-0.4.3-py2.py3-none-any.whl (468kB)
[K    100% |████████████████████████████████| 471kB 3.2MB/s 
Installing collected packages: textblob-de
Successfully installed textblob-de-0.4.3
[nltk_data] Downloading package brown to
[nltk_data]     /Users/shivanigoel/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shivanigoel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shivanigoel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shivanigoel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package 

In [51]:
from textblob import Blobber
# Import the German classes under aliases: a later cell imports identically named
# `PatternTagger` / `PatternAnalyzer` from `textblob_fr`, and with unaliased names
# re-running this cell after that one would silently build `tb_de` from the French classes.
from textblob_de import PatternTagger as PatternTaggerDE, PatternAnalyzer as PatternAnalyzerDE

# German blob factory: tb_de(text) returns a blob whose .sentiment.polarity is used for scoring.
tb_de = Blobber(pos_tagger=PatternTaggerDE(), analyzer=PatternAnalyzerDE())

In [170]:
# Sanity-check the German analyzer on one hand-written sentence.
german_sample = "Das Leben ist eine Schachtel Pralinen schlecht"
blob1_de = tb_de(german_sample)
german_polarity = blob1_de.sentiment.polarity
print(german_polarity)

-1.0


### French Sentiment Analysis

In [67]:
from textblob import Blobber
# Import the French classes under aliases: the German cell imports identically named
# `PatternTagger` / `PatternAnalyzer` from `textblob_de`, so unaliased imports here would
# rebind those names and make the result depend on cell execution order.
from textblob_fr import PatternTagger as PatternTaggerFR, PatternAnalyzer as PatternAnalyzerFR

# French blob factory: tb_fe(text) returns a blob whose .sentiment[0] is used for scoring.
tb_fe = Blobber(pos_tagger=PatternTaggerFR(), analyzer=PatternAnalyzerFR())

In [172]:
# Sanity-check the French analyzer on one positive and one negative sentence;
# sentiment[0] is the first component of the sentiment tuple (the polarity).
blob1 = tb_fe(u"Quelle belle matinée")
print(blob1.sentiment[0])
# Use the French blobber `tb_fe`: the previous `tb(...)` referred to a name that is not
# defined in any visible cell and fails with NameError on a fresh kernel.
blob2 = tb_fe(u"C'est une voiture terribles.")
print(blob2.sentiment[0])

0.8
-0.7


## Finale

In [178]:
# Count reviews whose comment text is missing.
reviews_df.comments.isna().sum()

482

In [179]:
# Drop reviews with no comment text. `.copy()` makes the result an independent frame, so
# adding the `polarity` column to it later does not trigger pandas' SettingWithCopyWarning.
reviews_df = reviews_df[reviews_df.comments.notna()].copy()

In [180]:
def run_sentiment_analysis(row):
    """Score one review with the analyzer that matches its language label.

    Parameters
    ----------
    row : mapping (for example a DataFrame row passed by ``apply(..., axis=1)``)
        Must provide ``row['comments']`` (the review text) and ``row['lang']``
        (the language label).

    Returns
    -------
    float
        * label contains ``'ENGLISH'``: ``sid.polarity_scores(text)['compound']``
        * label contains ``'SPANISH'``: ``clf.predict(text)``
        * label contains ``'FRENCH'``:  ``tb_fe(text).sentiment[0]``
        * label contains ``'GERMAN'``:  ``tb_de(text).sentiment.polarity``
        * ``np.nan`` for any other label, or when the label or the text is
          not a string (for example a missing value).
    """
    text = row['comments']
    lang = row['lang']

    # A missing label or comment arrives as NaN (a float). `'ENGLISH' in nan` raises
    # TypeError, so treat such rows as unscorable instead of aborting the whole apply().
    if not isinstance(lang, str) or not isinstance(text, str):
        return np.nan

    # Substring checks are kept on purpose so labels that merely contain the language
    # name keep matching, in the same priority order as before.
    if 'ENGLISH' in lang:
        return float(sid.polarity_scores(text)['compound'])
    if 'SPANISH' in lang:
        return float(clf.predict(text))
    if 'FRENCH' in lang:
        return float(tb_fe(text).sentiment[0])
    if 'GERMAN' in lang:
        return float(tb_de(text).sentiment.polarity)
    return np.nan

In [None]:
# Score every review row by row; rows that run_sentiment_analysis cannot score get NaN.
# NOTE(review): this cell has no execution count (`In [None]`), so the outputs shown in
# the following cells were presumably produced by an earlier run — confirm before relying on them.
reviews_df['polarity'] = reviews_df.apply(run_sentiment_analysis, axis=1)

In [96]:
# Preview the table after the `polarity` column has been added.
reviews_df.head()

Unnamed: 0.1,Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,lang,comments_clean,polarity
0,24431,60948,301066782,2018-08-03,9485766,Melanie,Positives first: very large space for New York...,ENGLISH,positive first large space new york interestin...,0.9229
1,848983,19018581,290463512,2018-07-14,6522555,Vicki,Emma and Alistair are great. They even gave me...,ENGLISH,emma alistair great even give shelf fridge acc...,0.96
2,299568,3835681,20630431,2014-10-02,1439046,Jan,"The Host is perfect guy, the place is really n...",ENGLISH,host perfect guy place really nice look close ...,0.7778
3,619865,12600938,122667036,2016-12-25,50020643,Jerez,"We had a lovely stay, very accommodating, help...",ENGLISH,lovely stay accommodate helpful welcome would ...,0.9058
4,539573,9713045,207205374,2017-10-28,11537814,Jeffrey,We like to thank Cedric to make special arrang...,ENGLISH,like thank cedric make special arrangement che...,0.9081


In [97]:
# Count reviews that received no polarity score (NaN).
reviews_df.polarity.isna().sum()

3560

### Normalization of Spanish review polarity

In [101]:
# Target interval [a, b] that Spanish scores are rescaled into.
a = -1.0
b = 1.0
# Interval [minimum, maximum] of the raw Spanish classifier score.
minimum = 0.0
maximum = 1.0
def normalize_polarity(row):
    """Rescale a Spanish review's polarity from [minimum, maximum] to [a, b].

    Parameters
    ----------
    row : mapping (for example a DataFrame row passed by ``apply(..., axis=1)``)
        Must provide ``row['lang']`` and ``row['polarity']``.

    Returns
    -------
    The min-max rescaled polarity when the language label is a string
    containing ``'SPANISH'``; otherwise ``row['polarity']`` unchanged.

    Notes
    -----
    The transform is not idempotent: applying it to values that were already
    rescaled maps them again (e.g. 0.0 -> -1.0), so apply it exactly once.
    """
    lang = row['lang']
    # A missing label arrives as NaN (a float); `'SPANISH' in nan` raises TypeError, so
    # only string labels are tested and everything else passes through untouched.
    if isinstance(lang, str) and 'SPANISH' in lang:
        return ((b-a)*(row['polarity']-minimum)/(maximum-minimum)) + a
    return row['polarity']
    

In [102]:
# Compute the rescaled polarity for every row into a separate Series (reviews_df is not
# modified by this cell).
normalized_polarity = reviews_df.apply(normalize_polarity,axis = 1)

In [106]:
# Overwrite `polarity` with the rescaled values.
# NOTE(review): re-running the previous cell and then this one would rescale the Spanish
# rows a second time, because the input column is overwritten here.
reviews_df['polarity'] = normalized_polarity

### Dropping rows with missing polarity

In [109]:
# Keep only the reviews that received a polarity score.
has_polarity = reviews_df.polarity.notna()
reviews_df = reviews_df[has_polarity]

In [110]:
# Number of remaining (scored) reviews per language label.
reviews_df.lang.value_counts()

ENGLISH    98383
FRENCH      3875
SPANISH     3529
GERMAN      1273
Name: lang, dtype: int64

## Handle bot-generated reviews

In [118]:
# Inspect the 25 lowest-polarity reviews together with their identifiers.
(
    reviews_df
    .sort_values("polarity", ascending=True)
    .loc[:, ["comments", "polarity", "listing_id", "id"]]
    .head(25)
)

Unnamed: 0,comments,polarity,listing_id,id
81763,Ganz schlechte Unterkunft. Schlechte Gerüche w...,-1.0,6327222,96784934
30143,eine schäbige Wohnung. Nie wieder!,-1.0,25742022,329351594
49889,I deeply advise to check on other places befor...,-0.999974,20503833,332536080
45914,Alexander no estaba en el país y quien se enca...,-0.997277,11799451,142496550
74148,This is a filthy hole in a filthy old building...,-0.9969,12584271,276452660
107320,"Yoni, mas puntualidad con el check in no estuv...",-0.996711,15329781,248432253
54008,"Llegamos a las 21:15, después de pasar el duro...",-0.996697,593292,180473246
64233,El baño es un verdadero problema ya que la taz...,-0.996151,14820644,230585296
73997,There was a snow storm earlier in the week and...,-0.9957,30463017,408052186
56063,"During the stay, Zooey and I felt very happy t...",-0.9948,284208,190661427


In [126]:
# Print the full text of review 56063 for listing 284208 (the table preview truncates it).
listing_reviews = reviews_df[reviews_df['listing_id'] == 284208]
print(listing_reviews.comments[56063])

During the stay, Zooey and I felt very happy to live with Louisa and her husband. We have some big luggage making the room a bit crowded, but still lovely. Louisa and her husband were living in the parlor, so we can't have time in the beautiful parlor, which is a bit disappointing to us. We think the Chihuahua Zou Zou was very cute at first, but sometimes she seems not welcoming and comfortable about us there, making some angry sound when we came close to her. Unfortunately, the accident happened on the day we left. Louisa agreed to let us leave the luggage in the house till the afternoon after we checked out early for sightseeing, and she also went out for parade so she agreed we kept the keys until we came back in case she wasn’t home. Since the weekend subway delay, we came back two hours later than the time we discussed with her first, but we informed her in advance and she was ok about it. When we entered the long hall and accessed to parlor, Zou zou seems very angry about our app

In [145]:
# Polarity distribution of reviews whose text starts with 'The host canceled'.
host_canceled_mask = reviews_df.comments.str.startswith('The host canceled')
reviews_df.loc[host_canceled_mask, 'polarity'].value_counts()

 0.0000    1615
-0.3818       4
 0.7263       1
Name: polarity, dtype: int64

In [146]:
# Polarity distribution of reviews whose text starts with 'The reservation was canceled'.
reservation_canceled_mask = reviews_df.comments.str.startswith('The reservation was canceled')
reviews_df.loc[reservation_canceled_mask, 'polarity'].value_counts()

0.0    149
Name: polarity, dtype: int64

In [152]:
# Remove every review whose text starts with 'The host canceled'.
is_host_cancellation = reviews_df.comments.str.startswith('The host canceled')
reviews_df = reviews_df[~is_host_cancellation]

In [153]:
# Remove every review whose text starts with 'The reservation was canceled'.
is_reservation_cancellation = reviews_df.comments.str.startswith('The reservation was canceled')
reviews_df = reviews_df[~is_reservation_cancellation]

## Writing the results to CSV

In [156]:
# Persist the scored and filtered reviews.
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed column by default,
# and the target path has no `.csv` extension; both are left as-is because downstream
# readers may depend on them — confirm before changing.
reviews_df.to_csv('../data/interim/final_ratings')