## 0. Load Libraries

In [1]:
import pandas as pd
import re

In [2]:
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [3]:
import nltk
# Fetch the NLTK resources used by the cleaning step below. Each call reports
# "already up-to-date" when the package is already present in the local nltk_data folder.
nltk.download('stopwords')  # word list read by stopwords.words('english') in transform_text
nltk.download('averaged_perceptron_tagger')  # presumably the model behind nltk.pos_tag — TODO confirm for the installed NLTK version
nltk.download('wordnet')  # lexical database needed by WordNetLemmatizer
nltk.download('crubadan')  # NOTE(review): not referenced in the cells visible here — confirm it is still needed

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shivanigoel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shivanigoel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shivanigoel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package crubadan to
[nltk_data]     /Users/shivanigoel/nltk_data...
[nltk_data]   Package crubadan is already up-to-date!


True

## 1. Obtain Data

In [5]:
# Load the reviews that already carry a detected-language column (`lang`).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the CSV
# was written with its index; consider index_col=0 here if that column is not wanted.
reviews_df = pd.read_csv('../data/interim/reviews_lang.csv')

In [6]:
reviews_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,lang
0,2539,55688172,2015-12-04,25160947,Peter,Great host,ENGLISH
1,2539,97474898,2016-08-27,91513326,Liz,Nice room for the price. Great neighborhood. J...,ENGLISH
2,2539,105340344,2016-10-01,90022459,Евгений,Very nice apt. New remodeled.,ENGLISH
3,2539,133131670,2017-02-20,116165195,George,Great place to stay for a while. John is a gre...,ENGLISH
4,2539,138349776,2017-03-19,118432644,Carlos,.,Unknown


In [7]:
reviews_df.shape

(1106639, 7)

In [8]:
reviews_df['lang'].value_counts()

ENGLISH                984444
FRENCH                  39058
SPANISH                 34378
GERMAN                  13160
ITALIAN                  6516
Chinese                  5433
PORTUGUESE               5424
DUTCH                    3019
Korean                   2555
Japanese                 2154
RUSSIAN                  1994
Unknown                  1283
DANISH                   1054
SWEDISH                  1014
ChineseT                  998
FINNISH                   447
NORWEGIAN                 402
TURKISH                   392
CZECH                     384
POLISH                    352
CATALAN                   339
TAGALOG                   220
SLOVAK                    167
INDONESIAN                154
ROMANIAN                  148
IRISH                     131
HUNGARIAN                 103
MALTESE                    95
ESTONIAN                   86
WELSH                      85
LITHUANIAN                 73
GREEK                      61
GALICIAN                   57
HEBREW    

## 2. Sample Data

Computing sentiment polarity for all ~1.1 million reviews takes a long time. For quick EDA, I sample 1/10th of the rows (without replacement, with a fixed random seed so the sample is reproducible).

In [9]:
# Keep a 10% random sample, drawn without replacement; random_state is fixed so the
# same rows are selected on every fresh run.
# NOTE(review): this overwrites reviews_df, so re-running this cell without reloading
# the CSV samples 10% of the already-sampled frame.
reviews_df = reviews_df.sample(frac=0.1, replace=False, random_state=42)

In [10]:
reviews_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,lang
24431,60948,301066782,2018-08-03,9485766,Melanie,Positives first: very large space for New York...,ENGLISH
848983,19018581,290463512,2018-07-14,6522555,Vicki,Emma and Alistair are great. They even gave me...,ENGLISH
299568,3835681,20630431,2014-10-02,1439046,Jan,"The Host is perfect guy, the place is really n...",ENGLISH
619865,12600938,122667036,2016-12-25,50020643,Jerez,"We had a lovely stay, very accommodating, help...",ENGLISH
539573,9713045,207205374,2017-10-28,11537814,Jeffrey,We like to thank Cedric to make special arrang...,ENGLISH


In [11]:
reviews_df.shape

(110664, 7)

## 3. Cleaning Data

In [12]:
# Make every comment a plain string so transform_text can call str methods on it.
# Missing comments are NaN; a bare astype(str) would turn them into the literal text
# 'nan', which would then survive cleaning as a bogus token. Blank them out first.
reviews_df['comments'] = reviews_df['comments'].fillna('').astype(str)

In [13]:
def get_wordnet_pos(pos_tag):
    """Map a part-of-speech tag string to the WordNet POS constant for the lemmatizer.

    The tag is matched on its leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [25]:
_STOP_WORDS = None  # lazily-built frozenset of English stop words
_LEMMATIZER = None  # lazily-built WordNetLemmatizer shared by all calls


def _get_text_resources():
    """Return the (stop-word set, lemmatizer) pair, creating both on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        # A set gives constant-time membership tests; the corpus is read only once.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def transform_text(text):
    """Normalise one review into a space-separated string of lemmas.

    Steps: lower-case, split on whitespace, strip leading/trailing punctuation
    from each token, drop tokens that contain a digit, are English stop words
    or are empty, POS-tag the rest, lemmatize each token with its tag, drop
    one-letter results and join with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    stop_words, lemmatizer = _get_text_resources()

    # split() with no argument splits on any run of whitespace, so newlines and
    # tabs separate tokens too. split(" ") left them inside tokens, which also
    # prevented punctuation next to them from being stripped.
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]

    # Drop empty tokens, stop words and anything containing a digit.
    tokens = [
        token for token in tokens
        if token
        and token not in stop_words
        and not any(char.isdigit() for char in token)
    ]

    # Tag first so the lemmatizer can use the part of speech of each token.
    tagged = pos_tag(tokens)
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]

    # One-letter leftovers carry no useful signal.
    lemmas = [lemma for lemma in lemmas if len(lemma) > 1]

    return " ".join(lemmas)

In [26]:
# Run the cleaning pipeline on every sampled comment and store the result in a new column.
# NOTE(review): the sample is drawn from all languages (see the `lang` counts above) while
# transform_text uses English stop words — confirm whether non-English rows should be
# filtered out before this step.
reviews_df['comments_clean'] = reviews_df['comments'].apply(transform_text)

In [27]:
reviews_df.comments_clean.head()

24431     positive first large space new york interestin...
848983    emma alistair great even give shelf fridge acc...
299568    host perfect guy place really nice look close ...
619865    lovely stay accommodate helpful welcome would ...
539573    like thank cedric make special arrangement che...
Name: comments_clean, dtype: object

## 4. Write Cleaned Data to File

In [29]:
# Save the sampled reviews, including the new comments_clean column, for later steps.
# index=False leaves the DataFrame index out of the file.
reviews_df.to_csv('../data/interim/smdata_cleaned.csv', index = False)