In [2]:
import collections
import itertools
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from langdetect import DetectorFactory, detect
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from textblob import TextBlob

%matplotlib inline

In [3]:
# Read the raw review table from disk and display it.
df = pd.read_csv(filepath_or_buffer='reviews.csv')
df

Unnamed: 0,Customer Review,text
0,Review 1,I travel alot and really enjoyed my stay here....
1,Review 2,Stayed one night for a seminar at the Universi...
2,Review 3,I stayed at the the BW Downtown on two separat...
3,Review 4,This is my first year to go to with a group of...
4,Review 5,I travel with several friends (we're from a sm...
...,...,...
195,Review 196,We try to use any excuse to celebrate a weeken...
196,Review 197,"The Sorella is a nice, contemporary hotel. Qui..."
197,Review 198,Dieses ist immer wieder eines meiner Favourite...
198,Review 199,Sch√∂ne Zimmer und sind sehr gut eingerichtet....


In [4]:
# Number of whitespace-separated tokens in each raw review.
df['word_count'] = df['text'].str.split().str.len()

# langdetect samples randomly internally, so the same text can be labelled
# differently from run to run. Pinning the seed before the first detection
# makes the English filter below reproducible.
DetectorFactory.seed = 0
df['language'] = df['text'].apply(detect)

# Keep only the reviews detected as English. The explicit copy makes new_df an
# independent frame, so columns can be added to it without touching df.
new_df = df[df['language'] == 'en'].copy()
new_df

Unnamed: 0,Customer Review,text,word_count,language
0,Review 1,I travel alot and really enjoyed my stay here....,39,en
1,Review 2,Stayed one night for a seminar at the Universi...,76,en
2,Review 3,I stayed at the the BW Downtown on two separat...,241,en
3,Review 4,This is my first year to go to with a group of...,97,en
4,Review 5,I travel with several friends (we're from a sm...,77,en
...,...,...,...,...
193,Review 194,"Got a Deluxe Room facing the ""green"" to watch ...",61,en
194,Review 195,"The Hotel was super clean, nice modern-retro d...",81,en
195,Review 196,We try to use any excuse to celebrate a weeken...,100,en
196,Review 197,"The Sorella is a nice, contemporary hotel. Qui...",28,en


In [67]:
# Summary statistics of the per-review word count, computed on `df`
# (i.e. before the English-only filter that produces `new_df`).
df.word_count.describe()

# Disabled draft: line plot of word count per review, longest first.
# df_sorted = df.sort_values('word_count', ascending = False)
# fig = plt.figure(figsize=(20,10))
# plt.plot('Customer Review','word_count', data=df_sorted)
# plt.xlabel('Reviews')
# plt.ylabel('Word Count')

# TODO: add a boxplot based on these statistics.

count    200.000000
mean     125.565000
std      100.740703
min       10.000000
25%       60.000000
50%       99.500000
75%      160.500000
max      678.000000
Name: word_count, dtype: float64

In [8]:

# Shared NLP helpers read as globals by clean_text / stem / lemmatizer.
stopword = stopwords.words('english')  # English stopword list from the NLTK corpus
ps = nltk.PorterStemmer()              # Porter stemmer used by stem()
wn = WordNetLemmatizer()               # WordNet lemmatizer used by lemmatizer()

def clean_text(text, stop_words=None):
    """Strip punctuation and digits from ``text``, tokenize it and drop stopwords.

    Parameters
    ----------
    text : str
        Text to clean. Case is not changed here; the visible caller
        lower-cases each review before passing it in.
    stop_words : iterable of str, optional
        Tokens to discard. When omitted, the notebook-level ``stopword``
        list is used.

    Returns
    -------
    list of str
        The non-empty tokens, in their original order, that are not stopwords.
    """
    if stop_words is None:
        stop_words = stopword
    # Set membership is O(1); scanning the stopword list per token is not.
    stop_words = set(stop_words)

    # Punctuation and digit characters are deleted (not replaced by a space),
    # so hyphenated words fuse: "modern-retro" becomes "modernretro".
    kept_chars = [ch for ch in text
                  if ch not in string.punctuation and not ch.isdigit()]
    stripped = "".join(kept_chars)

    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    tokens = re.split(r'\W+', stripped)

    # re.split yields '' when the text starts or ends with a non-word
    # character; those empty tokens must not reach the vocabulary.
    return [tok for tok in tokens if tok and tok not in stop_words]

# Lower-case every review, then run clean_text on it to obtain the token list
# (punctuation, digits and stopwords removed).
new_df['body_clean'] = new_df['text'].map(lambda review: clean_text(review.lower()))

def stem(stem_text):
    """Return a list with each token of ``stem_text`` stemmed by ``ps``.

    ``ps`` is the notebook-level Porter stemmer; the order of the tokens is
    preserved.
    """
    stemmed = []
    for token in stem_text:
        stemmed.append(ps.stem(token))
    return stemmed

# Stem the cleaned token list of every review.
new_df['body_clean_stem'] = new_df['body_clean'].apply(stem)

def lemmatizer(lem_text, pos='v'):
    """Lemmatize every token of ``lem_text`` with the WordNet lemmatizer ``wn``.

    Parameters
    ----------
    lem_text : iterable of str
        Tokens to lemmatize.
    pos : str, default 'v'
        Part-of-speech tag forwarded to ``wn.lemmatize``. The default keeps
        the original behaviour of treating every token as a verb, which
        leaves plural nouns such as "friends" unchanged; pass another WordNet
        tag (e.g. 'n') to lemmatize a different word class.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    return [wn.lemmatize(word, pos=pos) for word in lem_text]

# Lemmatize the cleaned token list of every review.
new_df['body_clean_lem'] = new_df['body_clean'].apply(lemmatizer)

new_df

Unnamed: 0,Customer Review,text,word_count,language,body_clean,body_clean_stem,body_clean_lem
0,Review 1,I travel alot and really enjoyed my stay here....,39,en,"[travel, alot, really, enjoyed, stay, hotel, c...","[travel, alot, realli, enjoy, stay, hotel, cle...","[travel, alot, really, enjoy, stay, hotel, cle..."
1,Review 2,Stayed one night for a seminar at the Universi...,76,en,"[stayed, one, night, seminar, university, clea...","[stay, one, night, seminar, univers, clean, co...","[stay, one, night, seminar, university, clean,..."
2,Review 3,I stayed at the the BW Downtown on two separat...,241,en,"[stayed, bw, downtown, two, separate, business...","[stay, bw, downtown, two, separ, busi, trip, n...","[stay, bw, downtown, two, separate, business, ..."
3,Review 4,This is my first year to go to with a group of...,97,en,"[first, year, go, group, quilter, southeast, k...","[first, year, go, group, quilter, southeast, k...","[first, year, go, group, quilter, southeast, k..."
4,Review 5,I travel with several friends (we're from a sm...,77,en,"[travel, several, friends, small, town, girls,...","[travel, sever, friend, small, town, girl, wee...","[travel, several, friends, small, town, girls,..."
...,...,...,...,...,...,...,...
193,Review 194,"Got a Deluxe Room facing the ""green"" to watch ...",61,en,"[got, deluxe, room, facing, green, watch, th, ...","[got, delux, room, face, green, watch, th, jul...","[get, deluxe, room, face, green, watch, th, ju..."
194,Review 195,"The Hotel was super clean, nice modern-retro d...",81,en,"[hotel, super, clean, nice, modernretro, desig...","[hotel, super, clean, nice, modernretro, desig...","[hotel, super, clean, nice, modernretro, desig..."
195,Review 196,We try to use any excuse to celebrate a weeken...,100,en,"[try, use, excuse, celebrate, weekend, hotel, ...","[tri, use, excus, celebr, weekend, hotel, sore...","[try, use, excuse, celebrate, weekend, hotel, ..."
196,Review 197,"The Sorella is a nice, contemporary hotel. Qui...",28,en,"[sorella, nice, contemporary, hotel, quiet, ho...","[sorella, nice, contemporari, hotel, quiet, ho...","[sorella, nice, contemporary, hotel, quiet, ho..."


In [18]:
# One token list per review, flattened into a single stream of lemmas.
all_words = list(new_df['body_clean_lem'])
all_words_2 = list(itertools.chain.from_iterable(all_words))

# Frequency of every lemma, most frequent first.
counts_all_words_2 = Counter(all_words_2)
common_words = pd.DataFrame(
    counts_all_words_2.most_common(),
    columns=['words', 'count'],
)
common_words

Unnamed: 0,words,count
0,hotel,317
1,room,263
2,stay,211
3,staff,150
4,great,121
...,...,...
2663,wspa,1
2664,develop,1
2665,suprised,1
2666,sportsfitness,1


In [29]:
# Passing a callable as `analyzer` makes the vectorizer call it on every
# document and use the returned list as that document's tokens.
tfidf_vect = TfidfVectorizer(analyzer=lemmatizer)

# Feed the cleaned but not-yet-lemmatized tokens: the analyzer lemmatizes
# them itself. Fitting on 'body_clean_lem' would lemmatize a second time, and
# verb lemmatization is not idempotent ("bored" -> "bore" -> "bear"), so the
# features would no longer match the lemmas stored in 'body_clean_lem'.
X_tfidf = tfidf_vect.fit_transform(new_df['body_clean'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when present and fall back for older installations.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

# Densify the sparse matrix into a DataFrame with one column per lemma.
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)
X_tfidf_df

Unnamed: 0,aaa,aback,able,absolute,absolutely,absolutley,ac,acceptable,access,accessible,...,yummy,zone,ä,äì,äôd,äôs,äôt,äù,äúbottega,äúour
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153507,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
