In [1]:
# Import Libraries 
import pandas as pd
import numpy as np
from textblob import TextBlob
from itertools import chain
from langdetect import detect

In [2]:
# Project directories.
# The reviews file is read from the '/Users/...' location further down in this
# notebook, so the base directory uses the same spelling here; the previous
# 'C:/' prefix did not match the location the data is actually loaded from.
path = '/Users/lucywalker/Library/Mobile Documents/com~apple~CloudDocs/MSc Data Science/DissertationData'
# Results are written to a sub-folder of the data directory.
outpath = path + '/Results'

In [3]:
# Load the raw reviews export (compression is inferred from the .gz suffix)
# and preview the first rows.
reviews_csv = "/Users/lucywalker/Library/Mobile Documents/com~apple~CloudDocs/MSc Data Science/DissertationData/reviews.csv.gz"
data = pd.read_csv(reviews_csv)
data.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,15400,21032,2009-12-21,53815,Hailey,We loved staying at Phillipa's place in Chelse...
1,13913,80770,2010-08-18,177109,Michael,My girlfriend and I hadn't known Alina before ...
2,13913,367568,2011-07-11,19835707,Mathias,Alina was a really good host. The flat is clea...
3,13913,529579,2011-09-13,1110304,Kristin,Alina is an amazing host. She made me feel rig...
4,13913,595481,2011-10-03,1216358,Camilla,"Alina's place is so nice, the room is big and ..."


In [4]:
# Number of reviews in the raw file
data.shape[0]

1124776

In [5]:
# Inspect the dtype pandas inferred for each column of the raw reviews.
data.dtypes

listing_id        int64
id                int64
date             object
reviewer_id       int64
reviewer_name    object
comments         object
dtype: object

In [6]:
# Discard the automatically generated reviews: any comment containing the
# marker phrase below. Missing comments count as "not automated" here.
is_automated = data['comments'].str.contains('This is an automated posting', na=False)
data = data.loc[~is_automated]

len(data)

1111103

In [7]:
# Drop rows with missing values, then drop the reviewer name, which is not
# necessary for the analysis.
# `data` was produced by a boolean filter, so it is re-bound to the result here
# instead of using inplace=True, which can raise SettingWithCopyWarning on a
# filtered frame. `columns` takes a list rather than a set.
data = data.dropna().drop(columns=['reviewer_name'])
data.head()

Unnamed: 0,listing_id,id,date,reviewer_id,comments
0,15400,21032,2009-12-21,53815,We loved staying at Phillipa's place in Chelse...
1,13913,80770,2010-08-18,177109,My girlfriend and I hadn't known Alina before ...
2,13913,367568,2011-07-11,19835707,Alina was a really good host. The flat is clea...
3,13913,529579,2011-09-13,1110304,Alina is an amazing host. She made me feel rig...
4,13913,595481,2011-10-03,1216358,"Alina's place is so nice, the room is big and ..."


In [8]:
# Number of reviews left after dropping rows with missing values
data.shape[0]

1111007

In [9]:
# Remove short reviews (fewer than 30 characters) as they do not provide much information.
# .str.len() is the vectorised equivalent of calling len() on every row.
data = data.assign(SentenceLength=data['comments'].str.len())
# .copy() gives an independent frame, so adding the Year column below does not
# write into a filtered slice (SettingWithCopyWarning).
data = data[data['SentenceLength'] >= 30].copy()

# Distribution of reviews per year
data['Year'] = pd.DatetimeIndex(data['date']).year
ReviewDistributionDF = data.groupby(['Year']).comments.count().reset_index()


In [10]:
# Number of reviews left after removing the short ones
data.shape[0]

1047177

In [11]:
# Select reviews from 2019 onwards (no upper bound on the year is applied here)
# and transliterate the text to plain ASCII.

import unidecode

# Filter first so the per-row transliteration only runs on rows that are kept.
# .copy() makes the result an independent frame, so the column assignment below
# does not write into a filtered slice.
data = data.loc[data['Year'] >= 2019].copy()
data['comments'] = data['comments'].map(unidecode.unidecode)

data.info(verbose=True)
data = data.dropna()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 556489 entries, 15 to 1124775
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   listing_id      556489 non-null  int64 
 1   id              556489 non-null  int64 
 2   date            556489 non-null  object
 3   reviewer_id     556489 non-null  int64 
 4   comments        556489 non-null  object
 5   SentenceLength  556489 non-null  int64 
 6   Year            556489 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 34.0+ MB


In [12]:
# Number of reviews from 2019 onwards
data.shape[0]

556489

In [13]:
# Determining language of reviews

from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect's algorithm is non-deterministic unless it is seeded; fixing the
# seed makes the set of reviews classified as English reproducible across runs.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect assigns to `text`.

    Returns 'unknown' when langdetect raises LangDetectException (it does so
    for text it cannot extract any features from), so that a single
    undetectable review does not abort the whole pass.
    """
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


data['detect'] = data['comments'].apply(detect_language)

In [14]:
# Keep only the reviews whose detected language code is 'en'
is_english = data['detect'] == 'en'
data = data[is_english]

len(data)

509629

In [15]:
# Preview the first five English reviews
data.head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,comments,SentenceLength,Year,detect
15,13913,451955791,2019-05-12,58728173,"Alina's place is cosy, convenient, and full of...",268,2019,en
17,13913,467269212,2019-06-10,2291517,"Alina, was very quick to respond. Thoughtful a...",91,2019,en
18,13913,538005731,2019-09-29,7253695,Alina is an amazing host who welcomed me warml...,152,2019,en
19,13913,539957261,2019-10-02,45592945,Alina is a very relaxed and friendly host who ...,259,2019,en
44,13913,543287825,2019-10-07,28531625,Felt at home - Alina is an excellent host - ve...,114,2019,en


In [16]:
# Changing '&' to 'and'
# Series.replace / .str.replace return a new Series, so the result must be
# stored back; previously the replaced text was only displayed and then discarded.
# The replacement is padded with spaces so that "bed&breakfast" becomes
# "bed and breakfast" rather than the single token "bedandbreakfast".
data = data.assign(comments=data["comments"].str.replace("&", " and ", regex=False))
data["comments"]

15         Alina's place is cosy, convenient, and full of...
17         Alina, was very quick to respond. Thoughtful a...
18         Alina is an amazing host who welcomed me warml...
19         Alina is a very relaxed and friendly host who ...
44         Felt at home - Alina is an excellent host - ve...
                                 ...                        
1124771    The view from the appointment was fabulous, ve...
1124772    Great place, location, and host.<br/>Easy walk...
1124773    very good value . been there many times over t...
1124774    This place was wonderful. We were a family of ...
1124775    We had a great experience. The location is gre...
Name: comments, Length: 509629, dtype: object

In [17]:
# Changing American english to British English

import requests

import re
from functools import lru_cache

AMERICAN_SPELLINGS_URL = "https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/american_spellings.json"

# A "word" is a maximal run of word characters; each one is looked up as a whole.
_WORD_PATTERN = re.compile(r"\w+")


@lru_cache(maxsize=1)
def _load_american_spellings():
    """Download the American -> British spelling table once and cache it."""
    response = requests.get(AMERICAN_SPELLINGS_URL, timeout=30)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    return response.json()


def britishize(string, american_to_british_dict=None):
    """Convert American spellings in `string` to British spellings.

    Parameters
    ----------
    string : str or pandas.Series of str
        Text to convert. A Series is converted element by element.
    american_to_british_dict : dict, optional
        Mapping of American word -> British word. When omitted, the table is
        downloaded from AMERICAN_SPELLINGS_URL (once per session).

    Returns
    -------
    str or pandas.Series
        Same kind of object as `string`, with every whole word found in the
        mapping replaced by its British spelling.

    Notes
    -----
    * Only whole words are replaced, so a mapping such as tire -> tyre does not
      corrupt "entire".
    * Matching is case-sensitive; the mapping keys are compared as given.
    * Mapping keys are expected to be single words (word characters only).
    """
    if american_to_british_dict is None:
        american_to_british_dict = _load_american_spellings()

    def _to_british(match):
        word = match.group(0)
        return american_to_british_dict.get(word, word)

    if isinstance(string, str):
        return _WORD_PATTERN.sub(_to_british, string)

    # Anything else is treated as a pandas Series. Series.replace would only
    # match values that are *entirely* equal to an American word, so the
    # substitution goes through the .str accessor to act inside each text.
    return string.str.replace(_WORD_PATTERN, _to_british, regex=True)


# Run the spelling conversion on the 'comments' column only; every other
# column is carried over unchanged.
data = data.assign(comments=britishize(data['comments']))

In [18]:
# Replacing English language abbreviations
#
# Every key of abbr_dict is used as a regular expression and every value as its
# replacement. The two pattern entries are written as raw strings so that the
# backslashes reach the regex engine unchanged ("\." and "\s" are invalid
# escape sequences in ordinary string literals and trigger a warning).

punctuation = r'''["'?,\.]'''
abbr_dict = {
    # question words
    "what's": "what is",
    "what're": "what are",
    "who's": "who is",
    "who're": "who are",
    "where's": "where is",
    "where're": "where are",
    "when's": "when is",
    "when're": "when are",
    "how's": "how is",
    "how're": "how are",

    # forms of "to be"
    "i'm": "i am",
    "we're": "we are",
    "you're": "you are",
    "they're": "they are",
    "it's": "it is",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "there's": "there is",
    "there're": "there are",

    # "have"
    "i've": "i have",
    "we've": "we have",
    "you've": "you have",
    "they've": "they have",
    "who've": "who have",
    "would've": "would have",
    "not've": "not have",

    # "will"
    "i'll": "i will",
    "we'll": "we will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "it'll": "it will",
    "they'll": "they will",

    # negations
    "isn't": "is not",
    "wasn't": "was not",
    "aren't": "are not",
    "weren't": "were not",
    "can't": "can not",
    "couldn't": "could not",
    "don't": "do not",
    "didn't": "did not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "doesn't": "does not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "won't": "will not",

    # remaining punctuation is removed
    punctuation: '',
    # replace multi space with one single space
    r'\s+': ' ',
}

def process_data(dataset_name):
    """Normalise the review text in the ``comments`` column of `dataset_name`.

    The comments are cast to ``str``, lower-cased, and then every pattern in
    ``abbr_dict`` is replaced by its value (regex replacement).

    The frame is modified in place and also returned; its first two rows are
    displayed as a quick check.

    Parameters
    ----------
    dataset_name : pandas.DataFrame
        Frame with a ``comments`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    # Cast before lower-casing: .str.lower() yields NaN for non-string values,
    # which a later astype(str) would turn into the literal text 'nan'.
    comments = dataset_name['comments'].astype(str).str.lower()
    # Apply the replacements to the review text only, rather than running every
    # regex over each column of the frame.
    dataset_name['comments'] = comments.replace(abbr_dict, regex=True)
    display(dataset_name.head(2))
    return dataset_name

data = process_data(data)

Unnamed: 0,listing_id,id,date,reviewer_id,comments,SentenceLength,Year,detect
15,13913,451955791,2019-05-12,58728173,alinas place is cosy convenient and full of ec...,268,2019,en
17,13913,467269212,2019-06-10,2291517,alina was very quick to respond thoughtful and...,91,2019,en


In [19]:
len(data)

509629

In [20]:
# Replace any reference to the host ('she', 'he', 'they', 'person' and the
# capitalised pronouns) with the single token 'host'.
#
# The result is assigned back to the column: calling .replace(..., inplace=True)
# on the selection data['comments'] acts on an intermediate object, which pandas
# warns about and which has no effect on `data` under Copy-on-Write.
# One combined pattern covers the seven words that were previously replaced in
# separate passes; the replacement 'host' matches none of them, so a single
# pass gives the same text.
data['comments'] = data['comments'].str.replace(
    r'\b(?:[Ss]he|[Hh]e|[Tt]hey|person)\b', 'host', regex=True
)

In [21]:
# Make sure the review text is string-typed and entirely lower case
comments_text = data['comments'].astype(str)
data['comments'] = comments_text.str.lower()

In [22]:
# Tokenisation

import nltk


from nltk.tokenize import RegexpTokenizer

# Tokens are maximal runs of word characters. The pattern is a raw string
# because "\w" is an invalid escape sequence in an ordinary string literal.
regexp = RegexpTokenizer(r'\w+')

data['comments_token'] = data['comments'].apply(regexp.tokenize)
data.head(3)

Unnamed: 0,listing_id,id,date,reviewer_id,comments,SentenceLength,Year,detect,comments_token
15,13913,451955791,2019-05-12,58728173,alinas place is cosy convenient and full of ec...,268,2019,en,"[alinas, place, is, cosy, convenient, and, ful..."
17,13913,467269212,2019-06-10,2291517,alina was very quick to respond thoughtful and...,91,2019,en,"[alina, was, very, quick, to, respond, thought..."
18,13913,538005731,2019-09-29,7253695,alina is an amazing host who welcomed me warml...,152,2019,en,"[alina, is, an, amazing, host, who, welcomed, ..."


In [23]:
# Preview the first five rows including the new token column
data.head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,comments,SentenceLength,Year,detect,comments_token
15,13913,451955791,2019-05-12,58728173,alinas place is cosy convenient and full of ec...,268,2019,en,"[alinas, place, is, cosy, convenient, and, ful..."
17,13913,467269212,2019-06-10,2291517,alina was very quick to respond thoughtful and...,91,2019,en,"[alina, was, very, quick, to, respond, thought..."
18,13913,538005731,2019-09-29,7253695,alina is an amazing host who welcomed me warml...,152,2019,en,"[alina, is, an, amazing, host, who, welcomed, ..."
19,13913,539957261,2019-10-02,45592945,alina is a very relaxed and friendly host who ...,259,2019,en,"[alina, is, a, very, relaxed, and, friendly, h..."
44,13913,543287825,2019-10-07,28531625,felt at home - alina is an excellent host - ve...,114,2019,en,"[felt, at, home, alina, is, an, excellent, hos..."


In [24]:
# remove stop words

from nltk.corpus import stopwords
# Extra stop words on top of NLTK's English list.
# NOTE: tokens come from a \w+ tokenizer, so the two-word entry 'place stay'
# can never equal a single token and has no effect.
stopwords_other = ['though', 'stay', 'would', 'really', 'one', 'could', 'place stay', 'get', 'thank', 'claudia', 'want', 'know', 'use', 'go', 'everything', 'give', 'make', 'us', 'also', 'ever', 'need', 'however', 'say', 'even']

# NOTE: this re-binds the name `stopwords` from the NLTK corpus reader imported
# above to a plain list of words.
stopwords = nltk.corpus.stopwords.words("english") + stopwords_other

# Membership tests against a set are constant-time; scanning the list for every
# token of every review is needlessly slow.
stopword_lookup = frozenset(stopwords)

data['comments_token'] = data['comments_token'].apply(
    lambda tokens: [token for token in tokens if token not in stopword_lookup]
)
data.head(3)


Unnamed: 0,listing_id,id,date,reviewer_id,comments,SentenceLength,Year,detect,comments_token
15,13913,451955791,2019-05-12,58728173,alinas place is cosy convenient and full of ec...,268,2019,en,"[alinas, place, cosy, convenient, full, eclect..."
17,13913,467269212,2019-06-10,2291517,alina was very quick to respond thoughtful and...,91,2019,en,"[alina, quick, respond, thoughtful, accommodat..."
18,13913,538005731,2019-09-29,7253695,alina is an amazing host who welcomed me warml...,152,2019,en,"[alina, amazing, host, welcomed, warmly, alway..."


In [25]:
# Re-join the tokens of each review into one space-separated string,
# keeping only the tokens that are longer than two characters
def _join_long_tokens(tokens):
    return ' '.join(token for token in tokens if len(token) > 2)

data['comments_string'] = data['comments_token'].map(_join_long_tokens)
data.head()


Unnamed: 0,listing_id,id,date,reviewer_id,comments,SentenceLength,Year,detect,comments_token,comments_string
15,13913,451955791,2019-05-12,58728173,alinas place is cosy convenient and full of ec...,268,2019,en,"[alinas, place, cosy, convenient, full, eclect...",alinas place cosy convenient full eclectic art...
17,13913,467269212,2019-06-10,2291517,alina was very quick to respond thoughtful and...,91,2019,en,"[alina, quick, respond, thoughtful, accommodat...",alina quick respond thoughtful accommodating d...
18,13913,538005731,2019-09-29,7253695,alina is an amazing host who welcomed me warml...,152,2019,en,"[alina, amazing, host, welcomed, warmly, alway...",alina amazing host welcomed warmly always talk...
19,13913,539957261,2019-10-02,45592945,alina is a very relaxed and friendly host who ...,259,2019,en,"[alina, relaxed, friendly, host, made, complet...",alina relaxed friendly host made completely we...
44,13913,543287825,2019-10-07,28531625,felt at home - alina is an excellent host - ve...,114,2019,en,"[felt, home, alina, excellent, host, friendly,...",felt home alina excellent host friendly makes ...


In [26]:
# Persist the pre-processed reviews; the row index is not written to the file
preprocessed_csv = 'preprocessed_data.csv'
data.to_csv(preprocessed_csv, index=False)


In [27]:
import pandas as pd

# Reload the pre-processed reviews from disk.
# NOTE(review): after the CSV round trip the 'comments_token' column holds the
# text representation of each list (e.g. "['alinas', 'place', ...]"), not a
# list object - parse it before treating it as tokens.
preprocessed_file = 'preprocessed_data.csv'
df = pd.read_csv(preprocessed_file)
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,comments,SentenceLength,Year,detect,comments_token,comments_string
0,13913,451955791,2019-05-12,58728173,alinas place is cosy convenient and full of ec...,268,2019,en,"['alinas', 'place', 'cosy', 'convenient', 'ful...",alinas place cosy convenient full eclectic art...
1,13913,467269212,2019-06-10,2291517,alina was very quick to respond thoughtful and...,91,2019,en,"['alina', 'quick', 'respond', 'thoughtful', 'a...",alina quick respond thoughtful accommodating d...
2,13913,538005731,2019-09-29,7253695,alina is an amazing host who welcomed me warml...,152,2019,en,"['alina', 'amazing', 'host', 'welcomed', 'warm...",alina amazing host welcomed warmly always talk...
3,13913,539957261,2019-10-02,45592945,alina is a very relaxed and friendly host who ...,259,2019,en,"['alina', 'relaxed', 'friendly', 'host', 'made...",alina relaxed friendly host made completely we...
4,13913,543287825,2019-10-07,28531625,felt at home - alina is an excellent host - ve...,114,2019,en,"['felt', 'home', 'alina', 'excellent', 'host',...",felt home alina excellent host friendly makes ...


In [28]:
# One frame per review year, split off the reloaded pre-processed data
review_year = df["Year"]
df_2019, df_2020, df_2021, df_2022 = (
    df[review_year == year] for year in (2019, 2020, 2021, 2022)
)
