In [4]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [5]:
import re

In [6]:
#get dataframe from csv file 

cwd = os.getcwd()

df = pd.read_csv(cwd+"/BritAir_reviews.csv", index_col=0)

In [7]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,Not Verified | BA is not treating its premium ...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,6th July 2023,United Kingdom
1,✅ Trip Verified | 24 hours before our departu...,4,5th July 2023,South Africa
2,✅ Trip Verified | We arrived at Heathrow at 0...,1,5th July 2023,United Kingdom
3,✅ Trip Verified | Original flight was cancell...,1,4th July 2023,Greece
4,Not Verified | Airport check in was functiona...,3,3rd July 2023,Italy


In [8]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [9]:
df['verified']

0       False
1        True
2        True
3        True
4       False
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

# Cleaning the reviews

In [10]:
#import nltk library for lemmatization of words

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

In [11]:
reviews_data = df.reviews.str.strip("✅ Trip Verified |")

In [12]:
#list for cleaned data corpus
corpus = []

In [13]:
#here, remove punctuations, make all lowercase, loop through each review, join it then add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [14]:
df['corpus'] = corpus

In [15]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | BA is not treating its premium ...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,6th July 2023,United Kingdom,False,verified ba treating premium economy passenger...
1,✅ Trip Verified | 24 hours before our departu...,4,5th July 2023,South Africa,True,hour departure ba cape town heathrow thursday ...
2,✅ Trip Verified | We arrived at Heathrow at 0...,1,5th July 2023,United Kingdom,True,arrived heathrow find flight ibiza cancelled b...
3,✅ Trip Verified | Original flight was cancell...,1,4th July 2023,Greece,True,original flight cancelled explanation represen...
4,Not Verified | Airport check in was functiona...,3,3rd July 2023,Italy,False,verified airport check functionary little warm...


# Cleaning the format date

In [17]:
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [19]:
df.date = pd.to_datetime(df.date)

In [20]:
df.date.head()

0   2023-07-06
1   2023-07-05
2   2023-07-05
3   2023-07-04
4   2023-07-03
Name: date, dtype: datetime64[ns]

# Cleaning the rating stars

In [21]:
#check for unique values and remove outliers
df.stars.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '4', '1', '3', '10', '2', '7',
       '9', '5', '8', '6', 'None'], dtype=object)

In [22]:
df.stars = df.stars.str.strip('\n\t\t\t\t\t\t\t\t\t\t\t\t\t5')

In [23]:
(df.star).value_counts()

1       797
2       395
3       394
8       346
7       300
10      298
9       295
        255
4       239
6       176
None      5
Name: stars, dtype: int64

In [26]:
# remove rows with None
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [28]:
#check the unique values again
df.stars.unique()

array(['', '4', '1', '3', '10', '2', '7', '9', '8', '6'], dtype=object)

In [36]:
#check for null values 
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3493
dtype: int64

In [30]:
df.country.isnull().value_counts()

False    3493
True        2
Name: country, dtype: int64

In [31]:
df.drop(df[df.country.isnull()== True].index, axis=0, inplace=True)

In [37]:
df.country.unique()

array(['United Kingdom', 'South Africa', 'Greece', 'Italy', 'Senegal',
       'United States', 'United Arab Emirates', 'Romania', 'Australia',
       'Canada', 'Cyprus', 'Spain', 'Chile', 'Sweden', 'Ireland',
       'Hong Kong', 'Netherlands', 'Austria', 'France', 'India',
       'Belgium', 'New Zealand', 'Czech Republic', 'Malaysia',
       'Singapore', 'Ghana', 'Germany', 'Switzerland', 'Bermuda',
       'Botswana', 'Brazil', 'Panama', 'Nigeria', 'Russian Federation',
       'Philippines', 'Bulgaria', 'Poland', 'Thailand', 'Argentina',
       'Mexico', 'Denmark', 'Saint Kitts and Nevis', 'Vietnam', 'Norway',
       'Jordan', 'Japan', 'Taiwan', 'China', 'Slovakia', 'Kuwait',
       'Israel', 'Qatar', 'South Korea', 'Saudi Arabia', 'Hungary',
       'Portugal', 'Cayman Islands', 'Costa Rica', 'Egypt', 'Iceland',
       'Laos', 'Turkey', 'Indonesia', 'Bahrain', 'Dominican Republic',
       'Luxembourg', 'Finland', 'Ukraine', 'Trinidad & Tobago'],
      dtype=object)

In [32]:
df.shape

(3493, 6)

In [33]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | BA is not treating its premium ...,,2023-07-06,United Kingdom,False,verified ba treating premium economy passenger...
1,✅ Trip Verified | 24 hours before our departu...,4,2023-07-05,South Africa,True,hour departure ba cape town heathrow thursday ...
2,✅ Trip Verified | We arrived at Heathrow at 0...,1,2023-07-05,United Kingdom,True,arrived heathrow find flight ibiza cancelled b...
3,✅ Trip Verified | Original flight was cancell...,1,2023-07-04,Greece,True,original flight cancelled explanation represen...
4,Not Verified | Airport check in was functiona...,3,2023-07-03,Italy,False,verified airport check functionary little warm...
...,...,...,...,...,...,...
3488,Flew World Traveller Plus for the first time. ...,7,2014-06-12,Canada,False,flew world traveller plus first time trip lhr ...
3489,Glasgow to LHR on a completely full flight. Th...,,2014-06-12,United Kingdom,False,glasgow lhr completely full flight crew amazin...
3490,LHR-CPH-LHR both A320s. Other than bussed to s...,4,2014-06-12,United Kingdom,False,lhr cph lhr bussed stand outward good bus unex...
3491,The outward trip Manchester - Heathrow - Milan...,7,2014-06-10,United Kingdom,False,outward trip manchester heathrow milan fine ev...


In [34]:
#export cleaned data
df.to_csv(cwd + "/cleaned-BritAir-reviews.csv")