In [5]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [6]:
#create a dataframe from csv file

cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)

In [7]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,Not Verified | Happy with the flight crew. Ha...,5.0,24th September 2024,United States
1,✅ Trip Verified | Horrible service from boar...,10.0,22nd September 2024,Morocco
2,Not Verified | My wife and I are very disappo...,1.0,13th September 2024,United States
3,Not Verified | We flew BA between Heathrow an...,1.0,13th September 2024,Australia
4,Not Verified | Absolutely disgusted with BA. ...,8.0,13th September 2024,United Kingdom


In [8]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [9]:
df['verified']

0       False
1        True
2       False
3       False
4       False
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

In [10]:
import re
import spacy

nlp = spacy.load("en_core_web_sm")

stop_words = spacy.lang.en.stop_words.STOP_WORDS

reviews_data = df.reviews.str.replace(r"✅ Trip Verified \|", "", regex=True).str.strip()

corpus = []

for rev in reviews_data:
    rev = re.sub(r'[^a-zA-Z]', ' ', rev)  
    rev = rev.lower()  
    doc = nlp(rev)  
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and not token.is_punct]
    cleaned_review = " ".join(tokens)  
    corpus.append(cleaned_review)  


In [11]:
df['corpus'] = corpus

In [12]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Happy with the flight crew. Ha...,5.0,24th September 2024,United States,False,verify happy flight crew hadn t plane ...
1,✅ Trip Verified | Horrible service from boar...,10.0,22nd September 2024,Morocco,True,horrible service boarding landing fly london...
2,Not Verified | My wife and I are very disappo...,1.0,13th September 2024,United States,False,verify wife disappointed fly british airwa...
3,Not Verified | We flew BA between Heathrow an...,1.0,13th September 2024,Australia,False,verify fly ba heathrow berlin way connecti...
4,Not Verified | Absolutely disgusted with BA. ...,8.0,13th September 2024,United Kingdom,False,verify absolutely disgusted ba flight ca...


In [20]:
df.dtypes

reviews      object
stars       float64
date         object
country      object
verified       bool
corpus       object
dtype: object

In [None]:
df.date = pd.to_datetime(df.date)

In [22]:
df.date.head()

0    24th September 2024
1    22nd September 2024
2    13th September 2024
3    13th September 2024
4    13th September 2024
Name: date, dtype: object

In [23]:

df.stars.unique()

array([ 5., 10.,  1.,  8.,  4.,  2.,  9.,  3.,  6.,  7., nan])

In [24]:
df.stars.value_counts()

stars
1.0     868
2.0     407
3.0     398
8.0     341
10.0    286
7.0     272
9.0     270
5.0     246
4.0     236
6.0     173
Name: count, dtype: int64

In [None]:

df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [26]:

df.stars.unique()

array([ 5., 10.,  1.,  8.,  4.,  2.,  9.,  3.,  6.,  7., nan])

In [27]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3496
         True   False  False    False     False        3
         False  False  True     False     False        1
Name: count, dtype: int64

In [28]:
df.country.isnull().value_counts()

country
False    3499
True        1
Name: count, dtype: int64

In [29]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [30]:
df.shape

(3499, 6)

In [31]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Happy with the flight crew. Ha...,5.0,24th September 2024,United States,False,verify happy flight crew hadn t plane ...
1,✅ Trip Verified | Horrible service from boar...,10.0,22nd September 2024,Morocco,True,horrible service boarding landing fly london...
2,Not Verified | My wife and I are very disappo...,1.0,13th September 2024,United States,False,verify wife disappointed fly british airwa...
3,Not Verified | We flew BA between Heathrow an...,1.0,13th September 2024,Australia,False,verify fly ba heathrow berlin way connecti...
4,Not Verified | Absolutely disgusted with BA. ...,8.0,13th September 2024,United Kingdom,False,verify absolutely disgusted ba flight ca...
...,...,...,...,...,...,...
3494,Travelled to Palma in new BA 'sardine' busines...,1.0,28th October 2014,United Kingdom,False,travel palma new ba sardine business class...
3495,We flew London Heathrow to Singapore business ...,3.0,28th October 2014,Australia,False,fly london heathrow singapore business class t...
3496,GIG-LHR B773 Newly refurbished cabin. Seats we...,9.0,28th October 2014,Denmark,False,gig lhr b newly refurbish cabin seat goo...
3497,LHR-DEL. RTN Club World. Excellent service on ...,6.0,28th October 2014,United Kingdom,False,lhr del rtn club world excellent service s...


In [32]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")