In [1]:
import pandas as pd
import numpy as np

import requests

from bs4 import BeautifulSoup

In [2]:
# Scrape the free-text body of every review, one listing page at a time.
all_reviews = []

# The page count is hard-coded: pages 1..385 are requested. Re-check the
# site's last page number before re-running, or newer reviews will be missed.
for page_num in range(1, 386):
    url = f'https://www.airlinequality.com/airline-reviews/british-airways/page/{page_num}/'
    # Always pass a timeout: without one, requests waits indefinitely on a
    # stalled connection and the whole cell hangs with no error.
    response = requests.get(url, timeout=30)

    if response.status_code == 200:  # Check if the request was successful
        soup = BeautifulSoup(response.text, 'lxml')
        # Each review body sits in a <div class="text_content">.
        reviews = soup.find_all('div', class_='text_content')
        reviews_text = [review.get_text(strip=True) for review in reviews]

        all_reviews.extend(reviews_text)
    else:
        # Best-effort: report the page and carry on. A skipped page means
        # this frame will have fewer rows than the other scrapes.
        print(f"Failed to retrieve data from page {page_num}")

reviews_df = pd.DataFrame({'reviews': all_reviews})
reviews_df.shape, reviews_df.head()

((3850, 1),
                                              reviews
 0  ✅Trip Verified| As someone who flies relentles...
 1  ✅Trip Verified|   Flew with British Airways cl...
 2  ✅Trip Verified|   Straightforward check in T5....
 3  Not Verified| I am beyond upset and disgusted ...
 4  ✅Trip Verified|   I purchased round trip direc...)

In [3]:
# table data
table_df = pd.DataFrame()

for page in range(1, 386):
    url = f'https://www.airlinequality.com/airline-reviews/british-airways/page/{page}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    
    tables = soup.find_all('table', class_='review-ratings')
    
    df_list = []
    
    for table in tables:
        data = {}
        
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) == 2:
                column_name = tds[0].get_text(strip=True)
                
                if 'review-rating-stars' in tds[1]['class']:
                    spans = tds[1].find_all('span', class_='star fill')
                    column_value = len(spans)
                else:
                    column_value = tds[1].get_text(strip=True)
                
                data[column_name] = column_value
        
        if data:
            df = pd.DataFrame([data])
            df_list.append(df)
    
    page_data_df = pd.concat(df_list, ignore_index=True)

    if not page_data_df.empty:
        page_data_df = page_data_df.iloc[1:]
    
    
    table_df = pd.concat([table_df, page_data_df], ignore_index=True)


table_df.shape

(3850, 15)

In [4]:
# Column-by-column summary of the scraped ratings: non-null counts and dtypes.
table_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3850 entries, 0 to 3849
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Food & Beverages        3428 non-null   float64
 1   Inflight Entertainment  2637 non-null   float64
 2   Seat Comfort            3727 non-null   float64
 3   Staff Service           0 non-null      float64
 4   Value for Money         0 non-null      float64
 5   Type Of Traveller       3084 non-null   object 
 6   Seat Type               3849 non-null   object 
 7   Route                   3079 non-null   object 
 8   Date Flown              3077 non-null   object 
 9   Cabin Staff Service     3713 non-null   float64
 10  Ground Service          3005 non-null   float64
 11  Wifi & Connectivity     668 non-null    float64
 12  Value For Money         3850 non-null   float64
 13  Recommended             3850 non-null   object 
 14  Aircraft                2008 non-null   

In [5]:
# extract all reviews header
all_header = []

for page_num in range(1, 386):
    url = f'https://www.airlinequality.com/airline-reviews/british-airways/page/{page_num}/'
    response = requests.get(url)
    
    if response.status_code == 200:  # Check if the request was successful
        soup = BeautifulSoup(response.text, 'lxml')
        headers = soup.find_all('h2', class_='text_header')
        headers_text = [header.get_text(strip=True) for header in headers]
        
        all_header.extend(headers_text)
    else:
        print(f"Failed to retrieve data from page {page_num}")

all_header_df = pd.DataFrame({'review_title': all_header})

In [6]:
# reviews personal details
reviewer_details = []

for page_num in range(1, 386):
    url = f'https://www.airlinequality.com/airline-reviews/british-airways/page/{page_num}/'
    response = requests.get(url)
    
    if response.status_code == 200:  # Check if the request was successful
        soup = BeautifulSoup(response.text, 'lxml')
        details = soup.find_all('h3', class_='text_sub_header userStatusWrapper')
        details_text = [detail.get_text(strip=True) for detail in details]
        
        reviewer_details.extend(details_text)
    else:
        print(f"Failed to retrieve data from page {page_num}")

reviewer_details_df = pd.DataFrame({'reviewer_detail' : reviewer_details})

In [7]:
# Spot-check the review text frame: its shape plus one randomly sampled row.
reviews_df.shape, reviews_df.sample()

((3850, 1),
                                                 reviews
 3564  2 recent flights LHR to SFO (12 hrs) - 7th Sep...)

In [8]:
# Spot-check the ratings frame: its shape plus one randomly sampled row.
table_df.shape, table_df.sample()

((3850, 15),
       Food & Beverages  Inflight Entertainment  Seat Comfort  Staff Service  \
 2205               1.0                     1.0           1.0            NaN   
 
       Value for Money Type Of Traveller       Seat Type       Route  \
 2205              NaN      Solo Leisure  Business Class  JFK to LHR   
 
         Date Flown  Cabin Staff Service  Ground Service  Wifi & Connectivity  \
 2205  October 2016                  3.0             1.0                  NaN   
 
       Value For Money Recommended Aircraft  
 2205              1.0          no      NaN  )

In [9]:
# Display the reviewer byline frame (pandas elides the middle rows).
# NOTE(review): the rendered output contains reviewer names; consider
# clearing it before sharing the notebook.
reviewer_details_df

Unnamed: 0,reviewer_detail
0,Amanda Edgar(United Kingdom)2nd September 2024
1,S Morton(United Kingdom)1st September 2024
2,E Smyth(United Kingdom)30th August 2024
3,Lauren Boniface(United Kingdom)28th August 2024
4,Il Yong Jung(Hong Kong)28th August 2024
...,...
3845,W Benson(United Kingdom)29th August 2012
3846,S Luqman(United Kingdom)29th August 2012
3847,D Smith(United Kingdom)29th August 2012
3848,W Benson(United Kingdom)29th August 2012


In [10]:
# Spot-check the reviewer byline frame: its shape plus one random row.
reviewer_details_df.shape, reviewer_details_df.sample()

((3850, 1),
                             reviewer_detail
 134  T Maddern(Australia)21st December 2023)

In [11]:
# Join the separate scrapes side by side into the final dataset.
# The frames were collected in independent passes over the site and are
# matched purely by row position, so refuse to build the dataset when their
# row counts differ: pd.concat(axis=1) would otherwise pad the shorter frames
# with NaN and silently pair reviews with the wrong reviewer or ratings.
frames = [reviewer_details_df, reviews_df, table_df]
row_counts = [len(frame) for frame in frames]
if len(set(row_counts)) != 1:
    raise ValueError(
        f"Scraped frames have different row counts {row_counts}; "
        "re-run the scraping cells before building the final dataset."
    )

# NOTE(review): all_header_df (the review titles) is scraped earlier but is
# not part of the saved dataset; confirm whether that omission is intended.
BA_Final_Data = pd.concat(frames, axis = 1)

# The default RangeIndex is written as the first, unnamed CSV column.
BA_Final_Data.to_csv('BA-Final-Data-Skytrax.csv')

In [14]:
# Column-by-column summary of the combined dataset: non-null counts and dtypes.
BA_Final_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3850 entries, 0 to 3849
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   reviewer_detail         3850 non-null   object 
 1   reviews                 3850 non-null   object 
 2   Food & Beverages        3428 non-null   float64
 3   Inflight Entertainment  2637 non-null   float64
 4   Seat Comfort            3727 non-null   float64
 5   Staff Service           0 non-null      float64
 6   Value for Money         0 non-null      float64
 7   Type Of Traveller       3084 non-null   object 
 8   Seat Type               3849 non-null   object 
 9   Route                   3079 non-null   object 
 10  Date Flown              3077 non-null   object 
 11  Cabin Staff Service     3713 non-null   float64
 12  Ground Service          3005 non-null   float64
 13  Wifi & Connectivity     668 non-null    float64
 14  Value For Money         3850 non-null   