**Data Scraping**

In [1]:
#import required libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 50
page_size = 100

reviews = []
rating_score = []
date = []
country = []
review_details = []


# for i in range(1, pages + 1):
for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page
    response = requests.get(url)

    # Parse content
    soup = BeautifulSoup(response.content, "html.parser")
    for item in soup.find_all("div", class_="text_content"):
        reviews.append(item.text)

    for item in soup.find_all("div", class_ = "rating-10"):
        try:
            rating_score.append(item.span.text)
        except:
            print(f"Error on page {i}")
            rating_score.append("None")

    #date
    for item in soup.find_all("time"):
        date.append(item.text)

    #country
    for item in soup.find_all("h3"):
        country.append(item.span.next_sibling.text.strip(" ()"))

    #rating stats

    for review in soup.find_all('div', class_='body'):
        review_data = {}
        ratings = review.find('table', class_='review-ratings')

        #categories list
        cat_list = ['type_of_traveller', 'cabin_flown', 'route']
        for item in cat_list:
            cell = ratings.find('td', class_=f'review-rating-header {item}')
            review_data[item] = cell.next_sibling.text if cell else ''

        #star rating categories
        star_list = [
            'seat_comfort', 'cabin_staff_service', 'food_and_beverages',
            'ground_service', 'value_for_money', 'inflight_entertainment',
            'wifi_and_connectivity'
        ]
        for item in star_list:
            try:
                stars = ratings.find('td', class_=f'review-rating-header {item}').parent
                review_data[item] = len(stars.find_all('span', class_='star fill'))
            except:
                review_data[item] = 0  #default to 0 if no stars are found

        review_details.append(review_data)


    print(f"   ---> {len(reviews)} total reviews")

Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews
Scraping page 11
   ---> 1100 total reviews
Scraping page 12
   ---> 1200 total reviews
Scraping page 13
   ---> 1300 total reviews
Scraping page 14
   ---> 1400 total reviews
Scraping page 15
   ---> 1500 total reviews
Scraping page 16
   ---> 1600 total reviews
Scraping page 17
   ---> 1700 total reviews
Scraping page 18
   ---> 1800 total reviews
Scraping page 19
   ---> 1900 total reviews
Scraping page 20
   ---> 2000 total reviews
Scraping page 21
   ---> 2100 total reviews
Scraping page 22
   ---> 2200 total reviews
Scraping page 23
   ---> 2300 total reviews
Scrapi

In [3]:
print(len(reviews), len(rating_score), len(date), len(country), len(review_details))

3877 3927 3877 3877 3877


In [4]:
#make sure that all lists are of the same length
min_len = min(len(reviews), len(rating_score), len(date), len(country), len(review_details))
reviews = reviews[:min_len]
rating_score = rating_score[:min_len]
date = date[:min_len]
country = country[:min_len]
review_details = review_details[:min_len]

In [5]:
#create df
df = pd.DataFrame({"reviews":reviews,
                   "rating_score": rating_score,
                   "date":date,
                   "country": country})

In [6]:
#convert `review_details` into df
df_details = pd.DataFrame(review_details)

In [7]:
#concatenate the two dfs
df = pd.concat([df, df_details], axis=1)

In [8]:
df.head()

Unnamed: 0,reviews,rating_score,date,country,type_of_traveller,cabin_flown,route,seat_comfort,cabin_staff_service,food_and_beverages,ground_service,value_for_money,inflight_entertainment,wifi_and_connectivity
0,✅ Trip Verified | On arriving at Mexico Airp...,\n\t\t\t\t\t\t\t\t\t\t\t\t5,25th October 2024,United Kingdom,Business,Business Class,Mexico City to London Heathrow,1,1,0,1,1,0,0
1,✅ Trip Verified | I have flown British Airwa...,1,24th October 2024,United States,Solo Leisure,Business Class,Paris to Boston via London,1,1,0,1,2,0,0
2,Not Verified | We bought tickets for a Geneva-...,1,22nd October 2024,Switzerland,Family Leisure,Economy Class,Geneva to London,0,0,0,1,1,0,0
3,✅ Trip Verified | Appalling service with fai...,1,14th October 2024,South Africa,Business,Business Class,Johannesburg to London,2,1,2,1,1,2,2
4,✅ Trip Verified | British Airways charge you f...,1,12th October 2024,United Kingdom,Business,Business Class,London to Mexico City,1,5,1,2,3,1,2


In [9]:
#save file
import os

cwd = os.getcwd()
df.to_csv(cwd+ "/BA_reviews.csv")