In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
import requests
from requests.exceptions import ConnectionError
import time

# --- Scraper configuration ---
max_retries = 5        # attempts per page before giving up
retry_delay = 5        # seconds to wait between failed attempts
request_timeout = 30   # seconds; without a timeout requests.get() can block forever
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 100

# Accumulates the text of every review found, across all pages.
reviews = []

for i in range(1, pages + 1):
    print(f"Scraping page {i}")

    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    last_error = None
    for retry in range(max_retries):
        try:
            response = requests.get(url, timeout=request_timeout)
            # Raises requests.exceptions.HTTPError for 4xx and 5xx status codes.
            response.raise_for_status()
            break  # Success: leave the retry loop
        except requests.exceptions.RequestException as e:
            # RequestException is the common base of ConnectionError, Timeout and
            # HTTPError, so every failure mode of the two calls above is retried
            # (catching only ConnectionError let HTTP errors abort immediately).
            last_error = e
            print(f"Request error (retry {retry + 1}/{max_retries}): {e}")
            if retry + 1 < max_retries:
                # No point in waiting after the final attempt.
                time.sleep(retry_delay)
    else:
        # The loop finished without `break`, i.e. every attempt failed.
        raise RuntimeError(
            f"Failed to fetch {url} after {max_retries} attempts"
        ) from last_error

    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    # Each review body is rendered inside a <div class="text_content"> element.
    reviews.extend(
        para.get_text()
        for para in parsed_content.find_all("div", {"class": "text_content"})
    )

    print(f"   ---> {len(reviews)} total reviews")


Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews


In [3]:
# Put the scraped review texts into a one-column table.
df = pd.DataFrame({"reviews": reviews})
# Show the first rows as the cell output.
df.head()

Unnamed: 0,reviews
0,"✅ Trip Verified | I had a flight from Miami, F..."
1,✅ Trip Verified | We started our day with BA ...
2,✅ Trip Verified | I fly British Airways weekl...
3,Not Verified | Everything was ok until our co...
4,Not Verified | My initial flight was cancelle...


In [4]:
import os

# Directory that holds the exported dataset.
directory = "data"

# Create the directory if it is missing. exist_ok=True makes this a no-op when
# the directory already exists and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

# Build the output path from `directory` so the folder created above and the
# file location cannot drift apart. The DataFrame index is still written as
# the first (unnamed) CSV column, exactly as before.
df.to_csv(os.path.join(directory, "BA_reviews.csv"))