In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

#1. British Airways

In [2]:
def scrape_page(page_number):
    url = f'https://www.airlinequality.com/airline-reviews/british-airways/page/{page_number}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    reviews = []
    for review_div in soup.find_all('div', class_='body'):
        name_tag = review_div.find('span', itemprop='name')
        location_tag = review_div.find('h3', class_='text_sub_header')
        date_tag = review_div.find('time', itemprop='datePublished')
        text_content_div = review_div.find('div', class_='text_content')

        # Extract review details with error handling
        name = name_tag.text if name_tag else 'N/A'
        location = location_tag.text.split(' ')[-1] if location_tag else 'N/A'
        date_published = date_tag['datetime'] if date_tag else 'N/A'
        text_content = text_content_div.text.strip() if text_content_div else 'N/A'

        # Extract ratings
        review_stats = review_div.find('div', class_='review-stats')
        rating_elements = review_stats.find_all('tr') if review_stats else []

        seat_type = rating_elements[0].find_all('td')[1].text.strip() if len(rating_elements) > 0 else 'N/A'
        seat_comfort = extract_rating(rating_elements[1].find('td', class_='review-rating-stars')) if len(rating_elements) > 1 else 'N/A'
        cabin_staff_service = extract_rating(rating_elements[2].find('td', class_='review-rating-stars')) if len(rating_elements) > 2 else 'N/A'
        food_beverages = extract_rating(rating_elements[3].find('td', class_='review-rating-stars')) if len(rating_elements) > 3 else 'N/A'
        inflight_entertainment = extract_rating(rating_elements[4].find('td', class_='review-rating-stars')) if len(rating_elements) > 4 else 'N/A'
        value_for_money = extract_rating(rating_elements[5].find('td', class_='review-rating-stars')) if len(rating_elements) > 5 else 'N/A'
        recommended_text = rating_elements[6].find_all('td')[1].text.strip() if len(rating_elements) > 6 else 'N/A'
        recommended = recommended_text.lower() == 'yes' if recommended_text != 'N/A' else None

        # Append extracted data to the list
        reviews.append({
            'Name': name,
            'Location': location,
            'Date Published': date_published,
            'Text Content': text_content,
            'Seat Type': seat_type,
            'Seat Comfort': seat_comfort,
            'Cabin Staff Service': cabin_staff_service,
            'Food & Beverages': food_beverages,
            'Inflight Entertainment': inflight_entertainment,
            'Value For Money': value_for_money,
            'Recommended': recommended
        })

    return reviews

In [3]:
def extract_rating(star_elements):
    if star_elements:
        filled_stars = star_elements.find_all('span', class_='star fill')
        return len(filled_stars)
    return None


In [4]:
# Main loop to scrape multiple pages and track progress
british_airways_reviews = []
total_pages = 385
for page_number in range(1, total_pages + 1):
    page_reviews = scrape_page(page_number)
    british_airways_reviews.extend(page_reviews)

    # Print progress message
    if page_number % 10 == 0:
        print(f"{page_number} pages extraction completed")


10 pages extraction completed
20 pages extraction completed
30 pages extraction completed
40 pages extraction completed
50 pages extraction completed
60 pages extraction completed
70 pages extraction completed
80 pages extraction completed
90 pages extraction completed
100 pages extraction completed
110 pages extraction completed
120 pages extraction completed
130 pages extraction completed
140 pages extraction completed
150 pages extraction completed
160 pages extraction completed
170 pages extraction completed
180 pages extraction completed
190 pages extraction completed
200 pages extraction completed
210 pages extraction completed
220 pages extraction completed
230 pages extraction completed
240 pages extraction completed
250 pages extraction completed
260 pages extraction completed
270 pages extraction completed
280 pages extraction completed
290 pages extraction completed
300 pages extraction completed
310 pages extraction completed
320 pages extraction completed
330 pages extract

In [5]:
british_airways_reviews_df = pd.DataFrame(british_airways_reviews)

In [6]:
british_airways_reviews_df.sample(5)

Unnamed: 0,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Recommended
1091,D Gold,2019,2019-02-26,✅ Trip Verified | Chicago to London Heathrow....,B747-400,,,,,1,False
2058,P Greeson,2016,2016-12-22,✅ Verified Review | Baltimore Washington to L...,Boeing 787,,,,,2,False
2326,Jim Zaza,2016,2016-07-13,✅ Verified Review | London Heathrow to Boston...,Boeing 777,,,,,4,False
873,Luis Casasola,2019,2019-10-13,✅ Trip Verified | Mexico City to Barcelona via...,"Boeing 787-9, A320-200",,,,,5,False
2419,S Williams,2016,2016-05-14,✅ Verified Review | British Airways have seri...,Couple Leisure,,,,3.0,2,False


In [7]:
british_airways_reviews_df.shape

(3841, 11)

In [8]:
british_airways_reviews_df.to_csv('british_airways_reviews.csv', index=False)

#2. Air France

In [9]:
def scrape_page(page_number):
    url = f'https://www.airlinequality.com/airline-reviews/air-france/page/{page_number}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    reviews = []
    for review_div in soup.find_all('div', class_='body'):
        name_tag = review_div.find('span', itemprop='name')
        location_tag = review_div.find('h3', class_='text_sub_header')
        date_tag = review_div.find('time', itemprop='datePublished')
        text_content_div = review_div.find('div', class_='text_content')

        # Extract review details with error handling
        name = name_tag.text if name_tag else 'N/A'
        location = location_tag.text.split(' ')[-1] if location_tag else 'N/A'
        date_published = date_tag['datetime'] if date_tag else 'N/A'
        text_content = text_content_div.text.strip() if text_content_div else 'N/A'

        # Extract ratings
        review_stats = review_div.find('div', class_='review-stats')
        rating_elements = review_stats.find_all('tr') if review_stats else []

        seat_type = rating_elements[0].find_all('td')[1].text.strip() if len(rating_elements) > 0 else 'N/A'
        seat_comfort = extract_rating(rating_elements[1].find('td', class_='review-rating-stars')) if len(rating_elements) > 1 else 'N/A'
        cabin_staff_service = extract_rating(rating_elements[2].find('td', class_='review-rating-stars')) if len(rating_elements) > 2 else 'N/A'
        food_beverages = extract_rating(rating_elements[3].find('td', class_='review-rating-stars')) if len(rating_elements) > 3 else 'N/A'
        inflight_entertainment = extract_rating(rating_elements[4].find('td', class_='review-rating-stars')) if len(rating_elements) > 4 else 'N/A'
        ground_service = extract_rating(rating_elements[5].find('td', class_='review-rating-stars')) if len(rating_elements) > 5 else 'N/A'
        wifi_connectivity = extract_rating(rating_elements[6].find('td', class_='review-rating-stars')) if len(rating_elements) > 6 else 'N/A'
        value_for_money = extract_rating(rating_elements[7].find('td', class_='review-rating-stars')) if len(rating_elements) > 7 else 'N/A'
        recommended_text = rating_elements[8].find_all('td')[1].text.strip() if len(rating_elements) > 8 else 'N/A'
        recommended = recommended_text.lower() == 'yes' if recommended_text != 'N/A' else None

        # Append extracted data to the list
        reviews.append({
            'Name': name,
            'Location': location,
            'Date Published': date_published,
            'Text Content': text_content,
            'Seat Type': seat_type,
            'Seat Comfort': seat_comfort,
            'Cabin Staff Service': cabin_staff_service,
            'Food & Beverages': food_beverages,
            'Inflight Entertainment': inflight_entertainment,
            'Ground Service': ground_service,
            'Wifi & Connectivity': wifi_connectivity,
            'Value For Money': value_for_money,
            'Recommended': recommended
        })

    return reviews

In [10]:
air_france_reviews = []
total_pages = 138
for page_number in range(1, total_pages + 1):
    page_reviews = scrape_page(page_number)
    air_france_reviews.extend(page_reviews)

    # Print progress message
    if page_number % 10 == 0:
        print(f"{page_number} pages extraction completed")


10 pages extraction completed
20 pages extraction completed
30 pages extraction completed
40 pages extraction completed
50 pages extraction completed
60 pages extraction completed
70 pages extraction completed
80 pages extraction completed
90 pages extraction completed
100 pages extraction completed
110 pages extraction completed
120 pages extraction completed
130 pages extraction completed


In [11]:
air_framce_reviews_df = pd.DataFrame(air_france_reviews)

In [12]:
air_framce_reviews_df.sample(5)

Unnamed: 0,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended
641,Guillaume Miceli,2017,2017-12-28,✅ Trip Verified | I flew from London to Paris ...,Boeing 787,,,,,5,4,5,False
511,P Walena,2018,2018-11-21,✅ Trip Verified | Krakow to Amsterdam via Par...,Boeing 737-800 / Embraer 190,,,,,5,5,5,False
214,D Lane,2022,2022-07-21,✅ Trip Verified | Moved my flight forward 24 ...,Boeing 777-300,,,,,1,1,1,False
627,Ricardo Feliu,2018,2018-02-15,"✅ Trip Verified | January 30, 2018, flight AF1...",Family Leisure,,,,1.0,1,3,3,False
21,S Tavani,2024,2024-05-29,✅ Trip Verified | Air France weighs cabin lug...,Solo Leisure,,,,3.0,2,3,1,False


In [13]:
air_framce_reviews_df.shape

(1378, 13)

In [14]:
air_framce_reviews_df.to_csv('air_france_reviews.csv', index=False)

#3. Qatar Airways

In [15]:
def scrape_page(page_number):
    url = f'https://www.airlinequality.com/airline-reviews/qatar-airways/page/{page_number}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    reviews = []
    for review_div in soup.find_all('div', class_='body'):
        name_tag = review_div.find('span', itemprop='name')
        location_tag = review_div.find('h3', class_='text_sub_header')
        date_tag = review_div.find('time', itemprop='datePublished')
        text_content_div = review_div.find('div', class_='text_content')

        # Extract review details with error handling
        name = name_tag.text if name_tag else 'N/A'
        location = location_tag.text.split(' ')[-1] if location_tag else 'N/A'
        date_published = date_tag['datetime'] if date_tag else 'N/A'
        text_content = text_content_div.text.strip() if text_content_div else 'N/A'

        # Extract ratings
        review_stats = review_div.find('div', class_='review-stats')
        rating_elements = review_stats.find_all('tr') if review_stats else []

        seat_type = rating_elements[0].find_all('td')[1].text.strip() if len(rating_elements) > 0 else 'N/A'
        seat_comfort = extract_rating(rating_elements[1].find('td', class_='review-rating-stars')) if len(rating_elements) > 1 else 'N/A'
        cabin_staff_service = extract_rating(rating_elements[2].find('td', class_='review-rating-stars')) if len(rating_elements) > 2 else 'N/A'
        food_beverages = extract_rating(rating_elements[3].find('td', class_='review-rating-stars')) if len(rating_elements) > 3 else 'N/A'
        inflight_entertainment = extract_rating(rating_elements[4].find('td', class_='review-rating-stars')) if len(rating_elements) > 4 else 'N/A'
        ground_service = extract_rating(rating_elements[5].find('td', class_='review-rating-stars')) if len(rating_elements) > 5 else 'N/A'
        wifi_connectivity = extract_rating(rating_elements[6].find('td', class_='review-rating-stars')) if len(rating_elements) > 6 else 'N/A'
        value_for_money = extract_rating(rating_elements[7].find('td', class_='review-rating-stars')) if len(rating_elements) > 7 else 'N/A'
        recommended_text = rating_elements[8].find_all('td')[1].text.strip() if len(rating_elements) > 8 else 'N/A'
        recommended = recommended_text.lower() == 'yes' if recommended_text != 'N/A' else None

        # Append extracted data to the list
        reviews.append({
            'Name': name,
            'Location': location,
            'Date Published': date_published,
            'Text Content': text_content,
            'Seat Type': seat_type,
            'Seat Comfort': seat_comfort,
            'Cabin Staff Service': cabin_staff_service,
            'Food & Beverages': food_beverages,
            'Inflight Entertainment': inflight_entertainment,
            'Ground Service': ground_service,
            'Wifi & Connectivity': wifi_connectivity,
            'Value For Money': value_for_money,
            'Recommended': recommended
        })

    return reviews

In [16]:
qatar_airways_reviews = []
total_pages = 247
for page_number in range(1, total_pages + 1):
    page_reviews = scrape_page(page_number)
    qatar_airways_reviews.extend(page_reviews)

    # Print progress message
    if page_number % 10 == 0:
        print(f"{page_number} pages extraction completed")


10 pages extraction completed
20 pages extraction completed
30 pages extraction completed
40 pages extraction completed
50 pages extraction completed
60 pages extraction completed
70 pages extraction completed
80 pages extraction completed
90 pages extraction completed
100 pages extraction completed
110 pages extraction completed
120 pages extraction completed
130 pages extraction completed
140 pages extraction completed
150 pages extraction completed
160 pages extraction completed
170 pages extraction completed
180 pages extraction completed
190 pages extraction completed
200 pages extraction completed
210 pages extraction completed
220 pages extraction completed
230 pages extraction completed
240 pages extraction completed


In [17]:
qatar_airways_reviews_df = pd.DataFrame(qatar_airways_reviews)

In [18]:
qatar_airways_reviews_df.sample(5)

Unnamed: 0,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended
979,Rajan Parrikar,2019,2019-08-03,✅ Trip Verified | Mumbai to Copenhagen via Do...,Boeing 777-300ER,,,,,5.0,5,5,False
575,Winsome Peter,2022,2022-02-19,✅ Trip Verified | My congratulations to the Q...,Solo Leisure,,,,5.0,5.0,5,5,False
1623,M Tan,2016,2016-10-23,✅ Verified Review | Singapore to Kilimanjaro ...,Business,,,,4.0,4.0,4,2,False
121,A Qarani,2024,2024-01-14,✅ Trip Verified | Following a downgrade of my...,Business,,,,1.0,3.0,2,1,False
2030,J Scholfield,2015,2015-06-13,Flew LHR to AUH via DOH return. Outbound on th...,787,,,,3.0,4.0,4,4,False


In [19]:
qatar_airways_reviews_df.shape

(2470, 13)

In [20]:
qatar_airways_reviews_df.to_csv('qatar_airways_reviews.csv', index=False)

#4. Qantas-Airways

In [21]:
def scrape_page(page_number):
    url = f'https://www.airlinequality.com/airline-reviews/qantas-airways/page/{page_number}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    reviews = []
    for review_div in soup.find_all('div', class_='body'):
        name_tag = review_div.find('span', itemprop='name')
        location_tag = review_div.find('h3', class_='text_sub_header')
        date_tag = review_div.find('time', itemprop='datePublished')
        text_content_div = review_div.find('div', class_='text_content')

        # Extract review details with error handling
        name = name_tag.text if name_tag else 'N/A'
        location = location_tag.text.split(' ')[-1] if location_tag else 'N/A'
        date_published = date_tag['datetime'] if date_tag else 'N/A'
        text_content = text_content_div.text.strip() if text_content_div else 'N/A'

        # Extract ratings
        review_stats = review_div.find('div', class_='review-stats')
        rating_elements = review_stats.find_all('tr') if review_stats else []

        seat_type = rating_elements[0].find_all('td')[1].text.strip() if len(rating_elements) > 0 else 'N/A'
        seat_comfort = extract_rating(rating_elements[1].find('td', class_='review-rating-stars')) if len(rating_elements) > 1 else 'N/A'
        cabin_staff_service = extract_rating(rating_elements[2].find('td', class_='review-rating-stars')) if len(rating_elements) > 2 else 'N/A'
        food_beverages = extract_rating(rating_elements[3].find('td', class_='review-rating-stars')) if len(rating_elements) > 3 else 'N/A'
        inflight_entertainment = extract_rating(rating_elements[4].find('td', class_='review-rating-stars')) if len(rating_elements) > 4 else 'N/A'
        ground_service = extract_rating(rating_elements[5].find('td', class_='review-rating-stars')) if len(rating_elements) > 5 else 'N/A'
        wifi_connectivity = extract_rating(rating_elements[6].find('td', class_='review-rating-stars')) if len(rating_elements) > 6 else 'N/A'
        value_for_money = extract_rating(rating_elements[7].find('td', class_='review-rating-stars')) if len(rating_elements) > 7 else 'N/A'
        recommended_text = rating_elements[8].find_all('td')[1].text.strip() if len(rating_elements) > 8 else 'N/A'
        recommended = recommended_text.lower() == 'yes' if recommended_text != 'N/A' else None

        # Append extracted data to the list
        reviews.append({
            'Name': name,
            'Location': location,
            'Date Published': date_published,
            'Text Content': text_content,
            'Seat Type': seat_type,
            'Seat Comfort': seat_comfort,
            'Cabin Staff Service': cabin_staff_service,
            'Food & Beverages': food_beverages,
            'Inflight Entertainment': inflight_entertainment,
            'Ground Service': ground_service,
            'Wifi & Connectivity': wifi_connectivity,
            'Value For Money': value_for_money,
            'Recommended': recommended
        })

    return reviews

In [22]:
qantas_airways_reviews = []
total_pages = 192
for page_number in range(1, total_pages + 1):
    page_reviews = scrape_page(page_number)
    qantas_airways_reviews.extend(page_reviews)

    # Print progress message
    if page_number % 10 == 0:
        print(f"{page_number} pages extraction completed")


10 pages extraction completed
20 pages extraction completed
30 pages extraction completed
40 pages extraction completed
50 pages extraction completed
60 pages extraction completed
70 pages extraction completed
80 pages extraction completed
90 pages extraction completed
100 pages extraction completed
110 pages extraction completed
120 pages extraction completed
130 pages extraction completed
140 pages extraction completed
150 pages extraction completed
160 pages extraction completed
170 pages extraction completed
180 pages extraction completed
190 pages extraction completed


In [25]:
qantas_airways_reviews_df = pd.DataFrame(qantas_airways_reviews)

In [26]:
qantas_airways_reviews_df.sample(5)

Unnamed: 0,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended
789,F Irving,2018,2018-03-29,✅ Trip Verified | QF2 flight from Dubai to Syd...,A380,,,,,1,1.0,1.0,False
880,D Peters,2017,2017-10-28,"✅ Verified Review | QF971, October 26, Townsv...",Solo Leisure,,,,5.0,5,5.0,5.0,False
427,C Thompson,2020,2020-01-21,✅ Trip Verified | Perth to Cairns via Brisbane...,Boeing 737,,,,,2,1.0,3.0,False
1684,Jay Howard,2014,2014-04-27,BKK-SYD. Off to a bad start when they tried to...,Business Class,3.0,3.0,1.0,4.0,3,,,
1016,J Singh,2017,2017-01-14,✅ Verified Review | Melbourne to Dubai with Q...,A380,,,,,3,2.0,2.0,False


In [27]:
qantas_airways_reviews_df.shape

(1916, 13)

In [28]:
qantas_airways_reviews_df.to_csv('qantas_airways_reviews.csv', index=False)

#5.Singapore Airlines

In [23]:
def scrape_page(page_number):
    url = f'https://www.airlinequality.com/airline-reviews/singapore-airlines/page/{page_number}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    reviews = []
    for review_div in soup.find_all('div', class_='body'):
        name_tag = review_div.find('span', itemprop='name')
        location_tag = review_div.find('h3', class_='text_sub_header')
        date_tag = review_div.find('time', itemprop='datePublished')
        text_content_div = review_div.find('div', class_='text_content')

        # Extract review details with error handling
        name = name_tag.text if name_tag else 'N/A'
        location = location_tag.text.split(' ')[-1] if location_tag else 'N/A'
        date_published = date_tag['datetime'] if date_tag else 'N/A'
        text_content = text_content_div.text.strip() if text_content_div else 'N/A'

        # Extract ratings
        review_stats = review_div.find('div', class_='review-stats')
        rating_elements = review_stats.find_all('tr') if review_stats else []

        seat_type = rating_elements[0].find_all('td')[1].text.strip() if len(rating_elements) > 0 else 'N/A'
        seat_comfort = extract_rating(rating_elements[1].find('td', class_='review-rating-stars')) if len(rating_elements) > 1 else 'N/A'
        cabin_staff_service = extract_rating(rating_elements[2].find('td', class_='review-rating-stars')) if len(rating_elements) > 2 else 'N/A'
        food_beverages = extract_rating(rating_elements[3].find('td', class_='review-rating-stars')) if len(rating_elements) > 3 else 'N/A'
        inflight_entertainment = extract_rating(rating_elements[4].find('td', class_='review-rating-stars')) if len(rating_elements) > 4 else 'N/A'
        ground_service = extract_rating(rating_elements[5].find('td', class_='review-rating-stars')) if len(rating_elements) > 5 else 'N/A'
        wifi_connectivity = extract_rating(rating_elements[6].find('td', class_='review-rating-stars')) if len(rating_elements) > 6 else 'N/A'
        value_for_money = extract_rating(rating_elements[7].find('td', class_='review-rating-stars')) if len(rating_elements) > 7 else 'N/A'
        recommended_text = rating_elements[8].find_all('td')[1].text.strip() if len(rating_elements) > 8 else 'N/A'
        recommended = recommended_text.lower() == 'yes' if recommended_text != 'N/A' else None

        # Append extracted data to the list
        reviews.append({
            'Name': name,
            'Location': location,
            'Date Published': date_published,
            'Text Content': text_content,
            'Seat Type': seat_type,
            'Seat Comfort': seat_comfort,
            'Cabin Staff Service': cabin_staff_service,
            'Food & Beverages': food_beverages,
            'Inflight Entertainment': inflight_entertainment,
            'Ground Service': ground_service,
            'Wifi & Connectivity': wifi_connectivity,
            'Value For Money': value_for_money,
            'Recommended': recommended
        })

    return reviews

In [24]:
singapore_airlines_reviews = []
total_pages = 160
for page_number in range(1, total_pages + 1):
    page_reviews = scrape_page(page_number)
    singapore_airlines_reviews.extend(page_reviews)

    # Print progress message
    if page_number % 10 == 0:
        print(f"{page_number} pages extraction completed")


10 pages extraction completed
20 pages extraction completed
30 pages extraction completed
40 pages extraction completed
50 pages extraction completed
60 pages extraction completed
70 pages extraction completed
80 pages extraction completed
90 pages extraction completed
100 pages extraction completed
110 pages extraction completed
120 pages extraction completed
130 pages extraction completed
140 pages extraction completed
150 pages extraction completed
160 pages extraction completed


In [29]:
singapore_airlines_reviews_df = pd.DataFrame(singapore_airlines_reviews)

In [30]:
singapore_airlines_reviews_df.sample(5)

Unnamed: 0,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended
428,Elaine Loh,2019,2019-11-06,✅ Trip Verified | Perth to Singapore. A pleas...,Solo Leisure,,,,4.0,5,4,4,False
118,Lukas Fajt,2023,2023-08-29,Not Verified | We flew with Singapore to New ...,A380,,,,,5,5,5,False
405,L Tran,2019,2019-12-02,✅ Trip Verified | Singapore to Tokyo. I chose...,A380-800,,,,,5,5,4,False
396,R Margathan,2019,2019-12-20,✅ Trip Verified | Would like to shout out a s...,Family Leisure,,,,5.0,5,4,5,False
297,V Ellis,2022,2022-03-20,Not Verified | A350 from Melbourne to Singapo...,A350 / A380,,,,,2,3,2,False


In [31]:
singapore_airlines_reviews_df.shape

(1598, 13)

In [32]:
singapore_airlines_reviews_df.to_csv('singapore_airlines_reviews.csv', index=False)

#6. Emirates

In [33]:
def scrape_page(page_number):
    url = f'https://www.airlinequality.com/airline-reviews/emirates/page/{page_number}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    reviews = []
    for review_div in soup.find_all('div', class_='body'):
        name_tag = review_div.find('span', itemprop='name')
        location_tag = review_div.find('h3', class_='text_sub_header')
        date_tag = review_div.find('time', itemprop='datePublished')
        text_content_div = review_div.find('div', class_='text_content')

        # Extract review details with error handling
        name = name_tag.text if name_tag else 'N/A'
        location = location_tag.text.split(' ')[-1] if location_tag else 'N/A'
        date_published = date_tag['datetime'] if date_tag else 'N/A'
        text_content = text_content_div.text.strip() if text_content_div else 'N/A'

        # Extract ratings
        review_stats = review_div.find('div', class_='review-stats')
        rating_elements = review_stats.find_all('tr') if review_stats else []

        seat_type = rating_elements[0].find_all('td')[1].text.strip() if len(rating_elements) > 0 else 'N/A'
        seat_comfort = extract_rating(rating_elements[1].find('td', class_='review-rating-stars')) if len(rating_elements) > 1 else 'N/A'
        cabin_staff_service = extract_rating(rating_elements[2].find('td', class_='review-rating-stars')) if len(rating_elements) > 2 else 'N/A'
        food_beverages = extract_rating(rating_elements[3].find('td', class_='review-rating-stars')) if len(rating_elements) > 3 else 'N/A'
        inflight_entertainment = extract_rating(rating_elements[4].find('td', class_='review-rating-stars')) if len(rating_elements) > 4 else 'N/A'
        ground_service = extract_rating(rating_elements[5].find('td', class_='review-rating-stars')) if len(rating_elements) > 5 else 'N/A'
        wifi_connectivity = extract_rating(rating_elements[6].find('td', class_='review-rating-stars')) if len(rating_elements) > 6 else 'N/A'
        value_for_money = extract_rating(rating_elements[7].find('td', class_='review-rating-stars')) if len(rating_elements) > 7 else 'N/A'
        recommended_text = rating_elements[8].find_all('td')[1].text.strip() if len(rating_elements) > 8 else 'N/A'
        recommended = recommended_text.lower() == 'yes' if recommended_text != 'N/A' else None

        # Append extracted data to the list
        reviews.append({
            'Name': name,
            'Location': location,
            'Date Published': date_published,
            'Text Content': text_content,
            'Seat Type': seat_type,
            'Seat Comfort': seat_comfort,
            'Cabin Staff Service': cabin_staff_service,
            'Food & Beverages': food_beverages,
            'Inflight Entertainment': inflight_entertainment,
            'Ground Service': ground_service,
            'Wifi & Connectivity': wifi_connectivity,
            'Value For Money': value_for_money,
            'Recommended': recommended
        })

    return reviews

In [34]:
emirates_reviews = []
total_pages = 239
for page_number in range(1, total_pages + 1):
    page_reviews = scrape_page(page_number)
    emirates_reviews.extend(page_reviews)

    # Print progress message
    if page_number % 10 == 0:
        print(f"{page_number} pages extraction completed")


10 pages extraction completed
20 pages extraction completed
30 pages extraction completed
40 pages extraction completed
50 pages extraction completed
60 pages extraction completed
70 pages extraction completed
80 pages extraction completed
90 pages extraction completed
100 pages extraction completed
110 pages extraction completed
120 pages extraction completed
130 pages extraction completed
140 pages extraction completed
150 pages extraction completed
160 pages extraction completed
170 pages extraction completed
180 pages extraction completed
190 pages extraction completed
200 pages extraction completed
210 pages extraction completed
220 pages extraction completed
230 pages extraction completed


In [35]:
emirates_reviews_df = pd.DataFrame(emirates_reviews)

In [36]:
emirates_reviews_df.sample(5)

Unnamed: 0,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended
1342,Bojan Tercon,2016,2016-08-14,"✅ Verified Review | To me, Emirates A380 has ...",A380,,,,,5.0,3.0,2.0,False
1453,Marcin Gorczyński,2016,2016-03-13,"Dubai to Warsaw with Emirates. My seats, selec...",Boeing 777,,,,,1.0,1.0,4.0,False
1109,John Curtain,2017,2017-06-27,✅ Verified Review | Athens to Dubai. I enjoye...,Boeing 777,,,,,3.0,4.0,4.0,False
2004,Ian Solomon,2014,2014-10-08,Having flown First or Business class long-haul...,First Class,4.0,2.0,5.0,5.0,1.0,,,
545,Badr Darwish,2019,2019-08-22,✅ Trip Verified | \r\nDubai to Tunis. Flight ...,Couple Leisure,,,,1.0,3.0,3.0,2.0,False


In [37]:
emirates_reviews_df.shape

(2387, 13)

In [38]:
emirates_reviews_df.to_csv('emirates_reviews.csv', index=False)

In [43]:
int(british_airways_reviews_df.shape[0]) + \
int(air_framce_reviews_df.shape[0]) + \
int(qatar_airways_reviews_df.shape[0]) + \
int(qantas_airways_reviews_df.shape[0])+ \
int(singapore_airlines_reviews_df.shape[0])+int(emirates_reviews_df.shape[0])

13590

In [44]:
british_airways_reviews_df['Airlines']='british_airways'

In [47]:
british_airways_reviews_df.sample(5)

Unnamed: 0,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Recommended
1509,Raymond Grayson,2017,2017-11-30,✅ Trip Verified | Flew Newcastle to San Franci...,Couple Leisure,,,,4.0,5,False
2658,Patrick Beet,2015,2015-12-10,Flew British Airways back from Bangkok to Lond...,Boeing 777,,,,,5,False
945,Dave Matthews,2019,2019-08-09,Not Verified | British Airways flight to Brus...,Couple Leisure,,,,2.0,4,False
2463,Alistair Baker,2016,2016-04-12,✅ Verified Review | I decided to treat myself...,Boeing 747-400,,,,,5,False
1364,Helen Santry,2018,2018-04-29,✅ Trip Verified | London Heathrow to Vancouve...,Boeing 747,,,,,1,False


In [48]:
british_airways_reviews_df.insert(0, 'Airlines', 'british_airways')

In [49]:
british_airways_reviews_df

Unnamed: 0,Airlines,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Recommended
0,british_airways,C Hayne,2024,2024-08-15,"Not Verified | Before my flight, I was forced ...",Solo Leisure,,,,1,1,False
1,british_airways,C Porter,2024,2024-08-12,✅ Trip Verified | British Airways at its bes...,A350,,,,,5,False
2,british_airways,G Jones,2024,2024-08-12,✅ Trip Verified | An excellent flight! Despite...,A320,,,,,3,False
3,british_airways,Edward King,2024,2024-08-11,✅ Trip Verified | I recently traveled with Bri...,A380,,,,,3,False
4,british_airways,N Kwok,2024,2024-08-09,✅ Trip Verified | My family and I were booke...,Family Leisure,,,,1,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...
3836,british_airways,C Mcculloch,2012,2012-08-29,Flight from Heathrow to Toronto. Booked emerge...,Economy Class,3.0,3.0,3,4,3,False
3837,british_airways,Nick Berry,2012,2012-08-28,LHR to HAM. Purser addresses all club passenge...,Business Class,4.0,5.0,4,0,3,True
3838,british_airways,Avril Barclay,2011,2011-10-12,My son who had worked for British Airways urge...,Economy Class,4.0,,,,,
3839,british_airways,C Volz,2011,2011-10-11,London City-New York JFK via Shannon on A318 b...,Premium Economy,1.0,3.0,5,0,1,False


In [50]:
air_framce_reviews_df.insert(0, 'Airlines', 'air_france')
emirates_reviews_df.insert(0, 'Airlines', 'emirates')
qatar_airways_reviews_df.insert(0, 'Airlines', 'qatar_airways')
qantas_airways_reviews_df.insert(0, 'Airlines', 'qantas_airways')
singapore_airlines_reviews_df.insert(0, 'Airlines', 'singapore_airlines')

In [51]:
air_framce_reviews_df

Unnamed: 0,Airlines,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended
0,air_france,J Tung,2024,2024-08-06,Not Verified | First time flying the newly ren...,Boeing 777-300,,,,,4,5,5,False
1,air_france,N Reddy,2024,2024-08-05,✅ Trip Verified | I had a very disappointing e...,Solo Leisure,,,,2.0,1,1,1,False
2,air_france,S Baine,2024,2024-08-05,✅ Trip Verified | Only at first everything w...,Family Leisure,,,,1.0,1,1,2,False
3,air_france,Ahmed Ahmed-Yahia,2024,2024-08-04,Not Verified | My son Ryan Ahmed-Yahia and I c...,A350-900,,,,,3,4,2,False
4,air_france,Chloe Davis,2024,2024-08-03,Not Verified | Air France made an error when...,Family Leisure,,,,5.0,5,1,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1373,air_france,T Price,2013,2013-06-19,BSL-CDG-ATH 28/5/13. Small ATR72 to CDG no han...,Economy Class,3.0,2.0,3.0,1.0,1,,,
1374,air_france,H Dalton,2013,2013-06-16,2 June 2013 Johannesburg to Paris CDG AF 995. ...,Business Class,4.0,1.0,2.0,2.0,1,,,
1375,air_france,F Francis,2013,2013-06-16,CDG-JFK. AF-006 2013-05-21. The airline offers...,Economy Class,3.0,3.0,5.0,3.0,4,,,
1376,air_france,Y Fall,2013,2013-06-12,Seoul-Paris CDG 777-300ER. Friendly and attent...,Economy Class,3.0,4.0,4.0,5.0,4,,,


In [52]:
airlines_review = pd.concat([
    british_airways_reviews_df,
    air_framce_reviews_df,
    emirates_reviews_df,
    qatar_airways_reviews_df,
    qantas_airways_reviews_df,
    singapore_airlines_reviews_df
], axis=0, ignore_index=True)

In [54]:
airlines_review.sample(10)

Unnamed: 0,Airlines,Name,Location,Date Published,Text Content,Seat Type,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Recommended,Ground Service,Wifi & Connectivity
5805,emirates,T Neyrinck,2019,2019-05-31,Not Verified | Emirates business class offers...,Boeing 777-300 / A380,,,,,1.0,False,4.0,3.0
1205,british_airways,C Hoffmann,2018,2018-10-23,✅ Trip Verified | London to Munich. The groun...,A321,,,,,2.0,False,,
3077,british_airways,Thomas Varughese,2015,2015-06-07,I had this terrible experience of traveling Br...,Economy Class,1.0,4.0,1.0,2.0,1.0,False,,
6625,emirates,P Evans,2016,2016-05-27,Expecting a great deal based on other reviews ...,Family Leisure,,,,3.0,4.0,False,1.0,3.0
13119,singapore_airlines,S Siauw,2015,2015-10-20,Singapore to Tokyo Narita. I'm a Krisflyer gol...,Boeing 777-300ER,,,,,3.0,False,5.0,3.0
1701,british_airways,Neil Baines,2017,2017-07-06,✅ Verified Review | London Heathrow to Larnac...,Boeing 767,,,,,2.0,False,,
10599,qantas_airways,Simon Allan,2019,2019-08-06,✅ Trip Verified | Sydney to Perth. Caveat emp...,Business,,,,1.0,3.0,False,4.0,3.0
557,british_airways,Haydn Robinson,2022,2022-04-04,Not Verified | The worst budget airline I’ve ...,Business,,,,1.0,1.0,False,,
9975,qatar_airways,M Hasan,2014,2014-01-22,I used Qatar for the first time in January 201...,Business Class,5.0,5.0,5.0,5.0,,,4.0,
11591,qantas_airways,John Smith,2014,2014-12-04,Flew on QF1/QF2 flight from London to Sydney r...,Business Class,3.0,2.0,1.0,5.0,,,4.0,


In [55]:
airlines_review.shape

(13590, 14)

In [56]:
airlines_review.to_csv('airlines_review.csv', index=False)