In [None]:
pip install beautifulsoup4



In [None]:
pip install matplotlib



In [None]:
pip install textblob



In [None]:
pip install openpyxl



In [None]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
from dateutil import parser
import matplotlib.pyplot as plt
from textblob import TextBlob
from collections import Counter
from openpyxl import Workbook

## Define a function to implement the scraping

In [None]:
def _summary_star_counts(section, categories):
    """Count the filled stars shown for each category in one ratings table.

    Rows without a ``review-rating-header`` cell are skipped. A category with
    no filled stars, or with no row at all, is reported as "N/A".
    """
    counts = dict.fromkeys(categories, "N/A")
    for row in section.find_all('tr'):
        header_cell = row.find('td', class_='review-rating-header')
        if header_cell is None:
            continue
        header = header_cell.text.strip()
        if header in counts:
            filled = len(row.find_all('span', class_='star fill'))
            counts[header] = filled if filled else "N/A"
    return counts


def scrape_airline_info(base_url, airline, timeout=30):
    """Scrape the summary block of one airline page into a one-row DataFrame.

    Parameters
    ----------
    base_url : str
        URL of the airline's review page.
    airline : str
        Fallback airline name, used only when the page has no
        ``<h1 itemprop="name">`` heading.
    timeout : float, optional
        Seconds to wait for the HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row with the columns 'Airline', 'star_airline', 'Review Count',
        'Rating' and one column per rating category. Any value that cannot be
        found on the page is the string "N/A".

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    categories = ['Food & Beverages',
                  'Inflight Entertainment',
                  'Seat Comfort',
                  'Staff Service',
                  'Value for Money']

    # Start from defaults so a page without a ratings section still yields a row.
    # When several sections match, the last one wins.
    category_values = dict.fromkeys(categories, "N/A")
    for section in soup.find_all('section', class_='comp_review-header'):
        category_values = _summary_star_counts(section, categories)

    name_tag = soup.find("h1", itemprop="name")
    airline_name = name_tag.text.strip() if name_tag is not None else airline

    # The star level is the first number in the badge image's alt text.
    star_img = soup.find('img', class_='skytrax-rating')
    star_match = re.search(r'\d+', star_img.get('alt') or '') if star_img is not None else None
    star_airline = star_match.group() if star_match else "N/A"

    count_tag = soup.find('span', itemprop='reviewCount')
    review_count = count_tag.text.strip() if count_tag is not None else "N/A"

    rating_tag = soup.find('span', itemprop='ratingValue')
    rating = rating_tag.text.strip() if rating_tag is not None else "N/A"

    record = {
        'Airline': airline_name,
        'star_airline': star_airline,
        'Review Count': review_count,
        'Rating': rating,
        **category_values,
    }
    columns = ['Airline', 'star_airline', 'Review Count', 'Rating'] + categories
    return pd.DataFrame([record], columns=columns)


In [None]:


def _cell_text_after(header_cell, value_class):
    """Return the text of the first matching ``<td>`` after ``header_cell``.

    Returns None when the header cell or its value cell is missing.
    """
    if header_cell is None:
        return None
    value_cell = header_cell.find_next("td", class_=value_class)
    return value_cell.text if value_cell is not None else None


def _format_review_date(text, fmt):
    """Parse free-form date text and render it with ``fmt``.

    Returns "N/A" when the text is missing or cannot be parsed, so one
    malformed date does not abort the whole scrape.
    """
    if text is None:
        return "N/A"
    try:
        return parser.parse(text).strftime(fmt)
    except (ValueError, OverflowError):
        return "N/A"


def _review_star_counts(review, categories):
    """Count the filled stars per rating category inside one review article."""
    counts = dict.fromkeys(categories, "N/A")
    for row in review.find_all('tr'):
        header_cell = row.find('td', class_='review-rating-header')
        if header_cell is None:
            continue
        header = header_cell.text
        if header in counts:
            filled = len(row.find_all('span', class_='star fill'))
            counts[header] = filled if filled else "N/A"
    return counts


def _parse_review(review, airline, region, categories):
    """Turn one review ``<article>`` into a flat record dict.

    Any field that is missing from the article is stored as "N/A".
    """
    def labelled(header_class):
        # Value cell that follows the header cell carrying ``header_class``.
        header = review.find("td", class_=f"review-rating-header {header_class}")
        text = _cell_text_after(header, "review-value")
        return text if text is not None else "N/A"

    rating_tag = review.find("span", itemprop="ratingValue")
    rating = rating_tag.text if rating_tag is not None else "N/A"

    # Strip surrounding whitespace before the quotes; otherwise a trailing
    # newline protects the closing quote and it stays in the title.
    title_tag = review.find("h2", class_="text_header")
    title = title_tag.get_text().strip().strip('"').strip() if title_tag is not None else "N/A"

    published_tag = review.find("time", itemprop="datePublished")
    published = _format_review_date(
        published_tag.text if published_tag is not None else None, '%Y-%m-%d')

    # NOTE(review): a review with no verification marker at all is also
    # recorded as 'Yes' -- confirm that this is the intended meaning.
    verified = 'No' if review.find('em', string='Not Verified') else 'Yes'

    # Keep only the text after the first '|', which drops a leading
    # "... Verified |" prefix when one is present.
    body_tag = review.find("div", class_="text_content")
    body = body_tag.get_text().split('|', 1)[-1].strip() if body_tag is not None else "N/A"

    date_flown = _format_review_date(
        _cell_text_after(review.find("td", string="Date Flown"), "review-value"), '%Y-%m')

    recommended = _cell_text_after(
        review.find("td", class_="review-rating-header recommended"),
        lambda x: x and x.startswith("review-value rating-"))

    record = {
        'Airline': airline,
        'Aircraft': labelled("aircraft"),
        'Type_of_Traveller': labelled("type_of_traveller"),
        'Seat_Type': labelled("cabin_flown"),
        'Route': labelled("route"),
        'Region': region,
        'Rating': rating,
        'Title': title,
        'Date_of_Review': published,
        'Verified': verified,
        'Review': body,
        'Date Flown': date_flown,
        'Recommended': recommended if recommended is not None else "N/A",
    }
    record.update(_review_star_counts(review, categories))
    return record


def scrape_airline_reviews(base_url, pages, airline, region, timeout=30):
    """Scrape paginated customer reviews for one airline.

    Parameters
    ----------
    base_url : str
        Review-listing URL of the airline; a trailing slash is tolerated.
    pages : int
        Number of listing pages to request (100 reviews per page).
    airline : str
        Name stored in the 'Airline' column and used for the CSV file name.
    region : str
        Value stored in the 'Region' column.
    timeout : float, optional
        Seconds to wait for each HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row per review. Missing values are the string "N/A".

    Raises
    ------
    requests.HTTPError
        If a page answers with an error status other than 404.

    Side effects
    ------------
    Writes ``{airline}_reviews.csv`` after every page that yielded reviews, so
    a failure part-way through still leaves the rows collected so far on disk.
    No file is written when no review is found.
    """
    categories = ['Seat Comfort', 'Cabin Staff Service', 'Food & Beverages',
                  'Inflight Entertainment', 'Ground Service', 'Wifi & Connectivity', 'Value For Money']
    # 'Date_of_Review' is declared up front so an empty result has it too.
    columns = ['Airline', 'Region', 'Title', 'Rating', 'Date Flown', 'Aircraft', 'Type_of_Traveller',
               'Seat_Type', 'Route', 'Recommended', 'Review', 'Verified'] + categories + ['Date_of_Review']

    root_url = base_url.rstrip('/')  # avoid '//page/...' when base_url ends with '/'
    csv_file_name = f'{airline}_reviews.csv'
    records = []

    for i in range(1, pages + 1):
        url = f"{root_url}/page/{i}/?sortby=post_date%3ADesc&pagesize=100"
        response = requests.get(url, timeout=timeout)
        if response.status_code == 404:
            # Ran past the last listing page: keep what was collected.
            break
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        page_records = [_parse_review(review, airline, region, categories)
                        for review in soup.find_all("article", itemprop="review")]
        if page_records:
            records.extend(page_records)
            # Checkpoint once per page instead of once per review.
            pd.DataFrame(records, columns=columns).to_csv(csv_file_name, index=False)

    return pd.DataFrame(records, columns=columns)

### List all the airlines

British_Airways  

Lufthansa

Air_France

KLM_Royal_Dutch_Airlines

Ryanair

Turkish_Airlines

In [None]:
# Airlines to scrape. "url" is the review-listing page WITHOUT a trailing slash
# (page paths are appended to it), and "pages" is the number of 100-review
# listing pages to request for that airline.
airlines = [
    {
        "name": "British_Airways",
        "url": "https://www.airlinequality.com/airline-reviews/british-airways",
        "pages": 37,
        "region": "Europe"
    },
    {
        "name": "Lufthansa",
        "url": "https://www.airlinequality.com/airline-reviews/lufthansa",
        "pages": 24,
        "region": "Europe"
    },
    {
        "name": "Air_France",
        "url": "https://www.airlinequality.com/airline-reviews/air-france",
        "pages": 14,
        "region": "Europe"
    },
    {
        "name": "KLM_Royal_Dutch_Airlines",
        "url": "https://www.airlinequality.com/airline-reviews/klm-royal-dutch-airlines",
        "pages": 16,
        "region": "Europe"
    },
    {
        "name": "Ryanair",
        "url": "https://www.airlinequality.com/airline-reviews/ryanair",
        "pages": 23,
        "region": "Europe"
    },
    {
        "name": "Turkish_Airlines",
        "url": "https://www.airlinequality.com/airline-reviews/turkish-airlines",
        "pages": 25,
        "region": "Europe"
    }
]


### Run the functions and concatenate the results into a single DataFrame

In [None]:
# Scrape the summary page of every airline and stack the one-row results.
info_frames = []
for airline_info in airlines:
    airline_name = airline_info["name"]
    base_url = airline_info["url"]

    # Pass the name read from the config entry; the previous code passed an
    # undefined name `airline`, which fails on a fresh kernel.
    info = scrape_airline_info(base_url, airline_name)
    info_frames.append(info)

# Concatenate once instead of growing the frame inside the loop.
all_info = pd.concat(info_frames, ignore_index=True) if info_frames else pd.DataFrame()

# Save the combined summary table to a CSV file
all_info.to_csv('combined_airline_info.csv', index=False)

In [None]:
# Display the combined summary table (one row per airline).
all_info

Unnamed: 0,Airline,star_airline,Review Count,Rating,Food & Beverages,Inflight Entertainment,Seat Comfort,Staff Service,Value for Money
0,British Airways,4,3733,5,3,3,3,3,3
1,Lufthansa,4,2400,5,3,3,3,4,3
2,Air France,4,1332,5,3,3,3,3,3
3,KLM Royal Dutch Airlines,4,1567,5,3,3,3,4,3
4,Ryanair,3,2246,4,2,1,2,3,3
5,Turkish Airlines,4,2481,4,3,3,3,3,3


In [None]:
# Not passed to scrape_airline_reviews, which sets the page size in its own URL.
pagesize = 100

# Scrape every configured airline, keeping one review frame per airline.
review_frames = []
for airline_info in airlines:
    airline_name, base_url = airline_info["name"], airline_info["url"]
    pages, region = airline_info["pages"], airline_info["region"]

    reviews = scrape_airline_reviews(base_url, pages, airline_name, region)
    review_frames.append(reviews)

# Stack all per-airline frames into the combined table in a single step.
all_reviews = pd.concat([pd.DataFrame(), *review_frames], ignore_index=True)

# Persist the combined table as CSV.
all_reviews.to_csv('combined_airline_reviews.csv', index=False)

In [None]:
# Display the combined review table for all scraped airlines.
all_reviews

Unnamed: 0,Airline,Region,Title,Rating,Date Flown,Aircraft,Type_of_Traveller,Seat_Type,Route,Recommended,Review,Verified,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Date_of_Review
0,British_Airways,Europe,"baggage arrival nearly took 1.5 hours""",4,2024-01,,Business,Economy Class,Glasgow to London,no,"Overall journey wasn’t bad however at the end,...",No,3,4,,,4,,3,2024-01-12
1,British_Airways,Europe,Overall very satisfied,9,2024-01,A350-1000,Solo Leisure,Economy Class,Vancouver to London,yes,Overall very satisfied. Ground staff member at...,Yes,4,5,3,4,5,4,5,2024-01-12
2,British_Airways,Europe,"airline is a total disgrace""",1,2023-10,A320,Business,Business Class,Heathrow to Madrid,no,As always when I fly BA it was a total shamble...,Yes,1,1,1,1,1,1,1,2024-01-09
3,British_Airways,Europe,we were pleased with the service,9,2023-11,Boeing 777 -200,Couple Leisure,Business Class,London to New York,yes,First time using BA business class but we were...,Yes,5,5,5,5,5,,4,2024-01-07
4,British_Airways,Europe,Gate agent was extremely rude,6,2024-01,,Family Leisure,Economy Class,Rome to London,no,Extremely rude ground service. We were non-rev...,No,4,5,4,,1,,2,2024-01-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13721,Turkish_Airlines,Europe,Turkish Airlines customer review,10,,,,Economy Class,,yes,19/08 ATH-IST Boeing 737-800. A small flight n...,Yes,5,5,4,4,,,5,2013-09-03
13722,Turkish_Airlines,Europe,Turkish Airlines customer review,9,,,,Economy Class,,yes,Istanbul - Odessa. Flight was full but left an...,Yes,5,4,4,1,,,5,2013-09-02
13723,Turkish_Airlines,Europe,Turkish Airlines customer review,8,,,,Economy Class,,yes,Singapore to Dublin via Istanbul and return Du...,Yes,4,4,4,4,,,4,2013-08-28
13724,Turkish_Airlines,Europe,Turkish Airlines customer review,10,,,,Economy Class,,yes,I flew 2 weeks ago Amman-Istanbul-Dnepropetrov...,Yes,4,5,5,5,,,4,2013-08-27
