In [1]:
# Importing Libraries

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
from time import sleep
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Launching the driver

driver=webdriver.Chrome(r"C:/Users/acer/Downloads/Compressed/chromedriver_win32_2/chromedriver.exe")
driver.implicitly_wait(0.5)

In [3]:
# Get user input for routes

sources = []
destinations = []
print("Please enter -1 when done.")
print("-"*10)
while True:
    sources.append(input("From which city?\n"))
    if "-1" in sources: 
        sources.pop(-1)
        break
    destinations.append(input("Where to?\n"))
    if "-1" in destinations: 
        sources.pop(-1)
        destinations.pop(-1)
        break
    print("-"*10)

print("\nRoutes:")
for i in range(len(sources)):
    print(f"{sources[i]} => {destinations[i]}")

Please enter -1 when done.
----------
From which city?
DEL
Where to?
BOM
----------
From which city?
BOM
Where to?
BLR
----------
From which city?
BLR
Where to?
HYD
----------
From which city?
HYD
Where to?
MAA
----------
From which city?
MAA
Where to?
DEL
----------
From which city?
-1

Routes:
DEL => BOM
BOM => BLR
BLR => HYD
HYD => MAA
MAA => DEL


In [4]:
# get user input for period (start and end date)

start_date = np.datetime64(input('Start Date, Please use YYYY-MM-DD format only '))
end_date = np.datetime64(input('End Date, Please use YYYY-MM-DD format only '))
days = end_date - start_date
num_days = days.item().days

Start Date, Please use YYYY-MM-DD format only 2022-04-01
End Date, Please use YYYY-MM-DD format only 2022-04-10


In [5]:
# Defing functions for scraping

def get_airlines(soup):
    airline = []
    airlines = soup.find_all('span',class_='codeshares-airline-names',text=True)
    for i in airlines:
        airline.append(i.text)
    return airline
    
def get_total_stops(soup):
    stops_list = []
    stops = soup.find_all('div',class_='section stops')

    for i in stops:
        for j in i.find_all('span',class_='stops-text'):
               stops_list.append(j.text)
    return stops_list

def get_price(soup):
    prices = []
    price = soup.find_all('div',class_='Flights-Results-FlightPriceSection right-alignment sleek')

    for i in price:
        for j in i.find_all('span', class_='price-text'):
            prices.append(j.text)
    return prices

def get_duration(soup):
    duration_list = []
    duration = soup.find_all('div' , class_='section duration allow-multi-modal-icons')
    for i in duration:
        for j in i.find_all('div',class_='top'):
            duration_list.append(j.text)
    return duration_list

In [9]:
# Scraping

for i in range(len(sources)):
    column_names = ["Airline", "Source", "Destination","Duration" ,"Total stops", "Price","Date"]
    df = pd.DataFrame(columns = column_names)
    for j in tqdm(range(num_days+1)):
        
        # close and open driver every 10 days to avoid captcha
        if j % 10 == 0:
            driver.quit()
            driver = webdriver.Chrome(r"C:/Users/acer/Downloads/Compressed/chromedriver_win32_2/chromedriver.exe")
            
        url = f"https://www.en.kayak.sa/flights/{sources[i]}-{destinations[i]}/{start_date+j}"
        driver.get(url)
        sleep(15)
        
        # click show more button to get all flights
        try:
            show_more_button = driver.find_element_by_xpath('//a[@class = "moreButton"]')
        except:
            
            # in case a captcha appears, require input from user so that the for loop pauses and the user can continue the
            # loop after solving the captcha
            input("Please solve the captcha then enter anything here to resume scraping.")
            
        while True:
            try:
                show_more_button.click()
                driver.implicitly_wait(10)
            except:
                break
    
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        airlines = get_airlines(soup)
        total_stops = get_total_stops(soup)
        prices = get_price(soup)
        duration = get_duration(soup)
        df = df.append(pd.DataFrame({
            'Airline': airlines,
            'Duration': duration,
            'Total stops' : total_stops,
            'Price' : prices,
            'Date' : start_date+j
                                    }))
        
    df['Source'] = sources[i]
    df['Destination'] = destinations[i]
    df = df.replace('\n','', regex=True)
    df = df.reset_index(drop = True)
    
    # save data as csv file for each route
    df.to_csv(f'{sources[i]}_{destinations[i]}.csv',index=False)
    print(f"Succesfully saved {sources[i]} => {destinations[i]} route as {sources[i]}_{destinations[i]}.csv ")
    
driver.quit()




  0%|                                                                                           | 0/10 [00:00<?, ?it/s][A[A[A


 10%|████████▎                                                                          | 1/10 [01:12<10:55, 72.79s/it][A[A[A


 20%|████████████████▌                                                                  | 2/10 [02:09<09:02, 67.85s/it][A[A[A


 30%|████████████████████████▉                                                          | 3/10 [03:05<07:31, 64.53s/it][A[A[A


 40%|█████████████████████████████████▏                                                 | 4/10 [04:08<06:22, 63.81s/it][A[A[A


 50%|█████████████████████████████████████████▌                                         | 5/10 [05:06<05:10, 62.10s/it][A[A[A


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [06:00<03:59, 59.76s/it][A[A[A


 70%|██████████████████████████████████████████████████████████                 

Succesfully saved DEL => BOM route as DEL_BOM.csv 





 10%|████████▎                                                                          | 1/10 [00:52<07:54, 52.73s/it][A[A[A


 20%|████████████████▌                                                                  | 2/10 [01:41<06:53, 51.64s/it][A[A[A


 30%|████████████████████████▉                                                          | 3/10 [02:23<05:40, 48.68s/it][A[A[A


 40%|█████████████████████████████████▏                                                 | 4/10 [03:05<04:39, 46.64s/it][A[A[A


 50%|█████████████████████████████████████████▌                                         | 5/10 [03:42<03:38, 43.80s/it][A[A[A


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [04:33<03:04, 46.05s/it][A[A[A


 70%|██████████████████████████████████████████████████████████                         | 7/10 [05:12<02:11, 43.82s/it][A[A[A


 80%|██████████████████████████████████████████████████████████████████▍        

Succesfully saved BOM => BLR route as BOM_BLR.csv 


ValueError: All arrays must be of the same length

In [16]:
df1 = pd.read_csv("DEL_BOM.csv")
df1

Unnamed: 0,Airline,Source,Destination,Duration,Total stops,Price,Date
0,Air India,DEL,BOM,2h 05m,nonstop,423 SAR,2022-04-01
1,Vistara,DEL,BOM,2h 00m,nonstop,360 SAR,2022-04-01
2,SpiceJet,DEL,BOM,2h 15m,nonstop,320 SAR,2022-04-01
3,Air India,DEL,BOM,2h 05m,nonstop,358 SAR,2022-04-01
4,Air India,DEL,BOM,2h 10m,nonstop,358 SAR,2022-04-01
...,...,...,...,...,...,...,...
1111,SpiceJet,DEL,BOM,8h 00m,1 stop,"2,067 SAR",2022-04-10
1112,SpiceJet,DEL,BOM,6h 50m,1 stop,"2,335 SAR",2022-04-10
1113,SpiceJet,DEL,BOM,6h 20m,1 stop,"2,671 SAR",2022-04-10
1114,SpiceJet,DEL,BOM,6h 20m,1 stop,"3,646 SAR",2022-04-10


In [17]:
df2 = pd.read_csv("BOM_BLR.csv")
df2

Unnamed: 0,Airline,Source,Destination,Duration,Total stops,Price,Date
0,Air India,BOM,BLR,1h 40m,nonstop,366 SAR,2022-04-01
1,Air India,BOM,BLR,1h 40m,nonstop,300 SAR,2022-04-01
2,SpiceJet,BOM,BLR,1h 50m,nonstop,269 SAR,2022-04-01
3,Air India,BOM,BLR,1h 40m,nonstop,300 SAR,2022-04-01
4,Air India,BOM,BLR,1h 40m,nonstop,300 SAR,2022-04-01
...,...,...,...,...,...,...,...
727,SpiceJet,BOM,BLR,6h 55m,1 stop,"1,909 SAR",2022-04-10
728,SpiceJet,BOM,BLR,6h 55m,1 stop,"1,943 SAR",2022-04-10
729,Vistara,BOM,BLR,6h 25m,1 stop,732 SAR,2022-04-10
730,Vistara,BOM,BLR,6h 55m,1 stop,732 SAR,2022-04-10


In [20]:
df =pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,Airline,Source,Destination,Duration,Total stops,Price,Date
0,Air India,DEL,BOM,2h 05m,nonstop,423 SAR,2022-04-01
1,Vistara,DEL,BOM,2h 00m,nonstop,360 SAR,2022-04-01
2,SpiceJet,DEL,BOM,2h 15m,nonstop,320 SAR,2022-04-01
3,Air India,DEL,BOM,2h 05m,nonstop,358 SAR,2022-04-01
4,Air India,DEL,BOM,2h 10m,nonstop,358 SAR,2022-04-01
...,...,...,...,...,...,...,...
1843,SpiceJet,BOM,BLR,6h 55m,1 stop,"1,909 SAR",2022-04-10
1844,SpiceJet,BOM,BLR,6h 55m,1 stop,"1,943 SAR",2022-04-10
1845,Vistara,BOM,BLR,6h 25m,1 stop,732 SAR,2022-04-10
1846,Vistara,BOM,BLR,6h 55m,1 stop,732 SAR,2022-04-10


In [21]:
print(df.to_string())

                         Airline Source Destination Duration Total stops       Price        Date
0                      Air India    DEL         BOM  2h 05m     nonstop     423 SAR   2022-04-01
1                        Vistara    DEL         BOM  2h 00m     nonstop     360 SAR   2022-04-01
2                       SpiceJet    DEL         BOM  2h 15m     nonstop     320 SAR   2022-04-01
3                      Air India    DEL         BOM  2h 05m     nonstop     358 SAR   2022-04-01
4                      Air India    DEL         BOM  2h 10m     nonstop     358 SAR   2022-04-01
5                      Air India    DEL         BOM  2h 10m     nonstop     358 SAR   2022-04-01
6                      Air India    DEL         BOM  2h 15m     nonstop     358 SAR   2022-04-01
7                      Air India    DEL         BOM  2h 15m     nonstop     358 SAR   2022-04-01
8                        Vistara    DEL         BOM  2h 05m     nonstop     360 SAR   2022-04-01
9                        Vista

In [22]:
# Save DataSet

df.to_csv("df_flight.csv")