# Web Scraping (Data Collection)

You have to scrape at least 1500 rows of data. You can scrape more data as well; it’s up to you, and the more data, the better the model. In this section you have to scrape flight data from different websites (yatra.com, skyscanner.com, official websites of airlines, etc.). There is no limit on the number of columns; it’s up to you and your creativity. Generally, these columns are airline name, date of journey, source, destination, route, departure time, arrival time, duration, total stops and the target variable, price. You can change this list by adding or removing columns; it depends entirely on the website from which you are fetching the data.

airline name, date of journey, source, destination, departure time, arrival time, duration, total stops and the target variable price

In [1]:
# Importing Libraries/Dependencies
import selenium
import pandas as pd
import numpy as np
import time
import requests
import re
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException, ElementClickInterceptedException

In [None]:
# Connecting to the webdriver
# The path is written as a raw string: it contains the backslash sequences
# "\w" and "\c", which are not valid escapes in a normal string literal.
# NOTE(review): newer Selenium releases may expect the driver path to be passed
# through a Service object instead of positionally - confirm against the
# installed Selenium version.
driver = webdriver.Chrome(r"C:\web driver\chromedriver.exe")
driver.maximize_window()        # Maximizing the window

In [None]:
# Opening the search-results page; the query string is split over several
# adjacent literals (joined by Python into one string) to keep lines readable
url = (
    "https://flight.yatra.com/air-search-ui/dom2/trigger"
    "?type=O&viewName=normal&flexi=0&noOfSegments=1"
    "&origin=GOI&originCountry=IN"
    "&destination=BOM&destinationCountry=IN"
    "&flight_depart_date=05%2F08%2F2022"
    "&ADT=1&CHD=0&INF=0&class=Economy"
    "&source=fresco-home&unqvaldesktop=846570657840"
)
driver.get(url)
# Fixed pause after requesting the page, before anything else touches it
time.sleep(25)

In [None]:
# Cities to scrape; each one is typed into the source and destination inputs.
# Some entries carry a trailing space, which is kept exactly as it is.
loc_lst = [
    'New Delhi',
    'Mumbai',
    'Bangalore',
    'Chennai',
    'Hyderabad ',
    'Goa ',
    'Kolkata ',
    'Jaipur ',
    'Lucknow ',
]

In [None]:
# Creating one empty list per feature to be fetched:
#   Airline_name   - the name of the airline
#   Departure_time - the time when the journey starts from the source
#   Arrival_time   - time of arrival at the destination
#   Duration       - total duration taken by the flight from source to destination
#   Stops          - total stops between the source and destination
#   Source         - the source from which the service begins
#   Destination    - the destination where the service ends
#   Meal           - availability of meals in the flight
#   Price          - the price of the flight ticket
#   Location       - the location of the flights

Airline_name, Departure_time, Arrival_time, Duration, Stops = [], [], [], [], []
Source, Destination, Meal, Price, Location = [], [], [], [], []

In [None]:
# Scraping the data from the mentioned url
#
# For every ordered pair of different cities in loc_lst the loop types the
# cities into the source/destination inputs, clicks the search button, scrolls
# the page, and appends whatever the column XPaths match to the result lists.
# Each column is collected with its own page-wide query, so the lists are only
# row-aligned when every XPath matches the same number of elements.

# XPaths of the controls used to change the route and search again
SOURCE_INPUT_XPATH = "//div[@class='input-holder pb-2 bdr-btm']/input"
DESTINATION_INPUT_XPATH = "//div[@class='input-holder  bdr-btm pb-2']/input"
SEARCH_BUTTON_XPATH = "//button[@class='fs-14 btn-submit cursor-pointer bold']"
SCROLL_TO_TOP_XPATH = "//div[@title='scroll to top']"

# Arrival time is handled separately: it is cut out of the element's innerHTML
# instead of being read from its visible text
ARRIVAL_TIME_XPATH = "//div[@class='i-b pdd-0 text-left atime col-5']//p[1]"

# (result list, XPath) pairs for every column read from the element's .text
TEXT_COLUMNS = [
    (Airline_name, "//div[@class='fs-13 airline-name no-pad col-8']/span"),
    (Departure_time, "//div[@class='i-b pr']"),
    (Duration, "//div[@class='stop-cont pl-13']/p"),
    (Stops, "//div[@class=' font-lightgrey fs-10 tipsy i-b fs-10']/span[1]"),
    (Meal, "//div[@class='features pull-right fs-12 flex']/div[1]"),
    (Location, "//p[@class='fs-10 font-lightgrey no-wrap city ellipsis']"),
    (Price, "//div[@class='i-b tipsy fare-summary-tooltip fs-18']"),
]

# The wait object does not depend on the route, so it is created once
wait = WebDriverWait(driver, 10)

for x in loc_lst:
    for y in loc_lst:
        if x == y:
            continue  # a route needs two different cities

        # Typing the source city (local names are used so that the Source and
        # Destination result lists are not overwritten by web elements)
        source_box = driver.find_element(By.XPATH, SOURCE_INPUT_XPATH)
        time.sleep(2)
        source_box.clear()
        source_box.send_keys(x)
        time.sleep(2)

        # Typing the destination city
        destination_box = driver.find_element(By.XPATH, DESTINATION_INPUT_XPATH)
        destination_box.click()
        destination_box.clear()
        time.sleep(2)
        destination_box.send_keys(y)
        time.sleep(2)

        # Searching for flights again by clicking the search button
        try:
            srch_btn = driver.find_element(By.XPATH, SEARCH_BUTTON_XPATH)
        except NoSuchElementException:
            # Scraping still continues as before, but the miss is reported
            # because the page may not show results for this route
            print(f"Search button not found for {x!r} -> {y!r}; scraping the page as it is")
        else:
            wait.until(EC.visibility_of(srch_btn))
            srch_btn.click()
        time.sleep(2)

        # Scrolling down 15 times by 5000 pixels, pausing around each scroll
        # (presumably so that further results get loaded - confirm on the page)
        for _ in range(15):
            time.sleep(1)
            driver.execute_script("window.scrollBy(0,5000)")
            time.sleep(5)

        # Clicking the scroll-to-top button when the page offers one
        try:
            driver.find_element(By.XPATH, SCROLL_TO_TOP_XPATH).click()
        except NoSuchElementException:
            pass

        # Fetching every text column. find_elements returns an empty list when
        # nothing matches, so no exception handling is needed here.
        for target, xpath in TEXT_COLUMNS:
            target.extend(element.text for element in driver.find_elements(By.XPATH, xpath))

        # Fetching arrival time: the first five characters of the first
        # innerHTML line (presumably "HH:MM" - confirm against the page).
        # An element without innerHTML contributes an empty string.
        for element in driver.find_elements(By.XPATH, ARRIVAL_TIME_XPATH):
            html_lines = (element.get_attribute("innerHTML") or "").splitlines()
            Arrival_time.append(html_lines[0][0:5] if html_lines else "")

In [None]:
# Location alternates between two kinds of entries: even positions become the
# source column and odd positions the destination column
Source = Location[::2]
Destination = Location[1::2]

In [None]:
# Showing how many entries the source and destination columns hold
tuple(len(column) for column in (Source, Destination))

In [None]:
# Printing the number of scraped values per feature, in the order:
# Price, Airline_name, Departure_time, Arrival_time, Duration, Stops, Meal
features = (Price, Airline_name, Departure_time, Arrival_time, Duration, Stops, Meal)
print(*(len(feature) for feature in features))

In [None]:
# Creating a dataframe for scraped data
# Dataframe column name -> list holding the scraped values for that column
scraped_columns = {
    "Airline": Airline_name,
    "Departure_time": Departure_time,
    "Time_of_arrival": Arrival_time,
    "Duration": Duration,
    "Source": Source,
    "Destination": Destination,
    "Meal_availability": Meal,
    "Number_of_stops": Stops,
    "Price": Price,
}

# zip() stops at the shortest list, so columns of different lengths would be
# cut down without any notice; report the mismatch before building the rows
column_lengths = {name: len(values) for name, values in scraped_columns.items()}
if len(set(column_lengths.values())) > 1:
    print("Warning: scraped columns differ in length, rows are truncated to the shortest:", column_lengths)

data = list(zip(*scraped_columns.values()))
df = pd.DataFrame(data, columns=list(scraped_columns))

In [None]:
# Displaying the dataframe built from the scraped lists
df

In [None]:
# Writing the dataframe to an Excel workbook in the current working directory
df.to_excel(excel_writer="Flight_Prices.xlsx")

I have successfully scraped the required data from the website yatra.com. The dataframe consists of 5303 rows and 9 columns, and I have saved it in Excel format as "Flight_Prices.xlsx".