### Importing Libraries

In [1]:
import numpy as np
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup


### Data of buses from HYD to VJY

In [2]:

# Initialize a Selenium WebDriver (you need to have appropriate driver installed)
driver = webdriver.Chrome()  # You can use other drivers like Firefox, etc.
url = r'https://www.redbus.in/bus-tickets/hyderabad-to-vijayawada?fromCityName=Hyderabad&fromCityId=124&srcCountry=IND&toCityName=Vijayawada&toCityId=134&destCountry=IND&onward=14-Mar-2024&opId=0&busType=Any'
# Load the webpage
driver.get(url)

# Scroll down to load more results (assuming you need to scroll multiple times)
# You may need to adjust the scrolling mechanism based on the structure of the webpage
# Example:
for _ in range(40):  # Scroll down 5 times (adjust as needed)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for some time for the content to load
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.clearfix.bus-item")))

# Now that all results are loaded, get the page source
page_source = driver.page_source

# Parse the page source using BeautifulSoup
soup = BeautifulSoup(page_source, "html.parser")
# Extract the desired information
bus_names = []
from_=[]
to_=[]
pickup_time=[]
drop_time=[]
prices=[]
ratings=[]
bus_type=[]
duration=[]

for i in soup.find_all('div', class_='clearfix bus-item'):
    bus = i.find('div', class_='travels lh-24 f-bold d-color')
    bus_names.append(bus.text)
    from_.append('HYD')
    to_.append('VJY')
    price = i.find('div',class_='seat-fare')
    p = re.findall('INR\s(\d+)',price.text)
    prices.append(p[0])
    rating = i.find('div',class_='column-six p-right-10 w-10 fl')
    rat = re.findall('([\d\.]+)',rating.text)
    try:
        ratings.append(rat[0]) 
    except:
        ratings.append(np.nan) 
    ptime = i.find('div', class_='dp-time f-19 d-color f-bold')
    pickup_time.append(ptime.text)
    dtime=i.find('div', class_='bp-time f-19 d-color disp-Inline')
    drop_time.append(dtime.text)
    btype=i.find('div', class_='bus-type f-12 m-top-16 l-color evBus')
    bus_type.append(btype.text)
    dur=i.find('div',class_='dur l-color lh-24')
    reg=re.findall('(\d+)h',dur.text)
    duration.append(reg[0])
        

# Close the WebDriver
driver.quit()


In [3]:
df=pd.DataFrame({'Bus_Name':bus_names,
                 'Bus_type':bus_type,
              "from":from_,
              "to":to_,
              "pickup_time":pickup_time,
              "drop_time":drop_time,
                 'duration':duration,
              'prices':prices,
              'ratings':ratings})

In [4]:
df

Unnamed: 0,Bus_Name,Bus_type,from,to,pickup_time,drop_time,duration,prices,ratings
0,IntrCity SmartBus,A/C Seater / Sleeper (2+1),HYD,VJY,23:50,05:35,05,706,4.7
1,IntrCity SmartBus,AC Sleeper (2+1),HYD,VJY,23:15,06:20,07,817,4.7
2,IntrCity SmartBus,A/C Seater / Sleeper (2+1),HYD,VJY,23:30,06:05,06,642,4.7
3,AdFRESHBUS,Electric A/C Seater (2+2),HYD,VJY,11:00,17:40,06,489,4.5
4,NueGo,Electric A/C Seater (2+2),HYD,VJY,05:00,11:05,06,499,4.6
...,...,...,...,...,...,...,...,...,...
205,SAMANVI CITICONNECT,A/C Sleeper (2+1),HYD,VJY,22:25,04:05,05,1200,3.6
206,CVR Travels,Non A/C Seater / Sleeper (2+1),HYD,VJY,22:15,04:17,06,780,3.6
207,BSR Tours And Travels,A/C Seater / Sleeper (2+1),HYD,VJY,22:00,04:10,06,780,3.6
208,Ajay Bus,Non A/C Seater / Sleeper (2+1),HYD,VJY,22:45,04:40,05,780,3.6


### Data of buses from HYD to BLR

In [5]:

# Initialize a Selenium WebDriver (you need to have appropriate driver installed)
driver = webdriver.Chrome()  # You can use other drivers like Firefox, etc.
url = r'https://www.redbus.in/bus-tickets/hyderabad-to-bangalore?fromCityName=Hyderabad&fromCityId=124&srcCountry=IND&toCityName=Bangalore&toCityId=122&destCountry=IND&onward=14-Mar-2024&opId=0&busType=Any'
# Load the webpage
driver.get(url)

# Scroll down to load more results (assuming you need to scroll multiple times)
# You may need to adjust the scrolling mechanism based on the structure of the webpage
# Example:
for _ in range(40):  # Scroll down 5 times (adjust as needed)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for some time for the content to load
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.clearfix.bus-item")))

# Now that all results are loaded, get the page source
page_source = driver.page_source

# Parse the page source using BeautifulSoup
soup = BeautifulSoup(page_source, "html.parser")

# Extract the desired information
bus_names = []
from_=[]
to_=[]
pickup_time=[]
drop_time=[]
prices=[]
ratings=[]
bus_type=[]
duration=[]
for i in soup.find_all('div', class_='clearfix bus-item'):
    bus = i.find('div', class_='travels lh-24 f-bold d-color')
    bus_names.append(bus.text)
    from_.append('HYD')
    to_.append('BLR')
    price = i.find('div',class_='seat-fare')
    p = re.findall('INR\s(\d+)',price.text)
    prices.append(p[0])
    rating = i.find('div',class_='column-six p-right-10 w-10 fl')
    rat = re.findall('([\d\.]+)',rating.text)
    try:
        ratings.append(rat[0]) 
    except:
        ratings.append(np.nan) 
    ptime = i.find('div', class_='dp-time f-19 d-color f-bold')
    pickup_time.append(ptime.text)
    dtime=i.find('div', class_='bp-time f-19 d-color disp-Inline')
    drop_time.append(dtime.text)
    btype=i.find('div', class_='bus-type f-12 m-top-16 l-color evBus')
    bus_type.append(btype.text)
    dur=i.find('div',class_='dur l-color lh-24')
    reg=re.findall('(\d+)h',dur.text)
    duration.append(reg[0])

# Close the WebDriver
driver.quit()

In [6]:
df1=pd.DataFrame({'Bus_Name':bus_names,
                 'Bus_type':bus_type,
              "from":from_,
              "to":to_,
              "pickup_time":pickup_time,
              "drop_time":drop_time,
                  'duration':duration,
              'prices':prices,
              'ratings':ratings})

In [7]:
df1

Unnamed: 0,Bus_Name,Bus_type,from,to,pickup_time,drop_time,duration,prices,ratings
0,KSM Roadways,A/C Volvo B11R Multi-Axle Sleeper (2+1),HYD,BLR,22:00,07:46,09,1899,4.8
1,VSR Tours and Travels,A/C Sleeper (2+1),HYD,BLR,20:40,07:15,10,899,4.7
2,Jabbar Travels,Scania AC Multi Axle Sleeper (2+1),HYD,BLR,22:30,08:10,09,1200,4.7
3,AdNovember Travels,A/C Seater / Sleeper (2+1),HYD,BLR,23:20,09:15,09,1070,4.3
4,Jabbar Travels,VE A/C Sleeper (2+1),HYD,BLR,21:40,07:35,09,1200,4.7
...,...,...,...,...,...,...,...,...,...
169,VRL Travels,A/C Sleeper (2+1),HYD,BLR,21:00,07:15,10,1000,1.9
170,VRL Travels,A/C Sleeper (2+1),HYD,BLR,20:00,07:30,11,1200,3.2
171,VRL Travels,NON A/C Sleeper (2+1),HYD,BLR,23:30,07:30,08,1000,4.3
172,VRL Travels,NON A/C Sleeper (2+1),HYD,BLR,21:45,06:30,08,900,3.5


### Data of buses from BLR to CHN

In [8]:


# Initialize a Selenium WebDriver (you need to have appropriate driver installed)
driver = webdriver.Chrome()  # You can use other drivers like Firefox, etc.
url = r'https://www.redbus.in/search?fromCityName=Bangalore&fromCityId=122&srcCountry=IND&toCityName=Chennai&toCityId=123&destCountry=IND&onward=14-Mar-2024&opId=0&busType=Anyy'
# Load the webpage
driver.get(url)

# Scroll down to load more results (assuming you need to scroll multiple times)
# You may need to adjust the scrolling mechanism based on the structure of the webpage
# Example:
for _ in range(40):  # Scroll down 5 times (adjust as needed)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for some time for the content to load
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.clearfix.bus-item")))

# Now that all results are loaded, get the page source
page_source = driver.page_source

# Parse the page source using BeautifulSoup
soup = BeautifulSoup(page_source, "html.parser")
# Extract the desired information
bus_names = []
from_=[]
to_=[]
pickup_time=[]
drop_time=[]
prices=[]
ratings=[]
bus_type=[]
duration=[]

for i in soup.find_all('div', class_='clearfix bus-item'):
    bus = i.find('div', class_='travels lh-24 f-bold d-color')
    bus_names.append(bus.text)
    from_.append('BLR')
    to_.append('CHN')
    price = i.find('div',class_='seat-fare')
    p = re.findall('INR\s(\d+)',price.text)
    prices.append(p[0])
    rating = i.find('div',class_='column-six p-right-10 w-10 fl')
    rat = re.findall('([\d\.]+)',rating.text)
    try:
        ratings.append(rat[0]) 
    except:
        ratings.append(np.nan) 

    ptime = i.find('div', class_='dp-time f-19 d-color f-bold')
    pickup_time.append(ptime.text)
    dtime=i.find('div', class_='bp-time f-19 d-color disp-Inline')
    drop_time.append(dtime.text)
    btype=i.find('div', class_='bus-type f-12 m-top-16 l-color evBus')
    bus_type.append(btype.text)
    dur=i.find('div',class_='dur l-color lh-24')
    reg=re.findall('(\d+)h',dur.text)
    duration.append(reg[0])
        

# Close the WebDriver
driver.quit()


In [9]:
df2=pd.DataFrame({'Bus_Name':bus_names,
                 'Bus_type':bus_type,
              "from":from_,
              "to":to_,
              "pickup_time":pickup_time,
              "drop_time":drop_time,
                  'duration':duration,
              'prices':prices,
              'ratings':ratings})

In [10]:
df2

Unnamed: 0,Bus_Name,Bus_type,from,to,pickup_time,drop_time,duration,prices,ratings
0,KMRL Kalaimakal,A/C Seater / Sleeper (2+1),BLR,CHN,22:15,05:00,06,640,4.8
1,V2K Travels,A/C Sleeper (2+1),BLR,CHN,22:00,04:50,06,800,4.7
2,Jabbar Travels,Volvo A/C B11R Multi Axle Semi Sleeper (2+2),BLR,CHN,23:15,05:25,06,500,4.6
3,AdIntrCity SmartBus,A/C Seater / Sleeper (2+1),BLR,CHN,23:05,06:10,07,607,4.1
4,KMRL Kalaimakal,A/C Sleeper (2+1),BLR,CHN,22:00,04:45,06,850,4.7
...,...,...,...,...,...,...,...,...,...
127,National travels,NON AC Seater / Sleeper 2+1,BLR,CHN,23:40,06:20,06,399,3.2
128,National travels,A/C Sleeper (2+1),BLR,CHN,14:10,21:20,07,745,3.4
129,National travels,A/C Sleeper (2+1),BLR,CHN,22:30,04:55,06,650,3.9
130,VJS Travels,Bharat Benz A/C Sleeper (2+1),BLR,CHN,04:30,10:20,05,700,2.9


### Concatination of all bus journeys

In [11]:
bus_data=pd.concat([df,df1,df2],axis=0,ignore_index=True)

In [12]:
bus_data

Unnamed: 0,Bus_Name,Bus_type,from,to,pickup_time,drop_time,duration,prices,ratings
0,IntrCity SmartBus,A/C Seater / Sleeper (2+1),HYD,VJY,23:50,05:35,05,706,4.7
1,IntrCity SmartBus,AC Sleeper (2+1),HYD,VJY,23:15,06:20,07,817,4.7
2,IntrCity SmartBus,A/C Seater / Sleeper (2+1),HYD,VJY,23:30,06:05,06,642,4.7
3,AdFRESHBUS,Electric A/C Seater (2+2),HYD,VJY,11:00,17:40,06,489,4.5
4,NueGo,Electric A/C Seater (2+2),HYD,VJY,05:00,11:05,06,499,4.6
...,...,...,...,...,...,...,...,...,...
511,National travels,NON AC Seater / Sleeper 2+1,BLR,CHN,23:40,06:20,06,399,3.2
512,National travels,A/C Sleeper (2+1),BLR,CHN,14:10,21:20,07,745,3.4
513,National travels,A/C Sleeper (2+1),BLR,CHN,22:30,04:55,06,650,3.9
514,VJS Travels,Bharat Benz A/C Sleeper (2+1),BLR,CHN,04:30,10:20,05,700,2.9


##### Classification of buses of AC_type

In [52]:
AC_type=[]
for i in bus_data["Bus_type"]:
            if "NON A/C" in i:
                AC_type.append('NON A/C')
            elif "Non A/C" in i:
                AC_type.append('NON A/C')
            else:
                AC_type.append('A/C')

##### Classification of buses of Seater

In [53]:
Seater=[]
for i in bus_data["Bus_type"]:
            if "Seater"  in i:
                Seater.append('yes')
            else:
                Seater.append('no')

##### Classification of buses of Sleeper

In [54]:
sleeper=[]
for i in bus_data['Bus_type']:
    reg=re.findall('^(?!.*Semi Sleeper).*(Sleeper).*$',i)
    if len(reg)==1:
        sleeper.append('yes')
    else:
        sleeper.append('no')

##### Classification of buses of Semi Sleeper

In [55]:
semi_sleeper=[]
for i in bus_data['Bus_type']:
    reg=re.findall('Semi Sleeper',i)
    if len(reg)==1:
        semi_sleeper.append('yes')
    else:
        semi_sleeper.append('no')

### Adding new columns to dataframes

In [56]:
bus_data['AC_type'] = AC_type
bus_data['Seater'] = Seater
bus_data['semi_sleeper'] = semi_sleeper
bus_data['sleeper'] = sleeper

In [57]:
bus_data

Unnamed: 0,Bus_Name,Bus_type,from,to,pickup_time,drop_time,duration,prices,ratings,AC_type,Seater,semi_sleeper,sleeper
0,IntrCity SmartBus,A/C Seater / Sleeper (2+1),HYD,VJY,23:50,05:35,05,706,4.7,A/C,yes,no,yes
1,IntrCity SmartBus,AC Sleeper (2+1),HYD,VJY,23:15,06:20,07,817,4.7,A/C,no,no,yes
2,IntrCity SmartBus,A/C Seater / Sleeper (2+1),HYD,VJY,23:30,06:05,06,642,4.7,A/C,yes,no,yes
3,AdFRESHBUS,Electric A/C Seater (2+2),HYD,VJY,11:00,17:40,06,489,4.5,A/C,yes,no,no
4,NueGo,Electric A/C Seater (2+2),HYD,VJY,05:00,11:05,06,499,4.6,A/C,yes,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,National travels,NON AC Seater / Sleeper 2+1,BLR,CHN,23:40,06:20,06,399,3.2,A/C,yes,no,yes
512,National travels,A/C Sleeper (2+1),BLR,CHN,14:10,21:20,07,745,3.4,A/C,no,no,yes
513,National travels,A/C Sleeper (2+1),BLR,CHN,22:30,04:55,06,650,3.9,A/C,no,no,yes
514,VJS Travels,Bharat Benz A/C Sleeper (2+1),BLR,CHN,04:30,10:20,05,700,2.9,A/C,no,no,yes


## Missing Values

In [58]:
bus_data.isna().sum()

Bus_Name        0
Bus_type        0
from            0
to              0
pickup_time     0
drop_time       0
duration        0
prices          0
ratings         0
AC_type         0
Seater          0
semi_sleeper    0
sleeper         0
dtype: int64

In [59]:
bus_data['ratings'].unique()

array(['4.7', '4.5', '4.6', '4.4', '4.3', '4.2', '4.1', '4.0', '3.9',
       '3.8', '3.6', '3.5', '3.4', '3.7', '3.3', '3.2', '3.0', '4.8',
       '3.1', '2.9', '2.8', '2.6', '2.5', '2.3', '2.1', '2.7', '1.8',
       '2.0', '2.2', '1.6', '2.4', '1.7', '1.5', '1.9', '1.3', '4.9'],
      dtype=object)

In [60]:
bus_data.dropna(subset=['ratings'],inplace=True)

In [61]:
bus_data['ratings'].unique()

array(['4.7', '4.5', '4.6', '4.4', '4.3', '4.2', '4.1', '4.0', '3.9',
       '3.8', '3.6', '3.5', '3.4', '3.7', '3.3', '3.2', '3.0', '4.8',
       '3.1', '2.9', '2.8', '2.6', '2.5', '2.3', '2.1', '2.7', '1.8',
       '2.0', '2.2', '1.6', '2.4', '1.7', '1.5', '1.9', '1.3', '4.9'],
      dtype=object)

In [62]:
bus_data.isna().sum()

Bus_Name        0
Bus_type        0
from            0
to              0
pickup_time     0
drop_time       0
duration        0
prices          0
ratings         0
AC_type         0
Seater          0
semi_sleeper    0
sleeper         0
dtype: int64

## Duplicates Value

In [63]:
bus_data.duplicated().sum()

0

In [64]:
bus_data.drop_duplicates(inplace=True)

In [65]:
bus_data.duplicated().sum()

0

### Coverting Dataframe to csv file to perform Exploratory Data Analysis

In [66]:
bus_data

Unnamed: 0,Bus_Name,Bus_type,from,to,pickup_time,drop_time,duration,prices,ratings,AC_type,Seater,semi_sleeper,sleeper
0,IntrCity SmartBus,A/C Seater / Sleeper (2+1),HYD,VJY,23:50,05:35,05,706,4.7,A/C,yes,no,yes
1,IntrCity SmartBus,AC Sleeper (2+1),HYD,VJY,23:15,06:20,07,817,4.7,A/C,no,no,yes
2,IntrCity SmartBus,A/C Seater / Sleeper (2+1),HYD,VJY,23:30,06:05,06,642,4.7,A/C,yes,no,yes
3,AdFRESHBUS,Electric A/C Seater (2+2),HYD,VJY,11:00,17:40,06,489,4.5,A/C,yes,no,no
4,NueGo,Electric A/C Seater (2+2),HYD,VJY,05:00,11:05,06,499,4.6,A/C,yes,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,National travels,NON AC Seater / Sleeper 2+1,BLR,CHN,23:40,06:20,06,399,3.2,A/C,yes,no,yes
512,National travels,A/C Sleeper (2+1),BLR,CHN,14:10,21:20,07,745,3.4,A/C,no,no,yes
513,National travels,A/C Sleeper (2+1),BLR,CHN,22:30,04:55,06,650,3.9,A/C,no,no,yes
514,VJS Travels,Bharat Benz A/C Sleeper (2+1),BLR,CHN,04:30,10:20,05,700,2.9,A/C,no,no,yes


In [67]:
bus_data.to_csv('redbus.csv')