<a href="https://colab.research.google.com/github/varsha-angadi/Homework-Data-Scrapping/blob/main/MilitaryAircraft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing the required libraries

!pip install requests BeautifulSoup4 pandas



In [None]:
# importing the required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Request headers: a browser-style User-Agent so the request resembles normal browser traffic
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'

In [None]:
# Wikipedia page that is fetched and scraped by this notebook
url = 'https://en.wikipedia.org/wiki/List_of_active_Indian_military_aircraft'

In [None]:
# Send the GET request with the browser-like headers.
# - timeout: without it requests waits indefinitely on a stalled connection,
#   which would freeze the notebook.
# - raise_for_status(): stop here on an HTTP 4xx/5xx answer instead of
#   silently parsing an error page in the following cells.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

In [None]:
# Turn the downloaded page body into a navigable BeautifulSoup tree ('html.parser' backend)
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [None]:
# Collect every <table> element that carries the 'wikitable' CSS class
tables = soup.find_all(name='table', attrs={'class': 'wikitable'})

In [None]:
# Gather the 'Aircraft' and 'Origin' cell texts from every table that has both
# columns. The two lists are kept in step: entry i of each list comes from the
# same table row.
all_aircraft = []
all_origins = []

for table in tables:
    # Header texts of this table. Deliberately NOT called `headers`: that name
    # holds the request-header dict passed to requests.get(), and overwriting
    # it here would break the request cell when it is re-run.
    column_names = [th.text.strip() for th in table.find_all('th')]

    # A table is only useful when it exposes both columns; decide that once
    # per table instead of re-checking for every row.
    if 'Aircraft' not in column_names or 'Origin' not in column_names:
        continue
    aircraft_index = column_names.index('Aircraft')
    origin_index = column_names.index('Origin')
    highest_index = max(aircraft_index, origin_index)

    for tr in table.find_all('tr')[1:]:  # Skip the header row
        cells = tr.find_all(['td', 'th'])
        # Rows with too few cells cannot be indexed safely, so they are skipped.
        if len(cells) <= highest_index:
            continue
        # NOTE(review): cells are looked up by position only. Some rows come out
        # misaligned (e.g. an Origin of '28[1]'), presumably because merged
        # (rowspan/colspan) cells shift the positions — TODO confirm. Those rows
        # are filtered out later by the digit check on 'Origin'.
        all_aircraft.append(cells[aircraft_index].text.strip())
        all_origins.append(cells[origin_index].text.strip())



In [None]:
# Combine the two parallel lists into a two-column DataFrame
column_data = {'Aircraft': all_aircraft, 'Origin': all_origins}
df = pd.DataFrame(data=column_data)

In [None]:
# Preview the scraped table. Some rows are misaligned: e.g. row 1 shows 'EH' as
# Aircraft and '28[1]' as Origin, so the data still needs cleaning.
df

Unnamed: 0,Aircraft,Origin
0,Dassault Rafale,France
1,EH,28[1]
2,HAL Tejas,India
3,Conversion trainer,Mk.1 Trainer
4,Multirole,Mk.1A
...,...,...
81,MQ-9 Reaper,United States
82,Drishti-10,Israel/India
83,Dornier Do 228,Germany
84,HAL Dhruv,India


In [None]:
# Keep only the rows whose 'Origin' contains no digit. In the preview, Origin
# values with digits (e.g. '28[1]', 'Mk.1A') are misplaced cells, not countries.
origin_has_digit = df['Origin'].str.contains(r'\d+')
udf = df.loc[~origin_has_digit]


In [None]:
# Preview the filtered table; the original index labels are kept, so gaps
# in the index (0, 2, 5, ...) show where rows were removed.
udf

Unnamed: 0,Aircraft,Origin
0,Dassault Rafale,France
2,HAL Tejas,India
5,Sukhoi Su-30MKI,Russia
6,Mikoyan MiG-29,Soviet Union
8,Dassault Mirage 2000,France
...,...,...
81,MQ-9 Reaper,United States
82,Drishti-10,Israel/India
83,Dornier Do 228,Germany
84,HAL Dhruv,India


In [None]:
# Export the filtered table to CSV; index=False leaves the row index out of the file
udf.to_csv(path_or_buf='indian_military_aircraft_names_origin.csv', index=False)

In [None]:
# Tell the reader where the exported data was written
message = "Data has been successfully extracted and saved to 'indian_military_aircraft_names_origin.csv'."
print(message)

Data has been successfully extracted and saved to 'indian_military_aircraft_names_origin.csv'.
