# Scrapping using Python's beautiful soup

### Import libraries

In [1]:
import bs4 as bs
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functools import wraps

### Test upon a single page

In [2]:
source = urllib.request.urlopen('https://www.formula1.com/en/results.html/1950/drivers.html').read()
soup = bs.BeautifulSoup(source,'lxml')
table = soup.find_all('table')[0] 

In [3]:
df = pd.read_html(str(table), flavor='bs4', header=[0])[0]
df.drop(["Unnamed: 0","Unnamed: 6"],axis=1, inplace=True)
df.head()

Unnamed: 0,Pos,Driver,Nationality,Car,PTS
0,1,Nino Farina FAR,ITA,Alfa Romeo,30
1,2,Juan Manuel Fangio FAN,ARG,Alfa Romeo,27
2,3,Luigi Fagioli FAG,ITA,Alfa Romeo,24
3,4,Louis Rosier ROS,FRA,Talbot-Lago,13
4,5,Alberto Ascari ASC,ITA,Ferrari,11


### Scrap all pages

In [4]:
homePage='https://www.formula1.com/en/results.html/'
driverPage='/drivers.html'

pages scatters between 1950 and 2021

In [5]:
for year in range(1950,2022):
    source = urllib.request.urlopen(homePage+str(year)+driverPage).read()
    soup = bs.BeautifulSoup(source,'lxml')
    table = soup.find_all('table')[0]  
    df = df.append(pd.read_html(str(table), flavor='bs4', header=[0])[0])
    df.drop(["Unnamed: 0","Unnamed: 6"],axis=1, inplace=True)

shape of df

In [6]:
df.shape

(1618, 5)

In [7]:
df

Unnamed: 0,Pos,Driver,Nationality,Car,PTS
0,1,Nino Farina FAR,ITA,Alfa Romeo,30.0
1,2,Juan Manuel Fangio FAN,ARG,Alfa Romeo,27.0
2,3,Luigi Fagioli FAG,ITA,Alfa Romeo,24.0
3,4,Louis Rosier ROS,FRA,Talbot-Lago,13.0
4,5,Alberto Ascari ASC,ITA,Ferrari,11.0
...,...,...,...,...,...
16,17,Nicholas Latifi LAT,CAN,Williams Mercedes,7.0
17,18,Antonio Giovinazzi GIO,ITA,Alfa Romeo Racing Ferrari,3.0
18,19,Mick Schumacher MSC,GER,Haas Ferrari,0.0
19,20,Robert Kubica KUB,POL,Alfa Romeo Racing Ferrari,0.0


as Pos and Nationality aren't relative to our cause which is to enrich our existing data by adding cars and last points we can eliminate unuseful columns

In [8]:
df=df.drop(columns='Pos')
df=df.drop(columns='Nationality')

later on we need to match the previous data and this current data so as the driver code is UNIQUE we need to clean the column 'Driver' and extract the Code

In [9]:
df['Code']=df['Driver'].str[-3:]
df['Driver']=df['Driver'].str[:-4]

In [10]:
df

Unnamed: 0,Driver,Car,PTS,Code
0,Nino Farina,Alfa Romeo,30.0,FAR
1,Juan Manuel Fangio,Alfa Romeo,27.0,FAN
2,Luigi Fagioli,Alfa Romeo,24.0,FAG
3,Louis Rosier,Talbot-Lago,13.0,ROS
4,Alberto Ascari,Ferrari,11.0,ASC
...,...,...,...,...
16,Nicholas Latifi,Williams Mercedes,7.0,LAT
17,Antonio Giovinazzi,Alfa Romeo Racing Ferrari,3.0,GIO
18,Mick Schumacher,Haas Ferrari,0.0,MSC
19,Robert Kubica,Alfa Romeo Racing Ferrari,0.0,KUB


chechink for duplicates

In [11]:
dfDuplicated = df[df.duplicated()]

print(df)

                Driver                        Car   PTS Code
0          Nino Farina                 Alfa Romeo  30.0  FAR
1   Juan Manuel Fangio                 Alfa Romeo  27.0  FAN
2        Luigi Fagioli                 Alfa Romeo  24.0  FAG
3         Louis Rosier                Talbot-Lago  13.0  ROS
4       Alberto Ascari                    Ferrari  11.0  ASC
..                 ...                        ...   ...  ...
16     Nicholas Latifi          Williams Mercedes   7.0  LAT
17  Antonio Giovinazzi  Alfa Romeo Racing Ferrari   3.0  GIO
18     Mick Schumacher               Haas Ferrari   0.0  MSC
19       Robert Kubica  Alfa Romeo Racing Ferrari   0.0  KUB
20      Nikita Mazepin               Haas Ferrari   0.0  MAZ

[1618 rows x 4 columns]


In [13]:
df[df['Driver']=="Lewis Hamilton"]

Unnamed: 0,Driver,Car,PTS,Code
1,Lewis Hamilton,McLaren Mercedes,109.0,HAM
0,Lewis Hamilton,McLaren Mercedes,98.0,HAM
4,Lewis Hamilton,McLaren Mercedes,49.0,HAM
3,Lewis Hamilton,McLaren Mercedes,240.0,HAM
4,Lewis Hamilton,McLaren Mercedes,227.0,HAM
3,Lewis Hamilton,McLaren Mercedes,190.0,HAM
3,Lewis Hamilton,Mercedes,189.0,HAM
0,Lewis Hamilton,Mercedes,384.0,HAM
0,Lewis Hamilton,Mercedes,381.0,HAM
1,Lewis Hamilton,Mercedes,380.0,HAM


as we can see there's many duplicates ,latest data proves that lewis Hamilton Coded HAM have currently 387.5 points which appear in the last occurence 

In [14]:
df1=df.drop_duplicates(subset=['Code'], keep='last')

In [15]:
df1.shape

(287, 4)

In [16]:
df1[df1['Driver']=="Lewis Hamilton"]

Unnamed: 0,Driver,Car,PTS,Code
1,Lewis Hamilton,Mercedes,387.5,HAM


### Export data to csv

In [18]:
df1.to_csv (r'drivers.csv', index = False, header=True)
