In [1]:
import pandas as pd
from selenium import webdriver
import time
import datetime
#
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
#
import bs4
import urllib
#
import random

## SCRAPING DATA

In [2]:
# Load the rankings collected so far; the saved row index comes back as an
# 'Unnamed: 0' column, which is discarded right away.
data_old = pd.read_csv('bwf_historic_rankings.csv').drop(columns='Unnamed: 0')

In [3]:
# Preview the first rows of the historic rankings.
data_old.head()

Unnamed: 0,date,draw,rank,points,country,name
0,2022-08-15,MS,1,121606,DENMARK,VIKTOR AXELSEN
1,2022-08-15,MS,2,112704,JAPAN,KENTO MOMOTA
2,2022-08-15,MS,3,98300,DENMARK,ANDERS ANTONSEN
3,2022-08-15,MS,4,93776,CHINESE TAIPEI,CHOU TIEN CHEN
4,2022-08-15,MS,5,91578,MALAYSIA,LEE ZII JIA


In [4]:
# Most recent ranking date already stored. `date` is shown above as
# 'YYYY-MM-DD' text, so the maximum of the strings is also the latest date.
last_fetched_date = data_old['date'].max()
last_fetched_date

'2022-08-15'

In [8]:
# Start a Selenium-controlled Chrome session.
driver = webdriver.Chrome()

# Open the rankings page and maximise the browser window.
driver.get('https://badmintonstatistics.net/Rankings')
driver.maximize_window()

In [9]:
# Wrap the ranking-period <select> element so its options can be chosen from code.
select = Select(driver.find_element(By.XPATH, '//*[@id="rankingPeriodsSelect"]'))
# Select by position (not by visible text): index 0 is the first option in the list.
select.select_by_index(0)

In [10]:
#getting last ID vs historic data
last_current_week = driver.find_element(By.XPATH, '//*[@id="rankingPeriodsSelect"]/option[1]').text.replace('/', '-')
last_current_week

'26-12-2022'

In [11]:
# Accumulator for the scrape: one independent, initially empty list per column.
dic_ranking = {column: [] for column in ('date', 'draw', 'top_100')}

In [12]:
# Show the cut-off date again for reference before the period list is filtered.
last_fetched_date

'2022-08-15'

In [13]:
# The dropdown's text holds one period per line. Spaces are removed, '/' becomes
# '-', and each entry is re-ordered by character position from day-first to
# year-first so the values sort like the dates already stored.
date_listing = [
    f'{entry[6:10]}-{entry[3:5]}-{entry[0:2]}'
    for entry in (
        driver.find_element(By.XPATH, '//*[@id="rankingPeriodsSelect"]')
        .text.replace(' ', '')
        .replace('/', '-')
        .split('\n')
    )
]
date_listing[0:5]

['2022-12-26', '2022-12-19', '2022-12-12', '2022-12-05', '2022-11-28']

In [14]:
# Keep only the periods that come after the last date already stored
# (plain string comparison), then show both ends of the resulting list.
date_listing_filtered = list(filter(lambda period: period > last_fetched_date, date_listing))
display(date_listing_filtered[:5], date_listing_filtered[-5:])

['2022-12-26', '2022-12-19', '2022-12-12', '2022-12-05', '2022-11-28']

['2022-09-19', '2022-09-12', '2022-09-05', '2022-08-29', '2022-08-22']

In [15]:
# Visit one page per (period, discipline) pair and store the raw text of its
# ranking table together with the pair that produced it.
for date in date_listing_filtered:
    for draw in ('MS', 'WS', 'MD', 'WD', 'XD'):
        driver.get(
            'https://badmintonstatistics.net/Rankings'
            f'?date={date}&category={draw}&country=%&page=1&pagesize=100'
        )
        top_100 = driver.find_element(By.XPATH, '//*[@id="RankingList"]/div[2]/table/tbody')

        dic_ranking['date'].append(date)
        dic_ranking['draw'].append(draw)
        dic_ranking['top_100'].append(top_100.text)

        # Pause between requests is currently disabled:
        #time.sleep(random.randint(3,5))

In [16]:
# One row per scraped page: its date, its draw and the table text as one string.
delta = pd.DataFrame(dic_ranking)
delta

Unnamed: 0,date,draw,top_100
0,2022-12-26,MS,1 Viktor AXELSEN MS 108056\n2 LEE Zii Jia MS 7...
1,2022-12-26,WS,1 Akane YAMAGUCHI WS 103613\n2 TAI Tzu Ying WS...
2,2022-12-26,MD,1 Fajar ALFIAN / Muhammad Rian ARDIANTO MD 886...
3,2022-12-26,WD,1 CHEN Qingchen / JIA Yifan WD 105816\n2 Nami ...
4,2022-12-26,XD,1 ZHENG Siwei / HUANG Yaqiong XD 109600\n2 Dec...
...,...,...,...
90,2022-08-22,MS,1 Viktor AXELSEN MS 121606\n2 Kento MOMOTA MS ...
91,2022-08-22,WS,1 Akane YAMAGUCHI WS 111013\n2 TAI Tzu Ying WS...
92,2022-08-22,MD,1 Marcus Fernaldi GIDEON / Kevin Sanjaya SUKAM...
93,2022-08-22,WD,1 CHEN Qingchen / JIA Yifan WD 108666\n2 Yuki ...


In [17]:
# Save the raw scraped text (row index included) before any cleaning is applied.
delta.to_csv('export_delta_top100_bwf.csv')

## CLEANING DATA

In [18]:
def prep_data(df):
    """Turn scraped ranking-table text into one tidy row per ranked entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date``, ``draw`` and ``top_100``. Each
        ``top_100`` value is the text of a whole ranking table: one entry per
        line, tokens separated by single spaces.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``draw``, ``rank``, ``points``, ``country`` and
        ``name`` on a fresh 0..n-1 index. ``rank`` and ``points`` are integers;
        ``country`` and ``name`` are upper-cased.

    Raises
    ------
    ValueError
        If the first or last token of a line cannot be converted to an integer.

    Notes
    -----
    Tokens are read purely by position: first = rank, last = points, third
    from the end = country, everything between the rank and the country = name
    (the second-to-last token is ignored).

    NOTE(review): the table text scraped in this notebook appears to have the
    form ``<rank> <name ...> <draw> <points>`` with no country token. With that
    layout ``country`` receives the last word of the name and ``name`` loses
    it. Confirm the page layout before merging the result with historic data.
    """
    # One row per table line. explode() copes with tables of different lengths
    # and keeps date/draw as ordinary columns, avoiding a 2-D index.
    lines = (
        df[['date', 'draw']]
        .assign(content=df['top_100'].str.split('\n'))
        .explode('content')
        .dropna(subset=['content'])
        .reset_index(drop=True)
    )

    # Tokenise each line once and pick the fields by position.
    tokens = lines['content'].str.split(' ')

    country = (
        tokens.str[-3].str.upper()
        # Two-word countries are cut to their last word by the positional
        # parse; restore the full names.
        .str.replace('KONG', 'HONG KONG', regex=False)
        .str.replace('TAIPEI', 'CHINESE TAIPEI', regex=False)
    )

    name = (
        tokens.str[1:-3].str.join(' ')
        .str.replace(',', ' ', regex=False)
        .str.upper()
        # Counterpart of the 'CHINESE TAIPEI' fix above: drop the country word
        # that ended up inside the name.
        .str.replace('CHINESE', '', regex=False)
    )

    df = pd.DataFrame({
        'date': lines['date'],
        'draw': lines['draw'],
        'rank': tokens.str[0].astype('int'),
        'points': tokens.str[-1].astype('int'),
        'country': country,
        'name': name,
    })

    # info() prints its own summary and returns None, so it is not passed to display().
    df.info()
    display(df.head())

    return df

In [19]:
# Clean the freshly scraped pages; prep_data also shows an info summary and a preview.
delta_prep = prep_data(delta)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9500 entries, 0 to 9499
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     9500 non-null   object
 1   draw     9500 non-null   object
 2   rank     9500 non-null   int32 
 3   points   9500 non-null   int32 
 4   country  9500 non-null   object
 5   name     9500 non-null   object
dtypes: int32(2), object(4)
memory usage: 371.2+ KB


None

Unnamed: 0,date,draw,rank,points,country,name
0,2022-12-26,MS,1,108056,AXELSEN,VIKTOR
1,2022-12-26,MS,2,79468,JIA,LEE ZII
2,2022-12-26,MS,3,77394,YEW,LOH KEAN
3,2022-12-26,MS,4,75812,CHRISTIE,JONATAN
4,2022-12-26,MS,5,75749,GINTING,ANTHONY SINISUKA


In [20]:
# Row/column counts of the new rows and of the historic rows, one per line.
print(delta_prep.shape, data_old.shape, sep='\n')

(9500, 6)
(829000, 6)


In [21]:
# Stack the new rows on top of the historic ones and renumber the index from 0.
# NOTE(review): in the previews shown in this notebook the new rows carry a
# surname in `country` and only part of the name in `name`, while the historic
# rows carry the nation and the full name. Confirm the two sources are
# consistent before relying on the combined table.
data_full = pd.concat([delta_prep, data_old], ignore_index=True)

In [22]:
# Preview the combined table; the newly scraped rows come first.
data_full.head()

Unnamed: 0,date,draw,rank,points,country,name
0,2022-12-26,MS,1,108056,AXELSEN,VIKTOR
1,2022-12-26,MS,2,79468,JIA,LEE ZII
2,2022-12-26,MS,3,77394,YEW,LOH KEAN
3,2022-12-26,MS,4,75812,CHRISTIE,JONATAN
4,2022-12-26,MS,5,75749,GINTING,ANTHONY SINISUKA


In [23]:
# Overwrite the file that was read at the start of the notebook. The row index
# is written as an unnamed first column, which the loading cell drops again as
# 'Unnamed: 0' on the next run.
data_full.to_csv('bwf_historic_rankings.csv')