# IMPORTING LIBRARIES

In [1]:
import pandas as pd
from selenium import webdriver
import time
import datetime
#
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
#
import bs4
import urllib
#
import random

# SCRAPING RANKING DATA

### IMPORTING LAST DATA

In [2]:
# Load the rankings accumulated by previous runs.
# The export at the end of the ranking section writes the frame together with
# its index, which read_csv surfaces as an 'Unnamed: 0' column. Drop it when it
# is there, but do not fail on a file that was saved without an index column.
data_old = (
    pd.read_csv('bwf_historic_rankings.csv')
    .drop(columns='Unnamed: 0', errors='ignore')
)

In [3]:
# Most recent ranking date already present in the historic file.
# NOTE(review): the dates look like 'YYYY-MM-DD' strings, in which case the
# lexical maximum is also the latest date - confirm against the CSV.
last_fetched_date = data_old.loc[:, 'date'].max()
print(last_fetched_date)

2023-10-23


In [5]:
# Start a Selenium-controlled Chrome session; the following cells reuse `driver`.
driver = webdriver.Chrome()

In [7]:
# Load the rankings landing page, then maximise the browser window.
driver.get('https://badmintonstatistics.net/Rankings')
driver.maximize_window()

In [9]:
# Wrap the ranking-period dropdown so its options can be driven from Python.
select = Select(driver.find_element(By.XPATH, '//*[@id="rankingPeriodsSelect"]'))
# Select by position (not by visible text): index 0 is the first option in the
# dropdown. The index is passed as an int, as the Select API expects.
select.select_by_index(0)

In [10]:
# Label of the first option in the ranking-period dropdown, with '/' swapped
# for '-', so it can be eyeballed against the historic data.
first_period_option = driver.find_element(By.XPATH, '//*[@id="rankingPeriodsSelect"]/option[1]')
last_current_week = first_period_option.text.replace('/', '-')
last_current_week

'23-10-2023'

In [10]:
# Column-oriented accumulator filled by the scraping loop: one entry per
# (date, draw) page, each column starting as its own empty list.
dic_ranking = {column: [] for column in ('date', 'draw', 'top_100', 'country')}

In [11]:
# Read every option label of the ranking-period dropdown (one per line of the
# element's text) and rearrange the character slices of each label from
# day/month/year order into 'YYYY-MM-DD'.
date_listing = [
    f"{label[6:10]}-{label[3:5]}-{label[0:2]}"
    for label in (
        driver.find_element(By.XPATH, '//*[@id="rankingPeriodsSelect"]')
        .text.replace(' ', '').replace('/', '-').split('\n')
    )
]
date_listing[:5]

['2023-10-23', '2023-10-16', '2023-10-09', '2023-10-02', '2023-09-25']

In [12]:
# Keep only the ranking dates strictly newer than what the historic file
# already holds, then show both ends of the resulting list.
date_listing_filtered = [week for week in date_listing if last_fetched_date < week]
display(date_listing_filtered[:5], date_listing_filtered[-5:])

['2023-10-23', '2023-10-16', '2023-10-09', '2023-10-02', '2023-09-25']

['2023-08-21', '2023-08-14', '2023-08-07', '2023-07-31', '2023-07-24']

### SCRAPING DELTA

In [13]:
# Scrape the ranking table for every missing date and each of the five draws.
# Each page adds one entry per column to `dic_ranking`: the date, the draw, the
# raw text of the table body (one ranking line per row) and the list of flag
# image URLs in row order.
for date in date_listing_filtered:

    for draw in ['MS','WS','MD','WD','XD']:

        driver.get('https://badmintonstatistics.net/Rankings?date=' + date + '&category=' + draw + '&country=%&page=1&pagesize=100')

        top_100 = driver.find_element(By.XPATH, '//*[@id="RankingList"]/div[2]/table/tbody')

        # Walk the rows that are actually present rather than assuming exactly
        # 100 of them: a shorter page no longer raises NoSuchElementException
        # on the first missing row, and each flag is looked up relative to its
        # own row instead of re-evaluating the absolute XPath from the document
        # root for every rank.
        countries = [
            row.find_element(By.XPATH, './td[1]/img[1]').get_attribute('src')
            for row in top_100.find_elements(By.XPATH, './tr')
        ]

        dic_ranking['date'].append(date)
        dic_ranking['draw'].append(draw)
        dic_ranking['top_100'].append(top_100.text)
        dic_ranking['country'].append(countries)

        #time.sleep(random.randint(3,5))

In [14]:
# One row per scraped (date, draw) page, built from the column-oriented dict.
delta = pd.DataFrame(dic_ranking)
delta

Unnamed: 0,date,draw,top_100,country
0,2023-10-23,MS,Viktor AXELSEN 1 MS 107455\nAnthony Sinisuka G...,[https://badmintonstatistics.net/img/flags/Den...
1,2023-10-23,WS,Se Young AN 1 WS 113314\nAkane YAMAGUCHI 2 WS ...,[https://badmintonstatistics.net/img/flags/Kor...
2,2023-10-23,MD,Satwiksairaj RANKIREDDY / Chirag SHETTY 1 MD 9...,[https://badmintonstatistics.net/img/flags/Ind...
3,2023-10-23,WD,CHEN Qingchen / JIA Yifan 1 WD 113104\nBAEK Ha...,[https://badmintonstatistics.net/img/flags/Chi...
4,2023-10-23,XD,ZHENG Siwei / HUANG Yaqiong 1 XD 114256\nYuta ...,[https://badmintonstatistics.net/img/flags/Chi...
...,...,...,...,...
65,2023-07-24,MS,Viktor AXELSEN 1 MS 101205\nAnthony Sinisuka G...,[https://badmintonstatistics.net/img/flags/Den...
66,2023-07-24,WS,Akane YAMAGUCHI 1 WS 104517\nSe Young AN 2 WS ...,[https://badmintonstatistics.net/img/flags/Jap...
67,2023-07-24,MD,Fajar ALFIAN / Muhammad Rian ARDIANTO 1 MD 916...,[https://badmintonstatistics.net/img/flags/Ind...
68,2023-07-24,WD,CHEN Qingchen / JIA Yifan 1 WD 103254\nBAEK Ha...,[https://badmintonstatistics.net/img/flags/Chi...


In [15]:
# Save the raw scrape to disk. The `country` cells are Python lists (built in
# the scraping loop), so the CSV stores their string representation.
delta.to_csv('export_delta_top100_bwf.csv')

### CLEANING DATA

In [16]:
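# Optional: uncomment to resume from the exported CSV instead of re-scraping.
# Note that after a CSV round trip the `country` column holds strings rather
# than lists, so it must be parsed back before the next cell can flatten it.
#delta = pd.read_csv('export_delta_top100_bwf.csv')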

In [17]:
# Flatten the per-page lists of flag URLs into one list and reduce each URL to
# its file name without the '.png' suffix, turning URL-encoded spaces ('%20')
# into '-'.
flat_list = [
    flag_url.rsplit('/', 1)[-1].replace('.png', '').replace('%20', '-')
    for page_flags in delta['country'].to_list()
    for flag_url in page_flags
]

In [18]:
# Split each page's text block into one column per line, then stack so that
# every ranking line becomes its own row.
# NOTE(review): a two-column DataFrame is passed as `index`; the next cell reads
# positions 0 and 1 of the resulting 'date-draw' values as date and draw -
# confirm this still holds on the installed pandas version.
delta_prep = pd.DataFrame(delta.top_100.str.split('\n').tolist(), index=delta[['date','draw']]).stack().reset_index()
# Attach the flag names side by side. Nothing here checks that flat_list has the
# same length and order as the stacked rows; the pairing relies on it.
delta_prep = pd.concat([delta_prep, pd.DataFrame(flat_list)], axis = 1)
# Temporary names: 'date-draw', 'to_drop' and 'to_split' are removed again by
# the cleaning cell that follows.
delta_prep.columns = ['date-draw', 'to_drop', 'to_split', 'country']

In [19]:
# Turn every raw ranking line into typed columns.
# A line is split on single spaces; its last three tokens are rank, draw and
# points, and everything before them is the player / pair name.
# The line is tokenised once and all derived columns are built in a single
# `assign`, so a failing int cast no longer leaves `delta_prep` half-converted.
# `draw` is taken from the line itself (the value that previously overwrote the
# one read from 'date-draw').
tokens = delta_prep['to_split'].str.split(' ')

delta_prep = (
    delta_prep
    .assign(
        date=delta_prep['date-draw'].str[0],
        draw=tokens.str[-2],
        rank=tokens.str[-3].astype('int'),
        points=tokens.str[-1].astype('int'),
        # Re-join the name tokens; commas are still replaced by spaces so new
        # rows match the rows already stored in the historic file.
        name=tokens.str[:-3].str.join(' ').str.replace(',', ' ').str.upper(),
        country=delta_prep['country'].str.upper(),
    )
    .drop(columns=['date-draw', 'to_split', 'to_drop'])
)

delta_prep

Unnamed: 0,country,date,draw,rank,points,name
0,DENMARK,2023-10-23,MS,1,107455,VIKTOR AXELSEN
1,INDONESIA,2023-10-23,MS,2,86611,ANTHONY SINISUKA GINTING
2,JAPAN,2023-10-23,MS,3,82015,KODAI NARAOKA
3,THAILAND,2023-10-23,MS,4,79778,KUNLAVUT VITIDSARN
4,CHINA,2023-10-23,MS,5,78548,LI SHIFENG
...,...,...,...,...,...,...
6995,CHINA,2023-07-24,XD,96,11180,GUO XINWA / ZHANG SHUXIAN
6996,GERMANY,2023-07-24,XD,97,11160,MALIK BOURAKKADI / LEONA MICHALSKI
6997,INDIA,2023-07-24,XD,98,11100,SAIPRATHEEK. K KRISHNAPRASAD / TANISHA CRASTO
6998,MEXICO,2023-07-24,XD,99,11048,LUIS ARMANDO MONTOYA NAVARRO / MIRIAM JACQUELI...


### MERGING DATA

In [20]:
# Put the freshly scraped rows in front of the historic ones and renumber the
# index from zero.
data_full = pd.concat(objs=(delta_prep, data_old), axis=0, ignore_index=True)

In [21]:
# Quick look at the first five merged rows.
data_full.head(n=5)

Unnamed: 0,country,date,draw,rank,points,name
0,DENMARK,2023-10-23,MS,1,107455,VIKTOR AXELSEN
1,INDONESIA,2023-10-23,MS,2,86611,ANTHONY SINISUKA GINTING
2,JAPAN,2023-10-23,MS,3,82015,KODAI NARAOKA
3,THAILAND,2023-10-23,MS,4,79778,KUNLAVUT VITIDSARN
4,CHINA,2023-10-23,MS,5,78548,LI SHIFENG


In [22]:
# Overwrite the historic file with the merged frame. The index is written as
# well, which is why the loading cell drops an 'Unnamed: 0' column.
data_full.to_csv('bwf_historic_rankings.csv')

### DATA QUALITY CHECKS

In [23]:
# Row count per ranking date; the output below shows 500 for every date.
data_full.loc[:, 'date'].value_counts()

date
2023-10-23    500
2000-10-23    500
2000-11-06    500
2000-11-13    500
2000-11-20    500
             ... 
2012-01-09    500
2012-01-16    500
2012-01-23    500
2012-01-30    500
1990-01-01    500
Name: count, Length: 1720, dtype: int64

In [24]:
# Row count per rank; the output below shows the same count for every rank.
data_full.loc[:, 'rank'].value_counts()

rank
1      8600
64     8600
74     8600
73     8600
72     8600
       ... 
31     8600
30     8600
29     8600
28     8600
100    8600
Name: count, Length: 100, dtype: int64

## SCRAPING TOURNAMENT DATA

In [70]:
# Scrape the BWF World Tour calendar for 2018-2023 into `l_final`, one list of
# five text lines per tournament.
# NOTE(review): this starts another Chrome session; nothing visible in the
# notebook closes the session opened for the rankings section.
driver = webdriver.Chrome()
l_final = []
months = ['JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER']

for year in range(2018, 2024):

    driver.get('https://bwfworldtour.bwfbadminton.com/calendar/?cyear='+str(year))

    # Calendar text, one entry per line, with the month-heading lines removed.
    l_tour = driver.find_element(By.XPATH, '//*[@id="page"]/div/div[1]/div/div/div/div').text.split('\n')
    l_tour = [x for x in l_tour if x not in months]

    # Cut the lines into consecutive groups of five (per the output below:
    # name, dates, prize money, city, country). Iterating over start offsets
    # keeps the final group when the line count is an exact multiple of five;
    # the previous `range(5, len(l_tour), 5)` stopped one group short in that
    # case. A trailing incomplete group is still skipped.
    for start in range(0, len(l_tour) - 4, 5):

        l_final.append(l_tour[start:start + 5])

In [71]:
# One row per tournament, one positional column per scraped text line.
df = pd.DataFrame(data=l_final)
df

Unnamed: 0,0,1,2,3,4
0,PRINCESS SIRIVANNAVARI Thailand Masters 2018,09 - 14 JANUARY,"PRIZE MONEY USD $150,000",Bangkok,Thailand
1,PERODUA Malaysia Masters 2018,16 - 21 JANUARY,"PRIZE MONEY USD $350,000",Bukit Jalil,Malaysia
2,DAIHATSU Indonesia Masters 2018,23 - 28 JANUARY,"PRIZE MONEY USD $350,000",Jakarta,Indonesia
3,YONEX-SUNRISE DR. AKHILESH DAS GUPTA India Ope...,30 JANUARY - 04 FEBRUARY,"PRIZE MONEY USD $350,000",New Delhi,India
4,YONEX Swiss Open 2018,20 - 25 FEBRUARY,"PRIZE MONEY USD $150,000",Basel,Switzerland
...,...,...,...,...,...
106,SimInvest Indonesia Open 2021(New dates),23 - 28 NOVEMBER,"PRIZE MONEY USD $850,000",Bali,Indonesia
107,Korea Masters 2023,07 - 12 NOVEMBER,"PRIZE MONEY USD $210,000",Gwangju,Korea
108,Kumamoto Masters Japan 2023,14 - 19 NOVEMBER,"PRIZE MONEY USD $420,000",Kumamoto,Japan
109,China Masters 2023,21 - 26 NOVEMBER,"PRIZE MONEY USD $1,000,000",Shenzhen,China


In [41]:
# Print the target of every link on the current page whose URL contains
# "tournament".
lnks = driver.find_elements(By.TAG_NAME, "a")

for lnk in lnks:
    # Read the attribute once. Anchors without an href yield None, which would
    # make the `in` test raise TypeError, so those are skipped.
    href = lnk.get_attribute('href')
    if href and "tournament" in href:
        print(href)

https://bwfworldtour.bwfbadminton.com/tournament/3139/princess-sirivannavari-thailand-masters-2018
https://bwfworldtour.bwfbadminton.com/tournament/3143/perodua-malaysia-masters-2018
https://bwfworldtour.bwfbadminton.com/tournament/3140/daihatsu-indonesia-masters-2018
https://bwfworldtour.bwfbadminton.com/tournament/3170/yonex-sunrise-dr-akhilesh-das-gupta-india-open-2018
https://bwfworldtour.bwfbadminton.com/tournament/3144/yonex-swiss-open-2018
https://bwfworldtour.bwfbadminton.com/tournament/3146/yonex-german-open-2018
https://bwfworldtour.bwfbadminton.com/tournament/3141/yonex-all-england-open-2018
https://bwfworldtour.bwfbadminton.com/tournament/3147/barfoot-thompson-new-zealand-open-2018
https://bwfworldtour.bwfbadminton.com/tournament/3148/crown-group-australian-open-2018
https://bwfworldtour.bwfbadminton.com/tournament/3216/2018-yonex-us-open
https://bwfworldtour.bwfbadminton.com/tournament/3142/celcom-axiata-malaysia-open-2018
https://bwfworldtour.bwfbadminton.com/tournament/3