In [2]:
import pandas as pd
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.common.exceptions import NoSuchElementException

import bs4
import urllib


import random

## SCRAPPING DATA

In [3]:
driver = webdriver.Chrome()

driver.get('https://badmintonstatistics.net/Rankings?date=2021-10-25&category=MS&country=%&page=1&pagesize=100')
driver.maximize_window()

In [4]:
dic_ranking = {'date':[], 'draw':[], 'top_100':[]}

In [53]:
date_listing = driver.find_element_by_xpath('//*[@id="rankingPeriodsSelect"]').text
date_listing = date_listing.replace(' ','').replace('/','-').split('\n')

for date in date_listing[1000:]:
    
    temp_date = date[6:10]+'-'+date[3:5]+'-'+date[0:2]

    for draw in ['MS','WS','MD','WD','XD']:
        
        driver.get('https://badmintonstatistics.net/Rankings?date='+temp_date+'&category='+draw+'&country=%&page=1&pagesize=100')

        top_100 = driver.find_elements_by_xpath('//*[@id="RankingList"]/div[2]/table/tbody')

        dic_ranking['date'].append(temp_date)
        dic_ranking['draw'].append(draw)
        dic_ranking['top_100'].append(top_100[0].text)

        #time.sleep(random.randint(3,5))

In [54]:
top_100 = pd.DataFrame.from_dict(dic_ranking)

In [56]:
top_100.to_csv('export_histo_top100_badminton.csv')

## LOADING DATA

In [74]:
#checking last scraped data
data = pd.read_csv(".\export_histo_top100_badminton.csv")

In [75]:
data.head()

Unnamed: 0.1,Unnamed: 0,date,draw,top_100
0,0,2021-10-25,MS,Rank Players Country Category Points\n1 Kento ...
1,1,2021-10-25,WS,Rank Players Country Category Points\n1 TAI Tz...
2,2,2021-10-25,MD,Rank Players Country Category Points\n1 Marcus...
3,3,2021-10-25,WD,Rank Players Country Category Points\n1 Yuki F...
4,4,2021-10-25,XD,Rank Players Country Category Points\n1 ZHENG ...


## FORMATING DATA

In [102]:
def prep_data(df):

    df = df.drop('Unnamed: 0', axis = 1)

    df = pd.DataFrame(df.top_100.str.split('\n').tolist(), index=df[['date','draw']]).stack().reset_index()
    
    df.columns = ['date / draw', 'to_drop', 'content']

    df = df[df['to_drop'] != 0].drop('to_drop', axis = 1)

    df['date'] = df['date / draw'].str[0]
    df['draw'] = df['date / draw'].str[1]
    df['rank'] = df['content'].str.split(' ', 1).str[0]
    df['points'] = df['content'].str.split(' ').str[-1]
    df['country'] = df['content'].str.split(' ').str[-3].str.upper()
    df['name'] = df['content'].str.split(' ').str[1:-3]

    df['country'] = df['country'].str.replace('KONG','HONG KONG')
    df['country'] = df['country'].str.replace('TAIPEI','CHINESE TAIPEI')
    
    df['name'] = df['name'].apply(lambda x: ','.join(map(str, x)))
    df['name'] = df['name'].str.replace(',',' ').str.upper()
    df['name'] = df['name'].str.replace('CHINESE','')
    
    df['points'] = df['points'].astype('int')
    
    df['rank'] = df['rank'].astype('int')

    df = df.drop(['date / draw','content'], axis = 1)
    
    print(df.info())
    print(df.head())
    
    return df

In [103]:
data_prep = prep_data(data)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 808000 entries, 1 to 816079
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   date     808000 non-null  object
 1   draw     808000 non-null  object
 2   rank     808000 non-null  int32 
 3   points   808000 non-null  int32 
 4   country  808000 non-null  object
 5   name     808000 non-null  object
dtypes: int32(2), object(4)
memory usage: 37.0+ MB
None
         date draw  rank  points         country                      name
1  2021-10-25   MS     1  109118           JAPAN              KENTO MOMOTA
2  2021-10-25   MS     2  103786         DENMARK            VIKTOR AXELSEN
3  2021-10-25   MS     3   94875         DENMARK           ANDERS ANTONSEN
4  2021-10-25   MS     4   89828  CHINESE TAIPEI           CHOU TIEN CHEN 
5  2021-10-25   MS     5   85332       INDONESIA  ANTHONY SINISUKA GINTING


In [96]:
data_prep['country'].value_counts().head(10)

INDONESIA         68050
CHINA             67326
DENMARK           65930
MALAYSIA          50995
KOREA             47178
JAPAN             45130
ENGLAND           44912
CHINESE TAIPEI    31760
GERMANY           29778
THAILAND          27756
Name: country, dtype: int64

In [94]:
data_prep['name'].value_counts().head(10)

LIN DAN             1011
BOONSAK PONSANA      980
PETER HOEG GADE      952
LEE CHONG WEI        935
LEE HYUN IL          928
SONY DWI KUNCORO     891
CHARMAINE REID       884
ZHANG NING           881
PETYA NEDELCHEVA     872
JURGEN KOCH          865
Name: name, dtype: int64

In [99]:
data_prep[data_prep['rank'] == '1']['name'].value_counts().head(10)

LEE CHONG WEI                                       400
ZHANG NAN / ZHAO YUNLEI                             244
LIN DAN                                             212
SUSI SUSANTI                                        189
MARCUS FERNALDI GIDEON / KEVIN SANJAYA SUKAMULJO    179
GE FEI / GU JUN                                     178
TAI TZU YING                                        177
GAO LING / HUANG SUI CHINA /                        170
YANG WEI / ZHANG JIEWEN                             161
ARDY B WIRANATA                                     157
Name: name, dtype: int64

In [106]:
data_prep.sort_values(by='points', ascending=False).head(1)

Unnamed: 0,date,draw,rank,points,country,name
32725,2019-09-23,XD,1,115902,CHINA,ZHENG SIWEI / HUANG YAQIONG
