# Batting Analysis of The World Cup 2019
> Inspired by some graphics made by [@SAAdvantage](https://twitter.com/SAAdvantage)

In [2]:
# Imports
import requests
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import re

Define the URL of the summary page that lists the overall batting stats for the 50 highest-scoring batsmen at the 2019 World Cup, then download and parse it.

In [3]:
# Records page for most runs in a tournament (id=12357 selects the tournament).
url = 'http://stats.espncricinfo.com/ci/engine/records/batting/most_runs_career.html?id=12357;type=tournament'

# Fail fast: the timeout stops the cell hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page being parsed as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, features="html.parser")

In [4]:
# Find the 'engineTable' <table> whose <caption> reads exactly 'Most runs'.
# Bind main_table up front so a failed lookup is detected here instead of
# silently reusing a stale value left over from an earlier run of this cell.
main_table = None
for caption in soup.find_all('caption'):
    if caption.get_text() != 'Most runs':
        continue
    table = caption.find_parent('table', {'class': 'engineTable'})
    if table is not None:
        main_table = table
        break

# Without this check a changed page layout would only surface further down
# as a NameError/AttributeError, far away from the real cause.
if main_table is None:
    raise ValueError("Could not find the 'Most runs' table on the page")

In [8]:
def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Explicit replacement for ``pd.to_numeric(..., errors='ignore')``, which is
    deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Mixed columns (e.g. HS holds values such as '124*') stay as strings.
        return column


# Column names come from the <th> cells of the first header row.
header_row = main_table.find('thead').find_all('tr')[0]
columns = [header.get_text() for header in header_row.find_all('th')]

# One list of cell texts per <tr> in the table body.
rows = [
    [stat.get_text() for stat in innings.find_all('td')]
    for innings in main_table.find('tbody').find_all('tr')
]

# Everything is scraped as text; convert each column to numbers where possible.
df_raw = pd.DataFrame(rows, columns=columns).apply(_to_numeric_or_keep)

In [10]:
# Preview the first five scraped rows.
df_raw.head(n=5)

Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
0,RG Sharma (INDIA),9,9,1,648,140,81.0,659,98.33,5,1,0,67,14
1,DA Warner (AUS),10,10,1,647,166,71.88,724,89.36,3,3,0,66,8
2,Shakib Al Hasan (BDESH),8,8,1,606,124*,86.57,631,96.03,2,5,0,60,2
3,KS Williamson (NZ),10,9,2,578,148,82.57,771,74.96,2,2,0,50,3
4,JE Root (ENG),11,11,2,556,107,61.77,621,89.53,2,3,0,48,2


We don't need every column for this analysis, so we keep only the ones listed below.

In [18]:
# Work on an independent copy of a subset of columns so that later edits
# never touch df_raw.
df_trim = df_raw[
    ['Player', 'Runs', 'BF', 'SR', '4s', '6s']
].copy()

I'm going to define a couple of cleaning functions to tidy up the 'Player' column and split the country out into its own column.

In [36]:
# Greedy prefix so that the LAST parenthesised group in the string is captured.
_COUNTRY_RE = re.compile(r'.*\(([^()]*)\)')


def country(player_string):
    """Extract the team tag from a player label such as 'RG Sharma (INDIA)'.

    Parameters
    ----------
    player_string : str
        Player label ending in a parenthesised team tag.

    Returns
    -------
    str
        The text inside the last pair of parentheses, e.g. 'INDIA'. The tag
        is returned verbatim, so a combined tag like 'ICC/INDIA' is preserved
        rather than rejected.

    Raises
    ------
    ValueError
        If the string contains no parenthesised tag. Previously this surfaced
        as an opaque AttributeError on ``None``.
    """
    match = _COUNTRY_RE.search(player_string)
    if match is None:
        raise ValueError(
            "No '(TEAM)' tag found in player string: {!r}".format(player_string))
    return match.group(1)

# Name = everything before the trailing '(TEAM)' tag. The name part is matched
# with '.*?' rather than a letters-only class so that hyphens, apostrophes and
# other punctuation in names are kept.
_PLAYER_NAME_RE = re.compile(r'^\s*(.*?)\s*\([^()]*\)\s*$')


def clean_player(player_string):
    """Strip the trailing '(TEAM)' tag from a player label.

    Parameters
    ----------
    player_string : str
        Player label such as 'RG Sharma (INDIA)'.

    Returns
    -------
    str
        The player name without the team tag or surrounding whitespace,
        e.g. 'RG Sharma'. Names containing non-letter characters are returned
        in full ('Imam-ul-Haq (PAK)' -> 'Imam-ul-Haq'); the previous
        letters-and-spaces pattern silently truncated that to 'Haq'.

    Raises
    ------
    ValueError
        If the string does not end with a parenthesised tag. Previously this
        surfaced as an opaque AttributeError on ``None``.
    """
    match = _PLAYER_NAME_RE.search(player_string)
    if match is None:
        raise ValueError(
            "No '(TEAM)' tag found in player string: {!r}".format(player_string))
    return match.group(1)

In [38]:
# Derive both columns from the untouched labels in df_raw rather than from
# df_trim.Player: this cell overwrites df_trim.Player, so reading it back would
# make a second run of the cell fail (the '(TEAM)' suffix would already be gone).
# df_trim was copied from df_raw, so the two share an index and align row by row.
df_trim['Country'] = df_raw['Player'].apply(country)
df_trim['Player'] = df_raw['Player'].apply(clean_player)

In [39]:
# Show the trimmed frame: Player now holds the bare name and Country the team tag.
df_trim

Unnamed: 0,Player,Runs,BF,SR,4s,6s,Country
0,RG Sharma,648,659,98.33,67,14,INDIA
1,DA Warner,647,724,89.36,66,8,AUS
2,Shakib Al Hasan,606,631,96.03,60,2,BDESH
3,KS Williamson,578,771,74.96,50,3,NZ
4,JE Root,556,621,89.53,48,2,ENG
5,JM Bairstow,532,573,92.84,67,11,ENG
6,AJ Finch,507,497,102.01,47,18,AUS
7,Babar Azam,474,540,87.77,50,2,PAK
8,BA Stokes,465,499,93.18,38,11,ENG
9,JJ Roy,443,384,115.36,51,12,ENG
