In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page
url = "https://www.espncricinfo.com/records/tournament/team-match-results/icc-cricket-world-cup-2023-24-15338"

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table containing the match results
table = soup.find('table')

    # Extract data from the table
data = []
for row in table.find_all('tr'):
        row_data = [td.text.strip() for td in row.find_all('td')]
        # Extract link information if present
        links = row.find_all('a')
        if links:
            row_data.append(links[1]['href'])  # link is in the second column
        else:
            row_data.append("")  # If no link is present
        data.append(row_data)

# Create a DataFrame
df = pd.DataFrame(data, columns=['Team 1', 'Team 2', 'Winner', 'Margin', 'Ground', 'Match Date', 'Scorecard', 'Link'])
df = df[1:][:]
df = df.reset_index()
df['Link'] = 'https://www.espncricinfo.com' + df['Link']  #update link url

In [2]:
df.head()

Unnamed: 0,index,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Link
0,1,India,Australia,Australia,6 wickets,Ahmedabad,"Nov 19, 2023",ODI # 4705,https://www.espncricinfo.com/series/icc-cricke...
1,2,Australia,South Africa,Australia,3 wickets,Eden Gardens,"Nov 16, 2023",ODI # 4704,https://www.espncricinfo.com/series/icc-cricke...
2,3,India,New Zealand,India,70 runs,Wankhede,"Nov 15, 2023",ODI # 4703,https://www.espncricinfo.com/series/icc-cricke...
3,4,India,Netherlands,India,160 runs,Bengaluru,"Nov 12, 2023",ODI # 4702,https://www.espncricinfo.com/series/icc-cricke...
4,5,England,Pakistan,England,93 runs,Eden Gardens,"Nov 11, 2023",ODI # 4701,https://www.espncricinfo.com/series/icc-cricke...


In [3]:
df.tail()

Unnamed: 0,index,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Link
43,44,India,Australia,India,6 wickets,Chennai,"Oct 8, 2023",ODI # 4662,https://www.espncricinfo.com/series/icc-cricke...
44,45,South Africa,Sri Lanka,South Africa,102 runs,Delhi,"Oct 7, 2023",ODI # 4661,https://www.espncricinfo.com/series/icc-cricke...
45,46,Afghanistan,Bangladesh,Bangladesh,6 wickets,Dharamsala,"Oct 7, 2023",ODI # 4660,https://www.espncricinfo.com/series/icc-cricke...
46,47,Netherlands,Pakistan,Pakistan,81 runs,Hyderabad,"Oct 6, 2023",ODI # 4659,https://www.espncricinfo.com/series/icc-cricke...
47,48,England,New Zealand,New Zealand,9 wickets,Ahmedabad,"Oct 5, 2023",ODI # 4658,https://www.espncricinfo.com/series/icc-cricke...


In [148]:
df['Link'][14]

'https://www.espncricinfo.com/series/icc-cricket-world-cup-2023-24-1367856/afghanistan-vs-netherlands-34th-match-1384425/full-scorecard'

In [151]:
output_path = 'D:/final_output.json'

with open(output_path, 'w') as json_file:
    json.dump(final, json_file, indent=4)

print(f'Data saved to {output_path}')

Data saved to D:/final_output.json


In [153]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Assuming df is your DataFrame with the links

all_batting_summary = []

for link in df['Link']:
    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.select('div > table.ci-scorecard-table')
    first_inning_rows = tables[0].select('tbody > tr:has(td:nth-child(8))')
    second_inning_rows = tables[1].select('tbody > tr:has(td:nth-child(8))')
    team1 = soup.find_all('span', class_='ds-text-title-xs')[1].text.strip()
    team2 = soup.find_all('span', class_='ds-text-title-xs')[3].text.strip()
    match_info_str = f"{team1} Vs {team2}"

    batting_summary = []

    for index, element in enumerate(first_inning_rows):
        tds = element.find_all('td')
        batting_summary.append({
            "match": match_info_str,
            "teamInnings": team1,
            "battingPos": index + 1,
            "batsmanName": tds[0].find('span').text.strip(),
            "dismissal": tds[1].text.strip(),
            "runs": tds[2].find('strong').text,
            "balls": tds[3].text,
            "4s": tds[5].text,
            "6s": tds[6].text,
            "SR": tds[7].text
        })

    for index, element in enumerate(second_inning_rows):
        tds = element.find_all('td')
        batting_summary.append({
            "match": match_info_str,
            "teamInnings": team2,
            "battingPos": index + 1,
            "batsmanName": tds[0].find('span').text.strip(),
            "dismissal": tds[1].text.strip(),
            "runs": tds[2].find('strong').text,
            "balls": tds[3].text,
            "4s": tds[5].text,
            "6s": tds[6].text,
            "SR": tds[7].text
        })

    all_batting_summary.append({"battingSummary": batting_summary})
output_path = 'D:/batting_summary.json'
# Save the result to a JSON file
with open(output_path, 'w') as json_file:
    json.dump(all_batting_summary, json_file, indent=2, ensure_ascii=False)


In [7]:
df.to_json('D:/match_summary1.json', orient='table', indent=2)

In [155]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Assuming df is your DataFrame with the links

all_bowling_summary = []

for link in df['Link']:
    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.select('div > table.ds-table')
    first_inning_rows = tables[1].select('tbody > tr:has(td:nth-child(11))')
    second_inning_rows = tables[3].select('tbody > tr:has(td:nth-child(11))')
    team1 = soup.find_all('span', class_='ds-text-title-xs')[1].text.strip()
    team2 = soup.find_all('span', class_='ds-text-title-xs')[3].text.strip()
    match_info_str = f"{team1} Vs {team2}"

    bowling_summary = []

    for index, element in enumerate(first_inning_rows):
        tds = element.find_all('td')
        bowling_summary.append({
            "match": match_info_str,
            "bowlingTeam": team2,
            "bowlerName": tds[0].find('a').find('span').text.strip(),
            "overs": tds[1].text.strip(),
            "maiden": tds[2].text.strip(),
            "runs": tds[3].text.strip(),
            "wickets": tds[4].text.strip(),
            "economy": tds[5].text.strip(),
            "0s": tds[6].text.strip(),
            "4s": tds[7].text.strip(),
            "6s": tds[8].text.strip(),
            "wides": tds[9].text.strip(),
            "noBalls": tds[10].text.strip()
        })

    for index, element in enumerate(second_inning_rows):
        tds = element.find_all('td')
        bowling_summary.append({
            "match": match_info_str,
            "bowlingTeam": team1,
            "bowlerName": tds[0].find('a').find('span').text.strip(),
            "overs": tds[1].text.strip(),
            "maiden": tds[2].text.strip(),
            "runs": tds[3].text.strip(),
            "wickets": tds[4].text.strip(),
            "economy": tds[5].text.strip(),
            "0s": tds[6].text.strip(),
            "4s": tds[7].text.strip(),
            "6s": tds[8].text.strip(),
            "wides": tds[9].text.strip(),
            "noBalls": tds[10].text.strip()
        })

    all_bowling_summary.append({"bowlingSummary": bowling_summary})

output_path = 'D:/bowling_summary.json'
# Save the result to a JSON file
with open(output_path, 'w') as json_file:
    json.dump(all_bowling_summary, json_file, indent=2, ensure_ascii=False)


In [156]:
import requests
from bs4 import BeautifulSoup
import json

# -------------- STAGE 1 ------------ #

# Assuming df is your DataFrame with the match summary links

all_players_data = []

for link in df['Link']:
    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables_batting = soup.select('div > table.ci-scorecard-table')
    tables_bowling = soup.select('div > table.ds-table')
    team1 = soup.find_all('span', class_='ds-text-title-xs')[1].text.strip()
    team2 = soup.find_all('span', class_='ds-text-title-xs')[3].text.strip()

    # Extract batting players
    first_inning_rows_batting = tables_batting[0].select('tbody > tr:has(td:nth-child(8))')
    second_inning_rows_batting = tables_batting[1].select('tbody > tr:has(td:nth-child(8))')

    for index, element in enumerate(first_inning_rows_batting):
        tds = element.find_all('td')
        all_players_data.append({
            "name": tds[0].find('a').find('span').text.strip(),
            "team": team1,
            "link": "https://www.espncricinfo.com" + tds[0].find('a')['href']
        })

    for index, element in enumerate(second_inning_rows_batting):
        tds = element.find_all('td')
        all_players_data.append({
            "name": tds[0].find('a').find('span').text.strip(),
            "team": team2,
            "link": "https://www.espncricinfo.com" + tds[0].find('a')['href']
        })

    # Extract bowling players
    first_inning_rows_bowling = tables_bowling[1].select('tbody > tr:has(td:nth-child(11))')
    second_inning_rows_bowling = tables_bowling[3].select('tbody > tr:has(td:nth-child(11))')

    for index, element in enumerate(first_inning_rows_bowling):
        tds = element.find_all('td')
        all_players_data.append({
            "name": tds[0].find('a').find('span').text.strip(),
            "team": team2.replace(" Innings", ""),
            "link": "https://www.espncricinfo.com" + tds[0].find('a')['href']
        })

    for index, element in enumerate(second_inning_rows_bowling):
        tds = element.find_all('td')
        all_players_data.append({
            "name": tds[0].find('a').find('span').text.strip(),
            "team": team1.replace(" Innings", ""),
            "link": "https://www.espncricinfo.com" + tds[0].find('a')['href']
        })

# Save the result to a JSON file
output_path = 'D:/all_players_data.json'
with open(output_path, 'w') as json_file:
    json.dump(all_players_data, json_file, indent=2, ensure_ascii=False)


In [None]:
# Load the JSON file containing player links
with open('D:/all_players_data.json', 'r') as json_file:
    players_data = json.load(json_file)

# Remove duplicate player entries based on 'link'
unique_players_data = [dict(t) for t in {tuple(d.items()) for d in players_data}]

# Save the unique player data to a new JSON file
unique_output_path = 'D:/unique_all_players_data.json'
with open(unique_output_path, 'w') as json_file:
    json.dump(unique_players_data, json_file, indent=2, ensure_ascii=False)

In [36]:
import requests
from bs4 import BeautifulSoup
import json

# Load the JSON file containing player links
with open('D:/unique_all_players_data.json', 'r') as json_file:
    unique_players_data = json.load(json_file)

all_player_info = []

for player in unique_players_data:
    response = requests.get(player['link'])
    soup = BeautifulSoup(response.content, 'html.parser')
    _tags = soup.find_all('p', class_ = "ds-text-tight-m" )
    for tag in _tags:
        if tag.text == 'Batting Style':
            batting_style = tag.find_next('span').text.strip() if tag else ""
        elif tag.text == 'Bowling Style':
            bowling_style = tag.find_next('span').text.strip() if tag else ""
        elif tag.text == 'Playing Role':
            playing_role = tag.find_next('span').text.strip() if tag else ""
    # Get description if available
    description_tag = soup.find('div', class_='ci-player-bio-content')
    description = description_tag.find('p').text.strip() if description_tag else ""

    player_info = {
        "name": player['name'],
        "team": player['team'],
        "battingStyle": batting_style,
        "bowlingStyle": bowling_style,
        "playingRole": playing_role,
        "description": description
    }

    all_player_info.append(player_info)

# Save the result to a JSON file
output_path = 'D:/all_player_info.json'
with open(output_path, 'w') as json_file:
    json.dump(all_player_info, json_file, indent=2, ensure_ascii=False)


In [33]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from selenium import webdriver

# Load the JSON file containing player links
with open('D:/unique_all_players_data.json', 'r') as json_file:
    unique_players_data = json.load(json_file)


In [35]:
unique_players_data[6]

{'name': 'Rachin Ravindra',
 'team': 'New Zealand',
 'link': 'https://www.espncricinfo.com/cricketers/rachin-ravindra-959767'}

In [38]:
pl = unique_players_data[6]['link']
response = requests.get(pl)
soup = BeautifulSoup(response.content, 'html.parser')
div_t = soup.find('div', class_="ds-ml-auto")
print(div_t)
# img_tags = div_t.find_next('img')
# print(img_tags['src'])
# img_links = [tag['src'] for tag in img_tags] if img_tags else []
# print(img_links)

<div class="ds-ml-auto ds-w-48 ds-h-48"><div style="position:relative;padding-bottom:100%;height:0;overflow:hidden"><img alt="Rachin Ravindra" class="" src="https://wassets.hscicdn.com/static/images/lazyimage-transparent.png" style="position:absolute;top:0;left:0;width:100%;height:100%"/></div></div>


In [39]:
pl

'https://www.espncricinfo.com/cricketers/rachin-ravindra-959767'

In [40]:
from selenium import webdriver
from bs4 import BeautifulSoup

# Example player link

# Use Selenium to render JavaScript
driver = webdriver.Chrome()
driver.get(pl)

# Get the page source after JavaScript execution
page_source = driver.page_source

# Close the Selenium WebDriver
driver.quit()

# Parse the page source with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Find the div containing the image
div_container = soup.find('div', class_='ds-ml-auto')

# Find the actual image source within the div
img_tag = div_container.find('img')
actual_img_src = img_tag['src']

print(actual_img_src)


https://img1.hscicdn.com/image/upload/f_auto,t_ds_square_w_640,q_50/lsci/db/PICTURES/CMS/329700/329746.png


In [9]:
import copy

In [11]:
df_match  = df.copy()

In [13]:
df_match.head()

Unnamed: 0,index,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Link
0,1,India,Australia,Australia,6 wickets,Ahmedabad,"Nov 19, 2023",ODI # 4705,https://www.espncricinfo.com/series/icc-cricke...
1,2,Australia,South Africa,Australia,3 wickets,Eden Gardens,"Nov 16, 2023",ODI # 4704,https://www.espncricinfo.com/series/icc-cricke...
2,3,India,New Zealand,India,70 runs,Wankhede,"Nov 15, 2023",ODI # 4703,https://www.espncricinfo.com/series/icc-cricke...
3,4,India,Netherlands,India,160 runs,Bengaluru,"Nov 12, 2023",ODI # 4702,https://www.espncricinfo.com/series/icc-cricke...
4,5,England,Pakistan,England,93 runs,Eden Gardens,"Nov 11, 2023",ODI # 4701,https://www.espncricinfo.com/series/icc-cricke...


In [14]:
df_match.shape

(48, 9)

In [15]:
df_match.rename({'Scorecard': 'match_id'}, axis = 1, inplace = True)
df_match.head()

Unnamed: 0,index,Team 1,Team 2,Winner,Margin,Ground,Match Date,match_id,Link
0,1,India,Australia,Australia,6 wickets,Ahmedabad,"Nov 19, 2023",ODI # 4705,https://www.espncricinfo.com/series/icc-cricke...
1,2,Australia,South Africa,Australia,3 wickets,Eden Gardens,"Nov 16, 2023",ODI # 4704,https://www.espncricinfo.com/series/icc-cricke...
2,3,India,New Zealand,India,70 runs,Wankhede,"Nov 15, 2023",ODI # 4703,https://www.espncricinfo.com/series/icc-cricke...
3,4,India,Netherlands,India,160 runs,Bengaluru,"Nov 12, 2023",ODI # 4702,https://www.espncricinfo.com/series/icc-cricke...
4,5,England,Pakistan,England,93 runs,Eden Gardens,"Nov 11, 2023",ODI # 4701,https://www.espncricinfo.com/series/icc-cricke...


In [16]:
match_ids_dict = {}

for index, row in df_match.iterrows():
    key1 = row['Team 1'] + ' Vs ' + row['Team 2']
    key2 = row['Team 2'] + ' Vs ' + row['Team 1']
    match_ids_dict[key1] = row['match_id']
    match_ids_dict[key2] = row['match_id']

In [17]:
df_match.to_csv('D:/dim_match_summary.csv', index = False)

In [19]:
import json
with open('D:/batting_summary.json') as f:
    data = json.load(f)
    all_records = []
    for rec in data:
        all_records.extend(rec['battingSummary'])
  
df_batting = pd.DataFrame(all_records)
df_batting.head(11)

Unnamed: 0,match,teamInnings,battingPos,batsmanName,dismissal,runs,balls,4s,6s,SR
0,India Vs Australia,India,1,Rohit Sharma (c),c Head b Maxwell,47,31,4,3,151.61
1,India Vs Australia,India,2,Shubman Gill,c Zampa b Starc,4,7,0,0,57.14
2,India Vs Australia,India,3,Virat Kohli,b Cummins,54,63,4,0,85.71
3,India Vs Australia,India,4,Shreyas Iyer,c †Inglis b Cummins,4,3,1,0,133.33
4,India Vs Australia,India,5,KL Rahul †,c †Inglis b Starc,66,107,1,0,61.68
5,India Vs Australia,India,6,Ravindra Jadeja,c †Inglis b Hazlewood,9,22,0,0,40.9
6,India Vs Australia,India,7,Suryakumar Yadav,c †Inglis b Hazlewood,18,28,1,0,64.28
7,India Vs Australia,India,8,Mohammed Shami,c †Inglis b Starc,6,10,1,0,60.0
8,India Vs Australia,India,9,Jasprit Bumrah,lbw b Zampa,1,3,0,0,33.33
9,India Vs Australia,India,10,Kuldeep Yadav,run out (Labuschagne/Cummins),10,18,0,0,55.55


In [23]:
df_batting['out/not_out'] = df_batting.dismissal.apply(lambda x: "out" if (len(x)>0 and x != "not out")  else "not_out")
df_batting.head(11)

Unnamed: 0,match,teamInnings,battingPos,batsmanName,dismissal,runs,balls,4s,6s,SR,out/not_out
0,India Vs Australia,India,1,Rohit Sharma (c),c Head b Maxwell,47,31,4,3,151.61,out
1,India Vs Australia,India,2,Shubman Gill,c Zampa b Starc,4,7,0,0,57.14,out
2,India Vs Australia,India,3,Virat Kohli,b Cummins,54,63,4,0,85.71,out
3,India Vs Australia,India,4,Shreyas Iyer,c †Inglis b Cummins,4,3,1,0,133.33,out
4,India Vs Australia,India,5,KL Rahul †,c †Inglis b Starc,66,107,1,0,61.68,out
5,India Vs Australia,India,6,Ravindra Jadeja,c †Inglis b Hazlewood,9,22,0,0,40.9,out
6,India Vs Australia,India,7,Suryakumar Yadav,c †Inglis b Hazlewood,18,28,1,0,64.28,out
7,India Vs Australia,India,8,Mohammed Shami,c †Inglis b Starc,6,10,1,0,60.0,out
8,India Vs Australia,India,9,Jasprit Bumrah,lbw b Zampa,1,3,0,0,33.33,out
9,India Vs Australia,India,10,Kuldeep Yadav,run out (Labuschagne/Cummins),10,18,0,0,55.55,out


In [24]:
df_batting['match_id'] = df_batting['match'].map(match_ids_dict)
df_batting.head()

Unnamed: 0,match,teamInnings,battingPos,batsmanName,dismissal,runs,balls,4s,6s,SR,out/not_out,match_id
0,India Vs Australia,India,1,Rohit Sharma (c),c Head b Maxwell,47,31,4,3,151.61,out,ODI # 4662
1,India Vs Australia,India,2,Shubman Gill,c Zampa b Starc,4,7,0,0,57.14,out,ODI # 4662
2,India Vs Australia,India,3,Virat Kohli,b Cummins,54,63,4,0,85.71,out,ODI # 4662
3,India Vs Australia,India,4,Shreyas Iyer,c †Inglis b Cummins,4,3,1,0,133.33,out,ODI # 4662
4,India Vs Australia,India,5,KL Rahul †,c †Inglis b Starc,66,107,1,0,61.68,out,ODI # 4662


In [25]:
df_batting.drop(columns=["dismissal"], inplace=True)
df_batting.head(10)

Unnamed: 0,match,teamInnings,battingPos,batsmanName,runs,balls,4s,6s,SR,out/not_out,match_id
0,India Vs Australia,India,1,Rohit Sharma (c),47,31,4,3,151.61,out,ODI # 4662
1,India Vs Australia,India,2,Shubman Gill,4,7,0,0,57.14,out,ODI # 4662
2,India Vs Australia,India,3,Virat Kohli,54,63,4,0,85.71,out,ODI # 4662
3,India Vs Australia,India,4,Shreyas Iyer,4,3,1,0,133.33,out,ODI # 4662
4,India Vs Australia,India,5,KL Rahul †,66,107,1,0,61.68,out,ODI # 4662
5,India Vs Australia,India,6,Ravindra Jadeja,9,22,0,0,40.9,out,ODI # 4662
6,India Vs Australia,India,7,Suryakumar Yadav,18,28,1,0,64.28,out,ODI # 4662
7,India Vs Australia,India,8,Mohammed Shami,6,10,1,0,60.0,out,ODI # 4662
8,India Vs Australia,India,9,Jasprit Bumrah,1,3,0,0,33.33,out,ODI # 4662
9,India Vs Australia,India,10,Kuldeep Yadav,10,18,0,0,55.55,out,ODI # 4662


In [26]:
df_batting['batsmanName'] = df_batting['batsmanName'].apply(lambda x: x.replace('â€', ''))
df_batting['batsmanName'] = df_batting['batsmanName'].apply(lambda x: x.replace('\xa0', ''))
df_batting.head()

Unnamed: 0,match,teamInnings,battingPos,batsmanName,runs,balls,4s,6s,SR,out/not_out,match_id
0,India Vs Australia,India,1,Rohit Sharma(c),47,31,4,3,151.61,out,ODI # 4662
1,India Vs Australia,India,2,Shubman Gill,4,7,0,0,57.14,out,ODI # 4662
2,India Vs Australia,India,3,Virat Kohli,54,63,4,0,85.71,out,ODI # 4662
3,India Vs Australia,India,4,Shreyas Iyer,4,3,1,0,133.33,out,ODI # 4662
4,India Vs Australia,India,5,KL Rahul†,66,107,1,0,61.68,out,ODI # 4662


In [27]:
df_batting.shape

(876, 11)

In [28]:
df_batting.to_csv('D:/fact_bating_summary.csv', index = False)

In [29]:
with open('D:/bowling_summary.json') as f:
    data = json.load(f)
    all_records = []
    for rec in data:
        all_records.extend(rec['bowlingSummary'])
all_records[:2]

[{'match': 'India Vs Australia',
  'bowlingTeam': 'Australia',
  'bowlerName': 'Mitchell Starc',
  'overs': '10',
  'maiden': '0',
  'runs': '55',
  'wickets': '3',
  'economy': '5.50',
  '0s': '30',
  '4s': '4',
  '6s': '1',
  'wides': '3',
  'noBalls': '0'},
 {'match': 'India Vs Australia',
  'bowlingTeam': 'Australia',
  'bowlerName': 'Josh Hazlewood',
  'overs': '10',
  'maiden': '0',
  'runs': '60',
  'wickets': '2',
  'economy': '6.00',
  '0s': '22',
  '4s': '4',
  '6s': '1',
  'wides': '1',
  'noBalls': '0'}]

In [30]:
df_bowling = pd.DataFrame(all_records)
print(df_bowling.shape)
df_bowling.head()

(574, 13)


Unnamed: 0,match,bowlingTeam,bowlerName,overs,maiden,runs,wickets,economy,0s,4s,6s,wides,noBalls
0,India Vs Australia,Australia,Mitchell Starc,10,0,55,3,5.5,30,4,1,3,0
1,India Vs Australia,Australia,Josh Hazlewood,10,0,60,2,6.0,22,4,1,1,0
2,India Vs Australia,Australia,Glenn Maxwell,6,0,35,1,5.83,19,4,1,0,0
3,India Vs Australia,Australia,Pat Cummins,10,0,34,2,3.4,30,0,0,2,0
4,India Vs Australia,Australia,Adam Zampa,10,0,44,1,4.4,22,1,0,1,0


In [31]:
df_bowling['match_id'] = df_bowling['match'].map(match_ids_dict)
df_bowling.head()

Unnamed: 0,match,bowlingTeam,bowlerName,overs,maiden,runs,wickets,economy,0s,4s,6s,wides,noBalls,match_id
0,India Vs Australia,Australia,Mitchell Starc,10,0,55,3,5.5,30,4,1,3,0,ODI # 4662
1,India Vs Australia,Australia,Josh Hazlewood,10,0,60,2,6.0,22,4,1,1,0,ODI # 4662
2,India Vs Australia,Australia,Glenn Maxwell,6,0,35,1,5.83,19,4,1,0,0,ODI # 4662
3,India Vs Australia,Australia,Pat Cummins,10,0,34,2,3.4,30,0,0,2,0,ODI # 4662
4,India Vs Australia,Australia,Adam Zampa,10,0,44,1,4.4,22,1,0,1,0,ODI # 4662


In [32]:
df_bowling.to_csv('D:/fact_bowling_summary.csv', index = False)

In [59]:
with open('D:/all_player_info.json') as f:
    data = json.load(f)

In [60]:
df_players = pd.DataFrame(data)

print(df_players.shape)
df_players.head(10)

(160, 6)


Unnamed: 0,name,team,battingStyle,bowlingStyle,playingRole,description
0,Azmatullah Omarzai,Afghanistan,Right hand Bat,Right arm Medium fast,Allrounder,
1,Sadeera Samarawickrama,Sri Lanka,Right hand Bat,Right arm Medium fast,Wicketkeeper Batter,"In the longest format, Sadeera Samarawickrama ..."
2,Kuldeep Yadav,India,Left hand Bat,Left arm Wrist spin,Bowler,Kuldeep Yadav started as a fast bowler when he...
3,Josh Inglis †,Australia,Right hand Bat,Left arm Wrist spin,Wicketkeeper Batter,"Wicketkeeper-batter Josh Inglis, who was born ..."
4,Hardik Pandya,India,Right hand Bat,Right arm Medium fast,Allrounder,Allrounder Hardik Pandya's calling cards brisk...
5,Kasun Rajitha,Sri Lanka,Right hand Bat,Right arm Medium fast,Bowler,"Virtually unheard of before 2015, the first in..."
6,Rachin Ravindra,New Zealand,Left hand Bat,Slow Left arm Orthodox,Batting Allrounder,
7,David Miller,South Africa,Left hand Bat,Right arm Offbreak,Middle order Batter,A hard-hitting left-hander with a penchant for...
8,Dushmantha Chameera,Sri Lanka,Right hand Bat,Right arm Fast,Bowler,"Tall, slim and slippery, Dushmantha Chameera a..."
9,Ryan Klein,Netherlands,Right hand Bat,Right arm Medium fast,Bowler,


In [61]:
df_players['name'] = df_players['name'].apply(lambda x: x.replace('â€', ''))
df_players['name'] = df_players['name'].apply(lambda x: x.replace('†', ''))
df_players['name'] = df_players['name'].apply(lambda x: x.replace('\xa0', ''))
#df_players['name'] = df_players['name'].apply(lambda x: x.replace('(c)', ''))
df_players.head(10)

Unnamed: 0,name,team,battingStyle,bowlingStyle,playingRole,description
0,Azmatullah Omarzai,Afghanistan,Right hand Bat,Right arm Medium fast,Allrounder,
1,Sadeera Samarawickrama,Sri Lanka,Right hand Bat,Right arm Medium fast,Wicketkeeper Batter,"In the longest format, Sadeera Samarawickrama ..."
2,Kuldeep Yadav,India,Left hand Bat,Left arm Wrist spin,Bowler,Kuldeep Yadav started as a fast bowler when he...
3,Josh Inglis,Australia,Right hand Bat,Left arm Wrist spin,Wicketkeeper Batter,"Wicketkeeper-batter Josh Inglis, who was born ..."
4,Hardik Pandya,India,Right hand Bat,Right arm Medium fast,Allrounder,Allrounder Hardik Pandya's calling cards brisk...
5,Kasun Rajitha,Sri Lanka,Right hand Bat,Right arm Medium fast,Bowler,"Virtually unheard of before 2015, the first in..."
6,Rachin Ravindra,New Zealand,Left hand Bat,Slow Left arm Orthodox,Batting Allrounder,
7,David Miller,South Africa,Left hand Bat,Right arm Offbreak,Middle order Batter,A hard-hitting left-hander with a penchant for...
8,Dushmantha Chameera,Sri Lanka,Right hand Bat,Right arm Fast,Bowler,"Tall, slim and slippery, Dushmantha Chameera a..."
9,Ryan Klein,Netherlands,Right hand Bat,Right arm Medium fast,Bowler,


In [62]:
#df_players = df_players.drop_duplicates()

In [63]:
df_players[df_players['team'] == 'India']

Unnamed: 0,name,team,battingStyle,bowlingStyle,playingRole,description
2,Kuldeep Yadav,India,Left hand Bat,Left arm Wrist spin,Bowler,Kuldeep Yadav started as a fast bowler when he...
4,Hardik Pandya,India,Right hand Bat,Right arm Medium fast,Allrounder,Allrounder Hardik Pandya's calling cards brisk...
10,Rohit Sharma,India,Right hand Bat,Right arm Offbreak,Top order Batter,"Languid and easy on the eye, Rohit Sharma owne..."
12,Mohammed Shami,India,Right hand Bat,Right arm Fast,Bowler,Mohammed Shami was India's leading fast bowler...
33,Ishan Kishan,India,Left hand Bat,Legbreak,Wicketkeeper Batter,A wicketkeeper and left-handed opening batsman...
46,Mohammed Siraj,India,Right hand Bat,Right arm Fast,Bowler,Mohammed Siraj's rise as a seamer was remarkab...
59,Ravichandran Ashwin,India,Right hand Bat,Right arm Offbreak,Bowling Allrounder,R Ashwin took the tricks and skills he learned...
77,Rohit Sharma(c),India,Right hand Bat,Right arm Offbreak,Top order Batter,"Languid and easy on the eye, Rohit Sharma owne..."
81,Shubman Gill,India,Right hand Bat,Right arm Offbreak,Opening Batter,"A right-hand top-order batsman from Punjab, Sh..."
86,KL Rahul,India,Right hand Bat,Right arm Fast,Wicketkeeper Batter,"A tall, elegant right-hand batsman who can kee..."
