In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import warnings
# Install a blanket "ignore" filter so warnings raised by later cells are not shown.
# NOTE(review): this hides every warning category, including deprecation
# notices from pandas/NumPy — consider narrowing it to a specific category.
warnings.filterwarnings("ignore")

In [36]:
# Read the two pre-processed CSV files from the current working directory.
# Matches is loaded first, then Deliveries; both names are used by later cells.
Matches, Deliveries = (
    pd.read_csv(csv_name)
    for csv_name in ("matches_processed.csv", "deliveries_processed.csv")
)

In [37]:
# List the column labels of both tables (matches first, then deliveries),
# one Index repr after the other.
print(Matches.columns, Deliveries.columns, sep="\n")

Index(['id', 'player_of_match', 'venue', 'team1', 'team2', 'toss_winner',
       'winner', 'result', 'target_runs', 'target_overs', 'Home_Win', 'loser',
       'Home_Loss', 'Neutral', 'toss_match_win'],
      dtype='object')
Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs',
       'total_runs', 'extras_type', 'is_wicket', 'player_dismissed',
       'dismissal_kind', 'score'],
      dtype='object')


In [38]:
# Pool every name that appears in the batter, bowler or non_striker column
# (stacked in that order), drop missing values, and keep each distinct name
# once, in order of first appearance.
unique_players = (
    pd.concat([Deliveries[role] for role in ('batter', 'bowler', 'non_striker')])
    .dropna()
    .unique()
)
print(unique_players)

['SC Ganguly' 'BB McCullum' 'RT Ponting' 'DJ Hussey' 'Mohammad Hafeez'
 'R Dravid' 'W Jaffer' 'V Kohli' 'JH Kallis' 'CL White' 'MV Boucher'
 'B Akhil' 'AA Noffke' 'P Kumar' 'Z Khan' 'SB Joshi' 'PA Patel'
 'ML Hayden' 'MEK Hussey' 'MS Dhoni' 'SK Raina' 'JDP Oram' 'S Badrinath'
 'K Goel' 'JR Hopes' 'KC Sangakkara' 'Yuvraj Singh' 'SM Katich'
 'IK Pathan' 'T Kohli' 'YK Pathan' 'SR Watson' 'M Kaif' 'DS Lehmann'
 'RA Jadeja' 'M Rawat' 'D Salunkhe' 'SK Warne' 'SK Trivedi' 'G Gambhir'
 'V Sehwag' 'S Dhawan' 'L Ronchi' 'ST Jayasuriya' 'DJ Thornely'
 'RV Uthappa' 'PR Shah' 'AM Nayar' 'SM Pollock' 'Harbhajan Singh'
 'S Chanderpaul' 'LRPL Taylor' 'AC Gilchrist' 'Y Venugopal Rao'
 'VVS Laxman' 'A Symonds' 'RG Sharma' 'SB Styris' 'AS Yadav' 'SB Bangar'
 'WPUJC Vaas' 'RP Singh' 'WP Saha' 'LR Shukla' 'DPMD Jayawardene'
 'S Sohal' 'B Lee' 'PP Chawla' 'WA Mota' 'Kamran Akmal' 'Shahid Afridi'
 'DJ Bravo' 'MA Khote' 'A Nehra' 'GC Smith' 'Pankaj Singh' 'RR Sarwan'
 'S Sreesanth' 'VRV Singh' 'SS Tiwary' 'DS

In [40]:
# Per-player career statistics.
# Produces one row per name in `unique_players`, with batting and bowling
# aggregates computed from the `Deliveries` table. Players with no batting
# (or no bowling) rows get 0 in the corresponding columns.

Players = pd.DataFrame(unique_players, columns=['player'])

# Total runs per batter. Not consumed anywhere in this cell (batting_stats
# carries the same totals in `total_runs`); kept in case other cells use it.
runs_df = Deliveries.groupby('batter')['batsman_runs'].sum().reset_index()

# Batting aggregates, one row per batter, keyed on 'player'.
# The grouping column is the lowercase 'batter': DataFrame.rename silently
# ignores keys that match no column, so the key's case matters here.
#
# NOTE(review): matches_played only counts matches in which the player has
# at least one row as `batter`; anyone who appears only as bowler or
# non-striker ends up with 0 after the fillna below — confirm this is intended.
# NOTE(review): balls_faced counts every delivery row for the batter; if the
# data contains wides (see `extras_type`) they are counted too — verify.
batting_stats = (
    Deliveries.groupby('batter')
    .agg(
        matches_played=('match_id', 'nunique'),
        total_runs=('batsman_runs', 'sum'),
        balls_faced=('batsman_runs', 'count'),
        total_4s=('batsman_runs', lambda runs: (runs == 4).sum()),
        total_6s=('batsman_runs', lambda runs: (runs == 6).sum()),
    )
    .reset_index()
    .rename(columns={'batter': 'player'})
)

# Bowling aggregates, one row per bowler, keyed on 'player'.
# `balls_bowled` is only an intermediate: it is converted to `overs_bowled`
# as a decimal fraction (balls / 6, e.g. 36.5), not cricket "overs.balls"
# notation, and then dropped.
#
# NOTE(review): wickets_taken sums `is_wicket` over every row for the bowler
# and runs_given sums `total_runs`; if run-outs or byes/leg-byes are present
# they are attributed to the bowler as well — check against `dismissal_kind`
# and `extras_type`.
bowling_stats = (
    Deliveries.groupby('bowler')
    .agg(
        balls_bowled=('match_id', 'count'),
        extras_given=('extra_runs', 'sum'),
        runs_given=('total_runs', 'sum'),
        wickets_taken=('is_wicket', 'sum'),
    )
    .reset_index()
    .rename(columns={'bowler': 'player'})
    .assign(overs_bowled=lambda stats: stats['balls_bowled'] / 6)
    .drop(columns='balls_bowled')
)

# Left-join both stat tables onto the full player list so every player is
# kept; missing batting/bowling figures become 0.
players_stats = (
    Players
    .merge(batting_stats, on='player', how='left')
    .merge(bowling_stats, on='player', how='left')
    .fillna(0)
)

# Both stat tables have one row per player, so the joins must not add rows.
assert len(players_stats) == len(Players), "merge duplicated player rows"

players_stats.head()

Unnamed: 0,player,matches_played,total_runs,balls_faced,total_4s,total_6s,extras_given,runs_given,wickets_taken,overs_bowled
0,SC Ganguly,38.0,1031.0,977.0,104.0,36.0,8.0,273.0,9.0,36.5
1,BB McCullum,90.0,2256.0,1777.0,230.0,103.0,0.0,0.0,0.0,0.0
2,RT Ponting,8.0,77.0,112.0,3.0,2.0,0.0,0.0,0.0,0.0
3,DJ Hussey,57.0,1271.0,1051.0,87.0,58.0,18.0,485.0,10.0,53.833333
4,Mohammad Hafeez,8.0,64.0,84.0,7.0,2.0,10.0,71.0,2.0,10.5
