In [180]:
import pandas as pd


# Filter criteria: (typically for openers)
# 1. Batting Average > 30
# 2. Strike Rate > 140
# 3. Innings Batted >= 3
# 4. Boundary % > 50


In [181]:
file_path = "icc_t20_worldcup_batting.csv"  
df = pd.read_csv(file_path)
print(df.head())


        batsmanName  runs  balls minutes  4s  6s      SR   match_id
0  Rohit Sharma (c)     9      5       7   2   0  180.00  scorecard
1       Virat Kohli    76     59      87   6   2  128.81  scorecard
2    Rishabh Pant †     0      2       2   0   0    0.00  scorecard
3  Suryakumar Yadav     3      4       9   0   0   75.00  scorecard
4        Axar Patel    47     31      38   1   4  151.61  scorecard


In [182]:
df = df.drop(columns=['match_id'])
print(df.head())

        batsmanName  runs  balls minutes  4s  6s      SR
0  Rohit Sharma (c)     9      5       7   2   0  180.00
1       Virat Kohli    76     59      87   6   2  128.81
2    Rishabh Pant †     0      2       2   0   0    0.00
3  Suryakumar Yadav     3      4       9   0   0   75.00
4        Axar Patel    47     31      38   1   4  151.61


In [183]:
df = df.sort_values(by='batsmanName')

print(df)

         batsmanName  runs  balls minutes  4s  6s      SR
818    Aaron Johnson    23     16      24   5   0  143.75
626    Aaron Johnson    14     13      23   3   0  107.69
468    Aaron Johnson    52     44      64   4   4  118.18
673      Aaron Jones    36     26       -   2   2  138.46
828      Aaron Jones    94     40      56   4  10  235.00
..               ...   ...    ...     ...  ..  ..     ...
799     Zane Green †     0      2       2   0   0    0.00
785  Zeeshan Maqsood    22     20      29   4   0  110.00
502  Zeeshan Maqsood     3      4       6   0   0   75.00
684  Zeeshan Maqsood     1      7      12   0   0   14.28
390  Zeeshan Maqsood     1      5       7   0   0   20.00

[830 rows x 7 columns]


In [184]:
name_counts = df['batsmanName'].value_counts()

# Step 2: Add a new column to the DataFrame that shows how many times each player appears
df['inning_batted'] = df['batsmanName'].map(name_counts)

# Step 3: Display the updated DataFrame
print(df)

         batsmanName  runs  balls minutes  4s  6s      SR  inning_batted
818    Aaron Johnson    23     16      24   5   0  143.75              3
626    Aaron Johnson    14     13      23   3   0  107.69              3
468    Aaron Johnson    52     44      64   4   4  118.18              3
673      Aaron Jones    36     26       -   2   2  138.46              2
828      Aaron Jones    94     40      56   4  10  235.00              2
..               ...   ...    ...     ...  ..  ..     ...            ...
799     Zane Green †     0      2       2   0   0    0.00              3
785  Zeeshan Maqsood    22     20      29   4   0  110.00              4
502  Zeeshan Maqsood     3      4       6   0   0   75.00              4
684  Zeeshan Maqsood     1      7      12   0   0   14.28              4
390  Zeeshan Maqsood     1      5       7   0   0   20.00              4

[830 rows x 8 columns]


In [185]:
print(df)

         batsmanName  runs  balls minutes  4s  6s      SR  inning_batted
818    Aaron Johnson    23     16      24   5   0  143.75              3
626    Aaron Johnson    14     13      23   3   0  107.69              3
468    Aaron Johnson    52     44      64   4   4  118.18              3
673      Aaron Jones    36     26       -   2   2  138.46              2
828      Aaron Jones    94     40      56   4  10  235.00              2
..               ...   ...    ...     ...  ..  ..     ...            ...
799     Zane Green †     0      2       2   0   0    0.00              3
785  Zeeshan Maqsood    22     20      29   4   0  110.00              4
502  Zeeshan Maqsood     3      4       6   0   0   75.00              4
684  Zeeshan Maqsood     1      7      12   0   0   14.28              4
390  Zeeshan Maqsood     1      5       7   0   0   20.00              4

[830 rows x 8 columns]


In [186]:
df['SR'] = pd.to_numeric(df['SR'], errors='coerce')

df_cleaned = df[(df['SR'] != 0) & (df['SR'].notna())]

print(df_cleaned)

         batsmanName  runs  balls minutes  4s  6s      SR  inning_batted
818    Aaron Johnson    23     16      24   5   0  143.75              3
626    Aaron Johnson    14     13      23   3   0  107.69              3
468    Aaron Johnson    52     44      64   4   4  118.18              3
673      Aaron Jones    36     26       -   2   2  138.46              2
828      Aaron Jones    94     40      56   4  10  235.00              2
..               ...   ...    ...     ...  ..  ..     ...            ...
648     Zane Green †    28     27      39   1   1  103.70              3
785  Zeeshan Maqsood    22     20      29   4   0  110.00              4
502  Zeeshan Maqsood     3      4       6   0   0   75.00              4
684  Zeeshan Maqsood     1      7      12   0   0   14.28              4
390  Zeeshan Maqsood     1      5       7   0   0   20.00              4

[706 rows x 8 columns]


In [187]:
df['SR'] = pd.to_numeric(df['SR'], errors='coerce').fillna(0)

df['Average_SR'] = df.groupby('batsmanName')['SR'].transform('mean')

print(df)



         batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
818    Aaron Johnson    23     16      24   5   0  143.75              3   
626    Aaron Johnson    14     13      23   3   0  107.69              3   
468    Aaron Johnson    52     44      64   4   4  118.18              3   
673      Aaron Jones    36     26       -   2   2  138.46              2   
828      Aaron Jones    94     40      56   4  10  235.00              2   
..               ...   ...    ...     ...  ..  ..     ...            ...   
799     Zane Green †     0      2       2   0   0    0.00              3   
785  Zeeshan Maqsood    22     20      29   4   0  110.00              4   
502  Zeeshan Maqsood     3      4       6   0   0   75.00              4   
684  Zeeshan Maqsood     1      7      12   0   0   14.28              4   
390  Zeeshan Maqsood     1      5       7   0   0   20.00              4   

     Average_SR  
818  123.206667  
626  123.206667  
468  123.206667  
673  186.730000

In [188]:
df['boundaries'] = df['4s'] + df['6s']

print(df)

         batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
818    Aaron Johnson    23     16      24   5   0  143.75              3   
626    Aaron Johnson    14     13      23   3   0  107.69              3   
468    Aaron Johnson    52     44      64   4   4  118.18              3   
673      Aaron Jones    36     26       -   2   2  138.46              2   
828      Aaron Jones    94     40      56   4  10  235.00              2   
..               ...   ...    ...     ...  ..  ..     ...            ...   
799     Zane Green †     0      2       2   0   0    0.00              3   
785  Zeeshan Maqsood    22     20      29   4   0  110.00              4   
502  Zeeshan Maqsood     3      4       6   0   0   75.00              4   
684  Zeeshan Maqsood     1      7      12   0   0   14.28              4   
390  Zeeshan Maqsood     1      5       7   0   0   20.00              4   

     Average_SR  boundaries  
818  123.206667           5  
626  123.206667           3

In [189]:
df['boundary_runs'] = (df['4s'] * 4) + (df['6s'] * 6)

print(df)

         batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
818    Aaron Johnson    23     16      24   5   0  143.75              3   
626    Aaron Johnson    14     13      23   3   0  107.69              3   
468    Aaron Johnson    52     44      64   4   4  118.18              3   
673      Aaron Jones    36     26       -   2   2  138.46              2   
828      Aaron Jones    94     40      56   4  10  235.00              2   
..               ...   ...    ...     ...  ..  ..     ...            ...   
799     Zane Green †     0      2       2   0   0    0.00              3   
785  Zeeshan Maqsood    22     20      29   4   0  110.00              4   
502  Zeeshan Maqsood     3      4       6   0   0   75.00              4   
684  Zeeshan Maqsood     1      7      12   0   0   14.28              4   
390  Zeeshan Maqsood     1      5       7   0   0   20.00              4   

     Average_SR  boundaries  boundary_runs  
818  123.206667           5             20

In [190]:

df['boundary_percentage'] = (df['boundary_runs'] / df['runs']) * 100

print(df)


         batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
818    Aaron Johnson    23     16      24   5   0  143.75              3   
626    Aaron Johnson    14     13      23   3   0  107.69              3   
468    Aaron Johnson    52     44      64   4   4  118.18              3   
673      Aaron Jones    36     26       -   2   2  138.46              2   
828      Aaron Jones    94     40      56   4  10  235.00              2   
..               ...   ...    ...     ...  ..  ..     ...            ...   
799     Zane Green †     0      2       2   0   0    0.00              3   
785  Zeeshan Maqsood    22     20      29   4   0  110.00              4   
502  Zeeshan Maqsood     3      4       6   0   0   75.00              4   
684  Zeeshan Maqsood     1      7      12   0   0   14.28              4   
390  Zeeshan Maqsood     1      5       7   0   0   20.00              4   

     Average_SR  boundaries  boundary_runs  boundary_percentage  
818  123.206667      

In [191]:
df['Average_Boundary_Percentage'] = df.groupby('batsmanName')['boundary_percentage'].transform('mean')

df

Unnamed: 0,batsmanName,runs,balls,minutes,4s,6s,SR,inning_batted,Average_SR,boundaries,boundary_runs,boundary_percentage,Average_Boundary_Percentage
818,Aaron Johnson,23,16,24,5,0,143.75,3,123.206667,5,20,86.956522,83.197961
626,Aaron Johnson,14,13,23,3,0,107.69,3,123.206667,3,12,85.714286,83.197961
468,Aaron Johnson,52,44,64,4,4,118.18,3,123.206667,8,40,76.923077,83.197961
673,Aaron Jones,36,26,-,2,2,138.46,2,186.730000,4,20,55.555556,68.203310
828,Aaron Jones,94,40,56,4,10,235.00,2,186.730000,14,76,80.851064,68.203310
...,...,...,...,...,...,...,...,...,...,...,...,...,...
799,Zane Green †,0,2,2,0,0,0.00,3,42.900000,0,0,,17.857143
785,Zeeshan Maqsood,22,20,29,4,0,110.00,4,54.820000,4,16,72.727273,18.181818
502,Zeeshan Maqsood,3,4,6,0,0,75.00,4,54.820000,0,0,0.000000,18.181818
684,Zeeshan Maqsood,1,7,12,0,0,14.28,4,54.820000,0,0,0.000000,18.181818


In [192]:
df['total_runs'] = df.groupby('batsmanName')['runs'].transform('sum')
df['inning_batted'] = df.groupby('batsmanName')['inning_batted'].transform('count')

# Step 2: Calculate batting average for each player and add it to the DataFrame
df['batting_average'] = df['total_runs'] / df['inning_batted']

# Step 3: Display the updated DataFrame with batting averages and other columns
print(df)

         batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
818    Aaron Johnson    23     16      24   5   0  143.75              3   
626    Aaron Johnson    14     13      23   3   0  107.69              3   
468    Aaron Johnson    52     44      64   4   4  118.18              3   
673      Aaron Jones    36     26       -   2   2  138.46              2   
828      Aaron Jones    94     40      56   4  10  235.00              2   
..               ...   ...    ...     ...  ..  ..     ...            ...   
799     Zane Green †     0      2       2   0   0    0.00              3   
785  Zeeshan Maqsood    22     20      29   4   0  110.00              4   
502  Zeeshan Maqsood     3      4       6   0   0   75.00              4   
684  Zeeshan Maqsood     1      7      12   0   0   14.28              4   
390  Zeeshan Maqsood     1      5       7   0   0   20.00              4   

     Average_SR  boundaries  boundary_runs  boundary_percentage  \
818  123.206667     

In [193]:
df.to_csv('processed_batting_stats.csv', index=False)

  values = values.astype(str)


In [194]:
columns_to_keep = ['batsmanName', 'inning_batted', 'Average_SR', 'Average_Boundary_Percentage', 'total_runs', 'batting_average']

# Create the new DataFrame with the specified columns
df2 = df[columns_to_keep]

# Display the new DataFrame
df2.head()

Unnamed: 0,batsmanName,inning_batted,Average_SR,Average_Boundary_Percentage,total_runs,batting_average
818,Aaron Johnson,3,123.206667,83.197961,89,29.666667
626,Aaron Johnson,3,123.206667,83.197961,89,29.666667
468,Aaron Johnson,3,123.206667,83.197961,89,29.666667
673,Aaron Jones,2,186.73,68.20331,130,65.0
828,Aaron Jones,2,186.73,68.20331,130,65.0


In [195]:
# Removing duplicate rows from the DataFrame
df2 = df2.drop_duplicates()

# Display the updated DataFrame
print(df2)


               batsmanName  inning_batted  Average_SR  \
818          Aaron Johnson              3  123.206667   
673            Aaron Jones              2  186.730000   
108        Aaron Jones (c)              4   53.125000   
364         Aasif Sheikh †              2   67.855000   
314           Abbas Afridi              1   80.950000   
..                     ...            ...         ...   
295          Vivian Kingma              1  100.000000   
768  Wanindu Hasaranga (c)              3  111.110000   
400             Will Jacks              2   85.710000   
459           Zane Green †              3   42.900000   
785        Zeeshan Maqsood              4   54.820000   

     Average_Boundary_Percentage  total_runs  batting_average  
818                    83.197961          89        29.666667  
673                    68.203310         130        65.000000  
108                    63.030303          32         8.000000  
364                    76.190476          46        23.0000

In [196]:
df2.to_csv('final_batting_stats.csv', index=False)

  values = values.astype(str)


In [197]:
# for openers
filtered_df2 = df2[
    (df2['batting_average'] > 30) &
    (df2['Average_SR'] >140) &
    (df2['inning_batted'] > 3) &
    (df2['Average_Boundary_Percentage'] > 50)
]

filtered_df2


Unnamed: 0,batsmanName,inning_batted,Average_SR,Average_Boundary_Percentage,total_runs,batting_average
565,Harry Brook,4,158.7025,50.101164,145,36.25
28,Jos Buttler (c)†,7,145.768571,62.078307,214,30.571429
0,Rohit Sharma (c),8,140.00375,63.153005,257,32.125


# Filter criteria: (typically for middle order)
# Batting Average > 40
# Strike Rate > 125
# Innings Batted > 3
# Average balls faced by the batter in an innings> 20

In [198]:
df['total_balls'] = df.groupby('batsmanName')['balls'].transform('sum')

df['innings_played'] = df.groupby('batsmanName')['inning_batted'].transform('count')
df['balls_avg'] = df['total_balls'] / df['innings_played']
df.head(10)

Unnamed: 0,batsmanName,runs,balls,minutes,4s,6s,SR,inning_batted,Average_SR,boundaries,boundary_runs,boundary_percentage,Average_Boundary_Percentage,total_runs,batting_average,total_balls,innings_played,balls_avg
818,Aaron Johnson,23,16,24,5,0,143.75,3,123.206667,5,20,86.956522,83.197961,89,29.666667,73,3,24.333333
626,Aaron Johnson,14,13,23,3,0,107.69,3,123.206667,3,12,85.714286,83.197961,89,29.666667,73,3,24.333333
468,Aaron Johnson,52,44,64,4,4,118.18,3,123.206667,8,40,76.923077,83.197961,89,29.666667,73,3,24.333333
673,Aaron Jones,36,26,-,2,2,138.46,2,186.73,4,20,55.555556,68.20331,130,65.0,66,2,33.0
828,Aaron Jones,94,40,56,4,10,235.0,2,186.73,14,76,80.851064,68.20331,130,65.0,66,2,33.0
108,Aaron Jones (c),10,16,10,2,0,62.5,4,53.125,2,8,80.0,63.030303,32,8.0,54,4,13.5
443,Aaron Jones (c),11,22,29,0,1,50.0,4,53.125,1,6,54.545455,63.030303,32,8.0,54,4,13.5
157,Aaron Jones (c),11,11,12,0,1,100.0,4,53.125,1,6,54.545455,63.030303,32,8.0,54,4,13.5
238,Aaron Jones (c),0,5,4,0,0,0.0,4,53.125,0,0,,63.030303,32,8.0,54,4,13.5
364,Aasif Sheikh †,42,49,77,4,1,85.71,2,67.855,5,22,52.380952,76.190476,46,23.0,57,2,28.5


In [199]:
# Merge df2 with df to add the balls_avg column
df2 = df2.merge(df[['batsmanName', 'balls_avg']], on='batsmanName', how='left')

# Display the updated df2
print(df2)


         batsmanName  inning_batted  Average_SR  Average_Boundary_Percentage  \
0      Aaron Johnson              3  123.206667                    83.197961   
1      Aaron Johnson              3  123.206667                    83.197961   
2      Aaron Johnson              3  123.206667                    83.197961   
3        Aaron Jones              2  186.730000                    68.203310   
4        Aaron Jones              2  186.730000                    68.203310   
..               ...            ...         ...                          ...   
825     Zane Green †              3   42.900000                    17.857143   
826  Zeeshan Maqsood              4   54.820000                    18.181818   
827  Zeeshan Maqsood              4   54.820000                    18.181818   
828  Zeeshan Maqsood              4   54.820000                    18.181818   
829  Zeeshan Maqsood              4   54.820000                    18.181818   

     total_runs  batting_average  balls

In [200]:
# Removing duplicate rows from the DataFrame
df2 = df2.drop_duplicates()

# Display the updated DataFrame
print(df2)

               batsmanName  inning_batted  Average_SR  \
0            Aaron Johnson              3  123.206667   
3              Aaron Jones              2  186.730000   
5          Aaron Jones (c)              4   53.125000   
9           Aasif Sheikh †              2   67.855000   
11            Abbas Afridi              1   80.950000   
..                     ...            ...         ...   
817          Vivian Kingma              1  100.000000   
818  Wanindu Hasaranga (c)              3  111.110000   
821             Will Jacks              2   85.710000   
823           Zane Green †              3   42.900000   
826        Zeeshan Maqsood              4   54.820000   

     Average_Boundary_Percentage  total_runs  batting_average  balls_avg  
0                      83.197961          89        29.666667  24.333333  
3                      68.203310         130        65.000000  33.000000  
5                      63.030303          32         8.000000  13.500000  
9              

In [201]:

filtered_df3 = df2[
    (df2['batting_average'] > 35) &
    (df2['Average_SR'] > 125) &
    (df2['inning_batted'] > 3) &
    (df2['balls_avg'] > 20)
]

filtered_df3



Unnamed: 0,batsmanName,inning_batted,Average_SR,Average_Boundary_Percentage,total_runs,batting_average,balls_avg
249,Harry Brook,4,158.7025,50.101164,145,36.25,23.0
784,Travis Head,7,138.857143,76.387918,255,36.428571,23.0


In [202]:
selected_players = pd.concat([filtered_df2['batsmanName'], filtered_df3['batsmanName']])
selected_players.to_csv('selected_players.csv', index=False)
selected_players

565         Harry Brook
28     Jos Buttler (c)†
0      Rohit Sharma (c)
249         Harry Brook
784         Travis Head
Name: batsmanName, dtype: object

# Filter criteria: (typically for lower order)
# Batting Average > 25
# Strike Rate > 130
# Innings Batted > 3
# Average balls faced by the batter in an innings> 12
# Innings Bowled > 1

In [203]:

filtered_df2 = df2[
    (df2['batting_average'] > 25) &
    (df2['Average_SR'] > 130) &
    (df2['inning_batted'] > 3) &
    (df2['balls_avg'] > 12)
]

filtered_df2.reset_index(drop=True, inplace=True)
filtered_df2.to_csv('temp.csv', index=False)
filtered_df2



Unnamed: 0,batsmanName,inning_batted,Average_SR,Average_Boundary_Percentage,total_runs,batting_average,balls_avg
0,Andries Gous †,4,137.865,49.892241,119,29.75,18.25
1,David Warner,7,131.067143,51.998213,178,25.428571,18.285714
2,George Munsey,4,134.02,55.156794,124,31.0,22.25
3,Harry Brook,4,158.7025,50.101164,145,36.25,23.0
4,Jos Buttler (c)†,7,145.768571,62.078307,214,30.571429,19.285714
5,Marcus Stoinis,5,136.144,50.011575,169,33.8,20.6
6,Nicholas Pooran †,7,132.4,61.103951,228,32.571429,22.285714
7,Phil Salt,7,171.785714,62.568387,188,26.857143,16.857143
8,Rohit Sharma (c),8,140.00375,63.153005,257,32.125,20.5
9,Travis Head,7,138.857143,76.387918,255,36.428571,23.0
