In [1]:
import pandas as pd


# Filter criteria: (typically for openers)
# 1. Batting Average > 30
# 2. Strike Rate > 140
# 3. Innings Batted >= 3
# 4. Boundary % > 50


In [2]:
file_path = "icc_t20_worldcup_batting.csv"  
df = pd.read_csv(file_path)
print(df.head())


         batsmanName  runs  balls minutes  4s  6s      SR   match_id
0  Mohammad Rizwan †    15     14      24   0   1  107.14  scorecard
1     Babar Azam (c)    32     28      58   2   0  114.28  scorecard
2     Mohammad Haris     8     12      15   1   0   66.66  scorecard
3        Shan Masood    38     28      46   2   1  135.71  scorecard
4     Iftikhar Ahmed     0      6       8   0   0    0.00  scorecard


In [3]:
df = df.drop(columns=['match_id'])
print(df.head())

         batsmanName  runs  balls minutes  4s  6s      SR
0  Mohammad Rizwan †    15     14      24   0   1  107.14
1     Babar Azam (c)    32     28      58   2   0  114.28
2     Mohammad Haris     8     12      15   1   0   66.66
3        Shan Masood    38     28      46   2   1  135.71
4     Iftikhar Ahmed     0      6       8   0   0    0.00


In [4]:
df = df.sort_values(by='batsmanName')

print(df)

          batsmanName  runs  balls minutes  4s  6s      SR
487   Aaron Finch (c)    13     11      20   1   1  118.18
230   Aaron Finch (c)    63     44      87   5   3  143.18
399   Aaron Finch (c)    31     42      89   0   1   73.80
668  Aayan Afzal Khan     5      7       8   0   0   71.42
604  Aayan Afzal Khan    19     21      33   1   0   90.47
..                ...   ...    ...     ...  ..  ..     ...
289         Yasir Ali     1      1       3   0   0  100.00
176         Yasir Ali     1      3       8   0   0   33.33
608       Zahoor Khan     1      1      15   0   0  100.00
534      Zane Green †     2      3       3   0   0   66.66
665       Zawar Farid     2      4       8   0   0   50.00

[699 rows x 7 columns]


In [5]:
df.describe()

Unnamed: 0,runs,balls,4s,6s
count,699.0,699.0,699.0,699.0
mean,15.978541,13.648069,1.300429,0.473534
std,18.709462,12.718093,1.834448,0.978465
min,0.0,0.0,0.0,0.0
25%,2.0,4.0,0.0,0.0
50%,9.0,10.0,1.0,0.0
75%,22.5,19.0,2.0,1.0
max,109.0,64.0,10.0,8.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699 entries, 487 to 665
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   batsmanName  699 non-null    object
 1   runs         699 non-null    int64 
 2   balls        699 non-null    int64 
 3   minutes      699 non-null    object
 4   4s           699 non-null    int64 
 5   6s           699 non-null    int64 
 6   SR           699 non-null    object
dtypes: int64(4), object(3)
memory usage: 43.7+ KB


In [7]:
name_counts = df['batsmanName'].value_counts()

# Step 2: Add a new column to the DataFrame that shows how many times each player appears
df['inning_batted'] = df['batsmanName'].map(name_counts)

# Step 3: Display the updated DataFrame
print(df)

          batsmanName  runs  balls minutes  4s  6s      SR  inning_batted
487   Aaron Finch (c)    13     11      20   1   1  118.18              3
230   Aaron Finch (c)    63     44      87   5   3  143.18              3
399   Aaron Finch (c)    31     42      89   0   1   73.80              3
668  Aayan Afzal Khan     5      7       8   0   0   71.42              2
604  Aayan Afzal Khan    19     21      33   1   0   90.47              2
..                ...   ...    ...     ...  ..  ..     ...            ...
289         Yasir Ali     1      1       3   0   0  100.00              3
176         Yasir Ali     1      3       8   0   0   33.33              3
608       Zahoor Khan     1      1      15   0   0  100.00              1
534      Zane Green †     2      3       3   0   0   66.66              1
665       Zawar Farid     2      4       8   0   0   50.00              1

[699 rows x 8 columns]


In [8]:
print(df)

          batsmanName  runs  balls minutes  4s  6s      SR  inning_batted
487   Aaron Finch (c)    13     11      20   1   1  118.18              3
230   Aaron Finch (c)    63     44      87   5   3  143.18              3
399   Aaron Finch (c)    31     42      89   0   1   73.80              3
668  Aayan Afzal Khan     5      7       8   0   0   71.42              2
604  Aayan Afzal Khan    19     21      33   1   0   90.47              2
..                ...   ...    ...     ...  ..  ..     ...            ...
289         Yasir Ali     1      1       3   0   0  100.00              3
176         Yasir Ali     1      3       8   0   0   33.33              3
608       Zahoor Khan     1      1      15   0   0  100.00              1
534      Zane Green †     2      3       3   0   0   66.66              1
665       Zawar Farid     2      4       8   0   0   50.00              1

[699 rows x 8 columns]


In [9]:
df['SR'] = pd.to_numeric(df['SR'], errors='coerce')

df_cleaned = df[(df['SR'] != 0) & (df['SR'].notna())]

print(df_cleaned)

          batsmanName  runs  balls minutes  4s  6s      SR  inning_batted
487   Aaron Finch (c)    13     11      20   1   1  118.18              3
230   Aaron Finch (c)    63     44      87   5   3  143.18              3
399   Aaron Finch (c)    31     42      89   0   1   73.80              3
668  Aayan Afzal Khan     5      7       8   0   0   71.42              2
604  Aayan Afzal Khan    19     21      33   1   0   90.47              2
..                ...   ...    ...     ...  ..  ..     ...            ...
289         Yasir Ali     1      1       3   0   0  100.00              3
176         Yasir Ali     1      3       8   0   0   33.33              3
608       Zahoor Khan     1      1      15   0   0  100.00              1
534      Zane Green †     2      3       3   0   0   66.66              1
665       Zawar Farid     2      4       8   0   0   50.00              1

[617 rows x 8 columns]


In [10]:
df['SR'] = pd.to_numeric(df['SR'], errors='coerce').fillna(0)

df['Average_SR'] = df.groupby('batsmanName')['SR'].transform('mean')

print(df)



          batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
487   Aaron Finch (c)    13     11      20   1   1  118.18              3   
230   Aaron Finch (c)    63     44      87   5   3  143.18              3   
399   Aaron Finch (c)    31     42      89   0   1   73.80              3   
668  Aayan Afzal Khan     5      7       8   0   0   71.42              2   
604  Aayan Afzal Khan    19     21      33   1   0   90.47              2   
..                ...   ...    ...     ...  ..  ..     ...            ...   
289         Yasir Ali     1      1       3   0   0  100.00              3   
176         Yasir Ali     1      3       8   0   0   33.33              3   
608       Zahoor Khan     1      1      15   0   0  100.00              1   
534      Zane Green †     2      3       3   0   0   66.66              1   
665       Zawar Farid     2      4       8   0   0   50.00              1   

     Average_SR  
487  111.720000  
230  111.720000  
399  111.720000  
668

In [11]:
df['boundaries'] = df['4s'] + df['6s']

print(df)

          batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
487   Aaron Finch (c)    13     11      20   1   1  118.18              3   
230   Aaron Finch (c)    63     44      87   5   3  143.18              3   
399   Aaron Finch (c)    31     42      89   0   1   73.80              3   
668  Aayan Afzal Khan     5      7       8   0   0   71.42              2   
604  Aayan Afzal Khan    19     21      33   1   0   90.47              2   
..                ...   ...    ...     ...  ..  ..     ...            ...   
289         Yasir Ali     1      1       3   0   0  100.00              3   
176         Yasir Ali     1      3       8   0   0   33.33              3   
608       Zahoor Khan     1      1      15   0   0  100.00              1   
534      Zane Green †     2      3       3   0   0   66.66              1   
665       Zawar Farid     2      4       8   0   0   50.00              1   

     Average_SR  boundaries  
487  111.720000           2  
230  111.720000

In [12]:
df['boundary_runs'] = (df['4s'] * 4) + (df['6s'] * 6)

print(df)

          batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
487   Aaron Finch (c)    13     11      20   1   1  118.18              3   
230   Aaron Finch (c)    63     44      87   5   3  143.18              3   
399   Aaron Finch (c)    31     42      89   0   1   73.80              3   
668  Aayan Afzal Khan     5      7       8   0   0   71.42              2   
604  Aayan Afzal Khan    19     21      33   1   0   90.47              2   
..                ...   ...    ...     ...  ..  ..     ...            ...   
289         Yasir Ali     1      1       3   0   0  100.00              3   
176         Yasir Ali     1      3       8   0   0   33.33              3   
608       Zahoor Khan     1      1      15   0   0  100.00              1   
534      Zane Green †     2      3       3   0   0   66.66              1   
665       Zawar Farid     2      4       8   0   0   50.00              1   

     Average_SR  boundaries  boundary_runs  
487  111.720000           2   

In [13]:

df['boundary_percentage'] = (df['boundary_runs'] / df['runs']) * 100

print(df)


          batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
487   Aaron Finch (c)    13     11      20   1   1  118.18              3   
230   Aaron Finch (c)    63     44      87   5   3  143.18              3   
399   Aaron Finch (c)    31     42      89   0   1   73.80              3   
668  Aayan Afzal Khan     5      7       8   0   0   71.42              2   
604  Aayan Afzal Khan    19     21      33   1   0   90.47              2   
..                ...   ...    ...     ...  ..  ..     ...            ...   
289         Yasir Ali     1      1       3   0   0  100.00              3   
176         Yasir Ali     1      3       8   0   0   33.33              3   
608       Zahoor Khan     1      1      15   0   0  100.00              1   
534      Zane Green †     2      3       3   0   0   66.66              1   
665       Zawar Farid     2      4       8   0   0   50.00              1   

     Average_SR  boundaries  boundary_runs  boundary_percentage  
487  111.

In [14]:
df['Average_Boundary_Percentage'] = df.groupby('batsmanName')['boundary_percentage'].transform('mean')

df

Unnamed: 0,batsmanName,runs,balls,minutes,4s,6s,SR,inning_batted,Average_SR,boundaries,boundary_runs,boundary_percentage,Average_Boundary_Percentage
487,Aaron Finch (c),13,11,20,1,1,118.18,3,111.720000,2,10,76.923077,52.198459
230,Aaron Finch (c),63,44,87,5,3,143.18,3,111.720000,8,38,60.317460,52.198459
399,Aaron Finch (c),31,42,89,0,1,73.80,3,111.720000,1,6,19.354839,52.198459
668,Aayan Afzal Khan,5,7,8,0,0,71.42,2,80.945000,0,0,0.000000,10.526316
604,Aayan Afzal Khan,19,21,33,1,0,90.47,2,80.945000,1,4,21.052632,10.526316
...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,Yasir Ali,1,1,3,0,0,100.00,3,64.443333,0,0,0.000000,0.000000
176,Yasir Ali,1,3,8,0,0,33.33,3,64.443333,0,0,0.000000,0.000000
608,Zahoor Khan,1,1,15,0,0,100.00,1,100.000000,0,0,0.000000,0.000000
534,Zane Green †,2,3,3,0,0,66.66,1,66.660000,0,0,0.000000,0.000000


In [None]:
df['total_runs'] = df.groupby('batsmanName')['runs'].transform('sum')
df['inning_batted'] = df.groupby('batsmanName')['inning_batted'].transform('count')

# Step 2: Calculate batting average for each player and add it to the DataFrame
df['batting_average'] = df['total_runs'] / df['inning_batted']

# Step 3: Display the updated DataFrame with batting averages and other columns
print(df)

          batsmanName  runs  balls minutes  4s  6s      SR  inning_batted  \
487   Aaron Finch (c)    13     11      20   1   1  118.18              3   
230   Aaron Finch (c)    63     44      87   5   3  143.18              3   
399   Aaron Finch (c)    31     42      89   0   1   73.80              3   
668  Aayan Afzal Khan     5      7       8   0   0   71.42              2   
604  Aayan Afzal Khan    19     21      33   1   0   90.47              2   
..                ...   ...    ...     ...  ..  ..     ...            ...   
289         Yasir Ali     1      1       3   0   0  100.00              3   
176         Yasir Ali     1      3       8   0   0   33.33              3   
608       Zahoor Khan     1      1      15   0   0  100.00              1   
534      Zane Green †     2      3       3   0   0   66.66              1   
665       Zawar Farid     2      4       8   0   0   50.00              1   

     Average_SR  boundaries  boundary_runs  boundary_percentage  \
487  111

In [16]:
df.to_csv('processed_batting_stats.csv', index=False)

  values = values.astype(str)


In [None]:
columns_to_keep = ['batsmanName', 'inning_batted', 'Average_SR', 'Average_Boundary_Percentage', 'total_runs', 'batting_average']

# Create the new DataFrame with the specified columns
df2 = df[columns_to_keep]

# Display the new DataFrame
df2.head()

Unnamed: 0,batsmanName,inning_batted,Average_SR,Average_Boundary_Percentage,total_runs,batting_average
487,Aaron Finch (c),3,111.72,52.198459,107,35.666667
230,Aaron Finch (c),3,111.72,52.198459,107,35.666667
399,Aaron Finch (c),3,111.72,52.198459,107,35.666667
668,Aayan Afzal Khan,2,80.945,10.526316,24,12.0
604,Aayan Afzal Khan,2,80.945,10.526316,24,12.0


In [18]:
# Removing duplicate rows from the DataFrame
df2 = df2.drop_duplicates()

# Display the updated DataFrame
print(df2)


          batsmanName  inning_batted  Average_SR  Average_Boundary_Percentage  \
487   Aaron Finch (c)              3  111.720000                    52.198459   
668  Aayan Afzal Khan              2   80.945000                    10.526316   
495        Adam Zampa              2   50.000000                     0.000000   
286      Afif Hossain              5   98.674000                    27.422868   
81      Aiden Markram              4  122.832500                    56.187783   
..                ...            ...         ...                          ...   
180  Wessly Madhevere              8   92.051250                    62.139886   
416         Yasir Ali              3   64.443333                     0.000000   
608       Zahoor Khan              1  100.000000                     0.000000   
534      Zane Green †              1   66.660000                     0.000000   
665       Zawar Farid              1   50.000000                     0.000000   

     total_runs  batting_av

In [19]:
df2.to_csv('final_batting_stats.csv', index=False)

  values = values.astype(str)


In [20]:
# for openers
filtered_df2 = df2[
    (df2['batting_average'] > 30) &
    (df2['Average_SR'] >140) &
    (df2['inning_batted'] > 3) &
    (df2['Average_Boundary_Percentage'] > 50)
]

filtered_df2.reset_index(drop=True, inplace=True)
filtered_df2


Unnamed: 0,batsmanName,inning_batted,Average_SR,Average_Boundary_Percentage,total_runs,batting_average
0,Glenn Phillips,5,143.72,66.412689,201,40.2
1,Suryakumar Yadav,6,182.583333,64.202523,239,39.833333


In [21]:
selected_players = filtered_df2['batsmanName']
selected_players

0      Glenn Phillips
1    Suryakumar Yadav
Name: batsmanName, dtype: object

# Filter criteria: (typically for middle order)
# Batting Average > 40
# Strike Rate > 125
# Innings Batted > 3
# Average balls faced by the batter in an innings> 20

In [22]:
df['total_balls'] = df.groupby('batsmanName')['balls'].transform('sum')

df['innings_played'] = df.groupby('batsmanName')['inning_batted'].transform('count')
df['balls_avg'] = df['total_balls'] / df['innings_played']
df.head(10)

Unnamed: 0,batsmanName,runs,balls,minutes,4s,6s,SR,inning_batted,Average_SR,boundaries,boundary_runs,boundary_percentage,Average_Boundary_Percentage,total_runs,batting_average,total_balls,innings_played,balls_avg
487,Aaron Finch (c),13,11,20,1,1,118.18,3,111.72,2,10,76.923077,52.198459,107,35.666667,97,3,32.333333
230,Aaron Finch (c),63,44,87,5,3,143.18,3,111.72,8,38,60.31746,52.198459,107,35.666667,97,3,32.333333
399,Aaron Finch (c),31,42,89,0,1,73.8,3,111.72,1,6,19.354839,52.198459,107,35.666667,97,3,32.333333
668,Aayan Afzal Khan,5,7,8,0,0,71.42,2,80.945,0,0,0.0,10.526316,24,12.0,28,2,14.0
604,Aayan Afzal Khan,19,21,33,1,0,90.47,2,80.945,1,4,21.052632,10.526316,24,12.0,28,2,14.0
495,Adam Zampa,0,2,3,0,0,0.0,2,50.0,0,0,,0.0,1,0.5,3,2,1.5
114,Adam Zampa,1,1,5,0,0,100.0,2,50.0,0,0,0.0,0.0,1,0.5,3,2,1.5
286,Afif Hossain,29,19,40,1,1,152.63,5,98.674,2,10,34.482759,27.422868,95,19.0,76,5,15.2
365,Afif Hossain,1,5,8,0,0,20.0,5,98.674,0,0,0.0,27.422868,95,19.0,76,5,15.2
175,Afif Hossain,3,5,12,0,0,60.0,5,98.674,0,0,0.0,27.422868,95,19.0,76,5,15.2


In [23]:
# Merge df2 with df to add the balls_avg column
df2 = df2.merge(df[['batsmanName', 'balls_avg']], on='batsmanName', how='left')

# Display the updated df2
print(df2)


          batsmanName  inning_batted  Average_SR  Average_Boundary_Percentage  \
0     Aaron Finch (c)              3  111.720000                    52.198459   
1     Aaron Finch (c)              3  111.720000                    52.198459   
2     Aaron Finch (c)              3  111.720000                    52.198459   
3    Aayan Afzal Khan              2   80.945000                    10.526316   
4    Aayan Afzal Khan              2   80.945000                    10.526316   
..                ...            ...         ...                          ...   
694         Yasir Ali              3   64.443333                     0.000000   
695         Yasir Ali              3   64.443333                     0.000000   
696       Zahoor Khan              1  100.000000                     0.000000   
697      Zane Green †              1   66.660000                     0.000000   
698       Zawar Farid              1   50.000000                     0.000000   

     total_runs  batting_av

In [24]:
# Removing duplicate rows from the DataFrame
df2 = df2.drop_duplicates()

# Display the updated DataFrame
print(df2)

          batsmanName  inning_batted  Average_SR  Average_Boundary_Percentage  \
0     Aaron Finch (c)              3  111.720000                    52.198459   
3    Aayan Afzal Khan              2   80.945000                    10.526316   
5          Adam Zampa              2   50.000000                     0.000000   
7        Afif Hossain              5   98.674000                    27.422868   
12      Aiden Markram              4  122.832500                    56.187783   
..                ...            ...         ...                          ...   
685  Wessly Madhevere              8   92.051250                    62.139886   
693         Yasir Ali              3   64.443333                     0.000000   
696       Zahoor Khan              1  100.000000                     0.000000   
697      Zane Green †              1   66.660000                     0.000000   
698       Zawar Farid              1   50.000000                     0.000000   

     total_runs  batting_av

In [25]:

filtered_df2 = df2[
    (df2['batting_average'] > 35) &
    (df2['Average_SR'] > 125) &
    (df2['inning_batted'] > 3) &
    (df2['balls_avg'] > 20)
]

filtered_df2.reset_index(drop=True, inplace=True)
filtered_df2



Unnamed: 0,batsmanName,inning_batted,Average_SR,Average_Boundary_Percentage,total_runs,batting_average,balls_avg
0,Alex Hales,6,125.771667,48.981451,212,35.333333,24.0
1,Glenn Phillips,5,143.72,66.412689,201,40.2,25.4
2,Suryakumar Yadav,6,182.583333,64.202523,239,39.833333,21.0
3,Virat Kohli,6,129.858333,49.676193,296,49.333333,36.166667


In [26]:
combined_df = pd.concat([filtered_df2, selected_players])
combined_df.to_csv('selected_players.csv', index=False)

  values = values.astype(str)


# Filter criteria: (typically for lower order)
# Batting Average > 25
# Strike Rate > 130
# Innings Batted > 3
# Average balls faced by the batter in an innings> 12
# Innings Bowled > 1

In [27]:

filtered_df2 = df2[
    (df2['batting_average'] > 25) &
    (df2['Average_SR'] > 130) &
    (df2['inning_batted'] > 3) &
    (df2['balls_avg'] > 12)
]

filtered_df2.reset_index(drop=True, inplace=True)
filtered_df2.to_csv('temp.csv', index=False)
filtered_df2


Unnamed: 0,batsmanName,inning_batted,Average_SR,Average_Boundary_Percentage,total_runs,batting_average,balls_avg
0,Glenn Maxwell,4,161.2125,66.015687,118,29.5,18.25
1,Glenn Phillips,5,143.72,66.412689,201,40.2,25.4
2,Litton Das,5,131.072,50.358543,127,25.4,17.8
3,Marcus Stoinis,4,159.2025,46.891041,126,31.5,19.5
4,Suryakumar Yadav,6,182.583333,64.202523,239,39.833333,21.0
