In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("CombinedcleanedOdiStats.csv")
df

Unnamed: 0,PlayerName,Matches,Innings,Average,StrikeRate,HighestScore,4s,6s,50s,100s,Runs,Gender
0,Sachin Tendulkar,463,452,44.83,86.23,200,2016.0,195.0,96,49,18426,Men
1,Virat Kohli,305,293,57.71,93.26,183,1332.0,152.0,75,51,14255,Men
2,Rohit Sharma,276,268,49.22,92.66,264,1066.0,349.0,59,33,11370,Men
3,Sourav Ganguly,311,300,41.02,73.70,183,1122.0,190.0,72,22,11363,Men
4,Rahul Dravid,344,318,39.16,71.23,153,950.0,42.0,83,12,10889,Men
...,...,...,...,...,...,...,...,...,...,...,...,...
366,Bindeshwari Goyal,4,1,0.00,25.00,1,,,0,0,1,Women
367,Samantha Lobatto,3,2,0.00,12.50,1,,,0,0,1,Women
368,Anjali Sharma,3,3,0.33,,1,,,0,0,1,Women
369,Rita Patel,1,1,1.00,,1,,,0,0,1,Women


### ðŸ§® Feature Engineering (Using Existing Columns)

In this notebook, I created several new features using the already available columns to get better insights into player performance:

- **Landmarks** â†’ created using  
  `50s + 100s`  
  (Total number of 50+ scores)

- **BoundariesHit** â†’ created using  
  `4s + 6s`  
  (Total number of boundaries hit)

- **ConversionRate** â†’ created using  
  `(50s + 100s) / Innings`  
  (How well a player converts innings into big scores)

- **Boundary%** â†’ created using  
  `(Runs from boundaries / Total Runs)`  
  (Shows how much a player depends on boundaries)

- **Rating Columns (RunsRating, AvgRating, SRRating, LandmarksRating, BoundariesRating)**  
  These were created by normalizing the existing statistical columns to compare players fairly.

- **TotalRating** â†’ created using  
  `sum of all rating columns`

- **Rating10** â†’ created by  
  `normalizing TotalRating to a 10-point scale`

These engineered features help in analyzing player performance more effectively.

In [3]:
import numpy as np

# ---------------------- 1. Conversion Rate ----------------------
# Correct cricket formula: 100s / (50s + 100s)
df['ConversionRate'] = (
    df['100s'] / (df['50s'] + df['100s']).replace(0, 1) * 100
).round(2)

# ---------------------- 2. Balls Faced ----------------------
# Estimated from Strike Rate: Balls = (Runs * 100) / SR
df['BallsFaced'] = (
    df['Runs'] * 100 / df['StrikeRate']
).replace([np.inf, -np.inf], np.nan).round(0)

# ---------------------- 3. Boundary Percentage ----------------------
df['Boundary%'] = (
    ((df['4s'] * 4) + (df['6s'] * 6)) / df['Runs'] * 100
).round(2)

# ---------------------- 4. Landmarks (50s + 100s) ----------------------
df['Landmarks'] = df['50s'] + df['100s']

# ---------------------- 5. Boundaries Hit (4s + 6s) ----------------------
df['BoundariesHit'] = df['4s'] + df['6s']

# ============================================================
# =============== RATING SYSTEM FEATURE COLUMNS ===============
# ============================================================

# ---------------------- Rating Functions ----------------------
def rate_runs(r):
    if r <= 1000: return 1
    elif r <= 3000: return 2
    elif r <= 5000: return 3
    elif r <= 8000: return 4
    else: return 5

def rate_average(a):
    if a <= 10: return 1
    elif a <= 20: return 2
    elif a <= 30: return 3
    elif a <= 35: return 4
    else: return 5

def rate_sr(sr):
    if sr <= 70: return 1
    elif sr <= 90: return 2
    elif sr <= 110: return 3
    elif sr <= 130: return 4
    else: return 5

def rate_landmarks(x):
    if x <= 10: return 1
    elif x <= 20: return 2
    elif x <= 35: return 3
    elif x <= 50: return 4
    else: return 5

def rate_boundaries(b):
    if b <= 75: return 1
    elif b <= 150: return 2
    elif b <= 250: return 3
    elif b <= 350: return 4
    else: return 5

# ---------------------- Apply Ratings ----------------------
df['RunsRating'] = df['Runs'].apply(rate_runs)
df['AvgRating'] = df['Average'].apply(rate_average)
df['SRRating'] = df['StrikeRate'].apply(rate_sr)
df['LandmarksRating'] = df['Landmarks'].apply(rate_landmarks)
df['BoundariesRating'] = df['BoundariesHit'].apply(rate_boundaries)

# ---------------------- Final Rating (Out of 25) ----------------------
df['TotalRating'] = (
    df['RunsRating'] +
    df['AvgRating'] +
    df['SRRating'] +
    df['LandmarksRating'] +
    df['BoundariesRating']
)

# ---------------------- Scale to Rating /10 ----------------------
df['Rating10'] = ((df['TotalRating'] / 25) * 10).round(1)

# ---------------------- Final Added Features List ----------------------
added_features = [
    'ConversionRate', 'BallsFaced', 'Boundary%', 'Landmarks',
    'BoundariesHit', 'RunsRating', 'AvgRating', 'SRRating',
    'LandmarksRating', 'BoundariesRating', 'TotalRating', 'Rating10'
]

print("FEATURES ADDED TO DATASET:")
print(added_features)

FEATURES ADDED TO DATASET:
['ConversionRate', 'BallsFaced', 'Boundary%', 'Landmarks', 'BoundariesHit', 'RunsRating', 'AvgRating', 'SRRating', 'LandmarksRating', 'BoundariesRating', 'TotalRating', 'Rating10']


In [4]:
df

Unnamed: 0,PlayerName,Matches,Innings,Average,StrikeRate,HighestScore,4s,6s,50s,100s,...,Boundary%,Landmarks,BoundariesHit,RunsRating,AvgRating,SRRating,LandmarksRating,BoundariesRating,TotalRating,Rating10
0,Sachin Tendulkar,463,452,44.83,86.23,200,2016.0,195.0,96,49,...,50.11,145,2211.0,5,5,2,5,5,22,8.8
1,Virat Kohli,305,293,57.71,93.26,183,1332.0,152.0,75,51,...,43.77,126,1484.0,5,5,3,5,5,23,9.2
2,Rohit Sharma,276,268,49.22,92.66,264,1066.0,349.0,59,33,...,55.92,92,1415.0,5,5,3,5,5,23,9.2
3,Sourav Ganguly,311,300,41.02,73.70,183,1122.0,190.0,72,22,...,49.53,94,1312.0,5,5,2,5,5,22,8.8
4,Rahul Dravid,344,318,39.16,71.23,153,950.0,42.0,83,12,...,37.21,95,992.0,5,5,2,5,5,22,8.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,Bindeshwari Goyal,4,1,0.00,25.00,1,,,0,0,...,,0,,1,1,1,1,5,9,3.6
367,Samantha Lobatto,3,2,0.00,12.50,1,,,0,0,...,,0,,1,1,1,1,5,9,3.6
368,Anjali Sharma,3,3,0.33,,1,,,0,0,...,,0,,1,1,5,1,5,13,5.2
369,Rita Patel,1,1,1.00,,1,,,0,0,...,,0,,1,1,5,1,5,13,5.2


In [8]:
#df.to_csv("addedfeaturesodistats.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PlayerName        371 non-null    object 
 1   Matches           371 non-null    int64  
 2   Innings           371 non-null    int64  
 3   Average           371 non-null    float64
 4   StrikeRate        322 non-null    float64
 5   HighestScore      371 non-null    int64  
 6   4s                254 non-null    float64
 7   6s                154 non-null    float64
 8   50s               371 non-null    int64  
 9   100s              371 non-null    int64  
 10  Runs              371 non-null    int64  
 11  Gender            371 non-null    object 
 12  ConversionRate    371 non-null    float64
 13  BallsFaced        322 non-null    float64
 14  Boundary%         151 non-null    float64
 15  Landmarks         371 non-null    int64  
 16  BoundariesHit     151 non-null    float64
 1