In [None]:
import os
import glob
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Input/output folders (update these paths).
# Built with os.path.join so the separators are right on every platform. The
# previous non-raw literals contained "\d", "\G" and "\C", which are invalid
# escape sequences (a SyntaxWarning on Python 3.12+).
input_dir = os.path.join('src', 'data_processing', 'Global_player_csvs')      # Folder containing the CSV files
output_dir = os.path.join('src', 'data_processing', 'Cleaned_Global_player_csvs')    # Folder where cleaned CSV files will be saved

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Get list of all CSV files in the input directory
csv_files = glob.glob(os.path.join(input_dir, '*.csv'))

# Columns that are coerced to numeric and then min-max scaled
numeric_cols = [
    'batting_position', 'runs', 'balls', 'fours', 'sixes', 'strike_rate',
    'overs', 'total_balls', 'dots', 'maidens',
    'conceded', 'fours_conceded', 'sixes_conceded',
    'wickets', 'LBW', 'Bowled', 'noballs',
    'wides', 'economy_rate', 'catches', 'stumping', 'direct_hit', 'indirect_hit'
]

# Columns written to the cleaned file, in this order (adjust if needed).
# The *_fp columns are passed through unscaled.
selected_features = ['match_id'] + numeric_cols + ['strike_rate_fp', 'batting_fp', 'bowling_fp', 'fielding_fp', 'total_fp']

# Columns removed from every file when present
unwanted_cols = ['date', 'event', 'teamname', 'overs_bowled']

# Initialize the scaler
scaler = MinMaxScaler()

# Process each CSV file
for csv_file in csv_files:
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Drop unwanted columns; errors='ignore' skips any that are absent
    df = df.drop(columns=unwanted_cols, errors='ignore')

    # Convert specified columns to numeric; unparseable values become NaN
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Normalize the numeric columns.
    # NOTE: fit_transform refits the scaler on every file, so each file is
    # scaled against its own per-column min/max and the resulting values are
    # not comparable across files.
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # Select the desired features (raises KeyError if any is missing)
    df_selected = df[selected_features]

    # Create the output file path with the same name as the original file
    base_name = os.path.basename(csv_file)
    output_file = os.path.join(output_dir, base_name)

    # Save the cleaned DataFrame to CSV without index
    df_selected.to_csv(output_file, index=False)

In [2]:
import os
import glob
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Define directories (update these paths as needed)
# NOTE(review): these are absolute, machine-specific paths; the cell only runs
# as-is on a machine that has this exact folder layout.
input_dir = r'C:\Users\kumar\IPL_Fantasy_Score_Prediction\Ashu\Player_records'      # Folder containing the CSV files
output_dir = r'C:\Users\kumar\IPL_Fantasy_Score_Prediction\Ashu\Processed_Player_records'  # Folder where cleaned CSV files will be saved
os.makedirs(output_dir, exist_ok=True)
csv_files = glob.glob(os.path.join(input_dir, '*.csv'))
if not csv_files:
    # Without this guard pd.concat([]) below fails with the much less helpful
    # "No objects to concatenate".
    raise ValueError(f"No CSV files found in {input_dir!r}; nothing to pool for the global scaler.")

# Columns that are coerced to numeric and pooled across all files
numeric_cols = [
    'batting_position', 'runs', 'balls', 'fours', 'sixes', 'strike_rate',
    'overs', 'total_balls', 'dots', 'maidens',
    'conceded', 'fours_conceded', 'sixes_conceded',
    'wickets', 'LBW', 'Bowled', 'noballs',
    'wides', 'economy_rate', 'catches', 'stumping', 'direct_hit', 'indirect_hit'
]
# Columns (and order) of the cleaned output files; not used further in this cell
selected_features = ['match_id'] + numeric_cols + ['strike_rate_fp', 'batting_fp', 'bowling_fp', 'fielding_fp', 'total_fp']
scaler = MinMaxScaler()
global_numeric_data = []

for csv_file in csv_files:
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Only the numeric columns are pooled, so no other column needs to be
    # dropped here. Unparseable values become NaN.
    global_numeric_data.append(df[numeric_cols].apply(pd.to_numeric, errors='coerce'))

# One frame holding the numeric columns of every file, with a fresh 0..n-1 index
global_data = pd.concat(global_numeric_data, ignore_index=True)
global_data

Unnamed: 0,batting_position,runs,balls,fours,sixes,strike_rate,overs,total_balls,dots,maidens,...,wickets,LBW,Bowled,noballs,wides,economy_rate,catches,stumping,direct_hit,indirect_hit
0,7,0,0,0,0,0.00,0.000,0,0,0,...,0,0,0,0,0,0.00,0,0,0,0
1,4,2,2,0,0,100.00,4.000,24,12,0,...,2,0,0,0,3,6.75,0,0,0,0
2,6,0,0,0,0,0.00,3.000,18,4,0,...,0,0,0,0,0,6.67,0,0,0,0
3,9,3,5,0,0,60.00,2.000,12,1,0,...,0,0,0,0,0,12.50,0,0,0,0
4,5,0,0,0,0,0.00,1.333,8,5,0,...,1,0,0,0,1,3.00,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169756,3,20,10,3,1,200.00,1.000,6,0,0,...,0,0,0,0,0,13.00,1,0,0,0
169757,3,1,2,0,0,50.00,0.000,0,0,0,...,0,0,0,0,0,0.00,0,0,0,0
169758,3,12,13,0,1,92.31,0.000,0,0,0,...,0,0,0,0,0,0.00,0,0,0,0
169759,3,17,8,0,2,212.50,0.000,0,0,0,...,0,0,0,0,0,0.00,2,0,0,0


In [4]:
# Fit the shared scaler once on the pooled numeric data, so every file below is
# mapped through the same per-column min/max.
scaler.fit(global_data)

# Columns discarded from each file when they are present
columns_to_discard = ['date', 'event', 'teamname', 'overs_bowled']

for csv_file in csv_files:
    # Read the file and remove the unneeded columns (absent ones are skipped)
    df = pd.read_csv(csv_file).drop(columns=columns_to_discard, errors='ignore')

    # Coerce the numeric columns first; anything unparseable becomes NaN
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Apply the already-fitted global scaler (transform only, no refit)
    df[numeric_cols] = scaler.transform(df[numeric_cols])

    # Keep just the output columns, in the configured order
    df_selected = df[selected_features]

    # Write to the output folder under the input file's own name, without the index
    output_file = os.path.join(output_dir, os.path.basename(csv_file))
    df_selected.to_csv(output_file, index=False)
