In [7]:
from pathlib import Path

import pandas as pd

# Read the three season exports, forcing 'Preferred Foot' and 'Work Rate' to be parsed as strings
df_21, df_22, df_23 = (
    pd.read_csv(csv_path, dtype={'Preferred Foot': str, 'Work Rate': str})
    for csv_path in (
        '../Dataset/FIFA21_official_data.csv',
        '../Dataset/FIFA22_official_data.csv',
        '../Dataset/FIFA23_official_data.csv',
    )
)

# Function to handle missing values and specific column transformations
def handle_missing_values_and_transform(df):
    """Return a cleaned copy of ``df`` with missing values imputed.

    Steps, in order:
      1. float64/int64 columns: NaN is replaced by the column median.
      2. object columns: NaN is replaced by the column's first mode, or by
         'Unknown' when the column has no mode (i.e. no non-null values).
      3. 'Best Overall Rating' (only if present): the first run of digits is
         extracted from each value and converted to float. Values that contain
         no digits become NaN in that step, so they are then replaced by the
         median of the successfully parsed ratings.

    A numeric column with no non-null values has no median and is therefore
    left as all-NaN.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the imputations and the rating clean-up applied.
    """
    # Work on a copy so the caller's frame is left untouched and re-running the
    # calling cell always starts from the same input.
    df = df.copy()

    # Handle missing values for numeric columns
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        df[col] = df[col].fillna(df[col].median())

    # Handle missing values for categorical columns
    # NOTE(review): only object-dtype columns are selected here; columns stored
    # with a dedicated string dtype would be skipped -- confirm against the
    # pandas version in use.
    for col in df.select_dtypes(include=['object']).columns:
        modes = df[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
        df[col] = df[col].fillna(fill_value)

    # Clean up 'Best Overall Rating' column if it exists
    if 'Best Overall Rating' in df.columns:
        # expand=False yields a Series (one capture group) instead of a
        # single-column DataFrame.
        rating = (
            df['Best Overall Rating']
            .astype(str)
            .str.extract(r'(\d+)', expand=False)
            .astype(float)
        )
        # The extraction runs after the imputation above, so any NaN it creates
        # (values without digits) must be filled here or it would leak out.
        if rating.isna().any():
            rating = rating.fillna(rating.median())
        df['Best Overall Rating'] = rating

    return df

# Apply function to datasets
df_21 = handle_missing_values_and_transform(df_21)
df_22 = handle_missing_values_and_transform(df_22)
df_23 = handle_missing_values_and_transform(df_23)

# Combine datasets
# Keep only the columns present in all three seasons, in df_21's column order.
# Building this from a set intersection would give an order that can change
# between kernel runs (string hashing is randomized per process), which would
# make the column order of the saved CSV non-reproducible.
common_columns = [
    col for col in df_21.columns
    if col in df_22.columns and col in df_23.columns
]
df_21 = df_21[common_columns]
df_22 = df_22[common_columns]
df_23 = df_23[common_columns]

# Stack the seasons and renumber rows 0..n-1
combined_df = pd.concat([df_21, df_22, df_23], ignore_index=True)

# Check for remaining missing values
def check_missing_values(df, name):
    """Print the null count of every column in ``df`` that has nulls.

    If no column contains a null, a single confirmation line is printed
    instead. ``name`` is only used to label the printed message.
    """
    null_counts = df.isnull().sum()
    affected = null_counts[null_counts > 0]

    # Nothing to report: confirm and stop.
    if affected.empty:
        print(f"No missing values in {name} Data.")
        return

    print(f"Remaining missing values in {name} Data:")
    print(affected)

# Report any nulls left in the combined dataset
check_missing_values(combined_df, "Combined")

# Report the size of the combined dataset (shape is a (rows, columns) pair)
total_rows, total_columns = combined_df.shape
print("Total rows in the combined dataset:", total_rows)
print("Total columns in the combined dataset:", total_columns)


No missing values in Combined Data.
Total rows in the combined dataset: 51478
Total columns in the combined dataset: 28


In [8]:
# Save the cleaned, combined dataset without the row index.
# The output folder is created first: to_csv does not create missing parent
# directories, so on a fresh checkout the write would otherwise fail.
cleaned_csv_path = Path('../Dataset/Cleaned/combinedCleanedData.csv')
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
combined_df.to_csv(cleaned_csv_path, index=False)

In [10]:
from datetime import datetime
# Stamp the filename with the current local date and time (YYYYMMDD_HHMMSS)
current_timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
output_path = f'../Dataset/Cleaned/combinedCleanedData_{current_timestamp}.csv'

# Write the combined DataFrame (no index column) and report where it went
combined_df.to_csv(output_path, index=False)
print(f"DataFrame saved to {output_path}")

DataFrame saved to ../Dataset/Cleaned/combinedCleanedData_20240709_133832.csv
