In [3]:
import pandas as pd

# Load the combined cleaned data
# The path is relative, so it is resolved against the current working directory.
file_path = 'combinedCleanedData.csv'
# Read the CSV into the module-level DataFrame `df` that the checks below operate on.
df = pd.read_csv(file_path)

# Function to check for missing values
def check_missing_values(df, name):
    """Print a per-column report of missing values in *df*.

    Args:
        df: DataFrame to inspect.
        name: Label inserted into the printed messages.

    Prints the columns that contain at least one null together with their
    null counts, or a single "no missing values" line when there are none.
    Returns nothing.
    """
    null_counts = df.isnull().sum()
    # Keep only the columns that actually have nulls.
    columns_with_gaps = null_counts[null_counts > 0]

    if columns_with_gaps.empty:
        print(f"No missing values in {name} Data.")
        return

    print(f"Remaining missing values in {name} Data:")
    print(columns_with_gaps)

# Function to check data types
def check_data_types(df):
    """Print the dtype of every column in *df*, followed by blank lines.

    Args:
        df: DataFrame whose column dtypes are printed.
    """
    column_dtypes = df.dtypes
    # One print call with a newline separator emits the same text as printing
    # the heading, the dtypes and a trailing "\n" one after another.
    print("Data types:", column_dtypes, "\n", sep="\n")

# Function to check for duplicates
def check_duplicates(df):
    """Count fully duplicated rows in *df*, print the count and return it.

    Args:
        df: DataFrame to inspect.

    Returns:
        The number of rows flagged by ``df.duplicated()``.
    """
    duplicate_flags = df.duplicated()
    duplicates = duplicate_flags.sum()
    print(f"Number of duplicate rows: {duplicates}")
    return duplicates

# Function to check value ranges and validity
def check_value_ranges(df):
    """Print the ``describe()`` summary statistics of *df*.

    Args:
        df: DataFrame to summarise.
    """
    summary = df.describe()
    # Heading, the statistics table, then a trailing blank-line spacer.
    for piece in ("Summary statistics:", summary, "\n"):
        print(piece)

# Function to check unique values in categorical columns
def check_unique_values(df, columns):
    """Print the distinct values of each listed column of *df*.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column labels; each one is looked up with
            ``df[col]``, so a label absent from *df* raises ``KeyError``.
    """
    for col in columns:
        distinct_values = df[col].unique()
        # A newline separator reproduces the heading / values / spacer layout.
        print(f"Unique values in {col}:", distinct_values, "\n", sep="\n")

# Check for missing values
check_missing_values(df, "Combined")

# Check data types
check_data_types(df)

# Check for duplicates; rows are dropped only when at least one duplicate was found
num_duplicates = check_duplicates(df)
if num_duplicates > 0:
    df = df.drop_duplicates()

# Check value ranges and validity
check_value_ranges(df)

# Check unique values in categorical columns
categorical_columns = ['Name', 'Nationality', 'Club', 'Preferred Foot', 'Work Rate']
check_unique_values(df, categorical_columns)

# Additional cleaning for 'Value' and 'Wage' columns.
# Every key is applied as a regex substitution: the euro sign is removed and the
# 'M' / 'K' suffixes are turned into exponents (e.g. '€110.5M' -> '110.5e+6'),
# which astype(float) can then parse.
# The euro sign is written as a plain '€' because it is not a regex
# metacharacter; a backslash in front of it would be an invalid string escape
# that recent Python versions warn about.
currency_replacements = {'€': '', 'M': 'e+6', 'K': 'e+3'}
for money_column in ('Value', 'Wage'):
    # Both columns are optional: skip whichever one the loaded data lacks.
    if money_column in df.columns:
        df[money_column] = (
            df[money_column]
            .replace(currency_replacements, regex=True)
            .astype(float)
        )

# Verify the changes
check_value_ranges(df)

# Save the cleaned data to a new CSV file if needed
cleaned_file_path = 'cleaned_combined_data.csv'
df.to_csv(cleaned_file_path, index=False)

print("Data cleaning complete. Cleaned data saved to", cleaned_file_path)

# Print the total number of rows and columns in the cleaned dataset
total_rows = df.shape[0]
total_columns = df.shape[1]
print("Total rows in the cleaned dataset:", total_rows)
print("Total columns in the cleaned dataset:", total_columns)


No missing values in Combined Data.
Data types:
Body Type                    object
Best Overall Rating         float64
Skill Moves                 float64
Joined                       object
Height                       object
Weight                       object
Release Clause               object
Loaned From                  object
Name                         object
Special                       int64
Age                           int64
Wage                         object
ID                            int64
International Reputation    float64
Club                         object
Club Logo                    object
Preferred Foot               object
Real Face                    object
Weak Foot                   float64
Overall                       int64
Potential                     int64
Nationality                  object
Flag                         object
Photo                        object
Value                        object
Work Rate                    object
Contract Valid U