In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Preprocessing pipeline for the match dataset:
#   1. load the CSV,
#   2. standardize every numeric column,
#   3. encode the categorical columns,
#   4. add engineered goal features,
#   5. write the transformed frame back to disk.

# 1. Load dataset
df = pd.read_csv("mydata.csv")
print("Original shape:", df.shape)

# Capture the raw goal counts *before* step 2: scaling overwrites every
# numeric column of df in place, and the engineered features in step 4 must be
# computed from real goal counts, not from z-scores (a difference or sum of two
# independently standardized columns has no meaningful interpretation).
goal_cols = ["Goals Home", "Away Goals"]
has_goal_cols = all(col in df.columns for col in goal_cols)
if has_goal_cols:
    raw_home_goals = df["Goals Home"].copy()
    raw_away_goals = df["Away Goals"].copy()

# 2. Normalize / Standardize numerical columns
# Select numeric columns
num_cols = df.select_dtypes(include=np.number).columns.tolist()

# Standardize (mean=0, std=1)
scaler = StandardScaler()
if num_cols:
    df[num_cols] = scaler.fit_transform(df[num_cols])
    print("\nStandardized numerical columns")
else:
    # StandardScaler raises on a zero-column input, so skip it explicitly.
    print("\nNo numerical columns to standardize")

# 3. Encode categorical variables
cat_cols = df.select_dtypes(include="object").columns.tolist()

# One-hot encode "stadium" and label encode "Home Team" + "Away Team"
df_encoded = df.copy()

# One-hot encode "stadium"
if "stadium" in cat_cols:
    # drop="first" removes one dummy column to avoid perfect collinearity.
    ohe = OneHotEncoder(sparse_output=False, drop="first")
    ohe_array = ohe.fit_transform(df[["stadium"]])
    # Reuse df's index so the column-wise concat aligns row for row even if
    # df does not carry a default RangeIndex.
    ohe_df = pd.DataFrame(
        ohe_array,
        columns=ohe.get_feature_names_out(["stadium"]),
        index=df.index,
    )
    df_encoded = pd.concat([df_encoded.drop(columns=["stadium"]), ohe_df], axis=1)
    print("One-hot encoded 'stadium'")

# Label encode the team columns (a fresh encoder is fitted per column, so the
# integer codes of "Home Team" and "Away Team" are independent of each other).
for col in ["Home Team", "Away Team"]:
    if col in df.columns:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df[col])
        print(f"Label encoded '{col}'")

# 4. Create at least two new features
if has_goal_cols:
    # Goal Difference (Home - Away), from the raw (unscaled) goal counts
    df_encoded["goal_diff"] = raw_home_goals - raw_away_goals
    # Total Goals, from the raw (unscaled) goal counts
    df_encoded["total_goals"] = raw_home_goals + raw_away_goals
    print("Created new features: 'goal_diff', 'total_goals'")
else:
    print("Skipped new features: 'Goals Home' and/or 'Away Goals' not found")

# 5. Save transformed dataset
df_encoded.to_csv("mydata_transformed.csv", index=False)
print("\nTransformed dataset saved as 'mydata_transformed.csv'")
print("Final shape:", df_encoded.shape)


Original shape: (1140, 40)

Standardized numerical columns
One-hot encoded 'stadium'
Label encoded 'Home Team'
Label encoded 'Away Team'
Created new features: 'goal_diff', 'total_goals'

Transformed dataset saved as 'mydata_transformed.csv'
Final shape: (1140, 66)
