In [1]:
# %% [markdown]
# # Hybrid Model Training with Train/Validation Split
# We'll split our data like a coach splits a team for practice and testing, extract features (like a detective with a checklist), and train a Random Forest model.

# %%
# Install third-party dependencies into the active kernel's environment (no versions are pinned).
%pip install split-folders opencv-python-headless scikit-learn scikit-image joblib tqdm

Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl.metadata (6.2 kB)
Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
# %% [markdown]
# ## Step 1: Split the Dataset into Train and Validation Sets

# %%
import splitfolders

# Split every class folder of the preprocessed dataset into train/ and val/
# subsets under the output directory. The fixed seed keeps the 80/20
# assignment repeatable between runs.
splitfolders.ratio(
    "Dataset/preprocessed_images",
    output="Dataset/split_preprocessed_images",
    seed=42,
    ratio=(0.8, 0.2),
)

# %% [markdown]
# ## Step 2: Feature Extraction Functions

# %%
import cv2
import numpy as np
from skimage.feature import local_binary_pattern
import os

def extract_features(img):
    """Turn one BGR image into a flat float32 descriptor (color, texture, shape).

    The returned vector concatenates, in this order:
      * three 8-bin histograms of the HSV channels (hue with range [0, 180],
        saturation and value with range [0, 256]),
      * a 10-bin histogram of uniform LBP codes (P=8, R=1) of the grayscale image,
      * the area of the largest external contour found on the grayscale image
        (0 when no contour is found).

    Args:
        img: image array in BGR channel order, as accepted by cv2.cvtColor.

    Returns:
        1-D np.float32 array with 8 * 3 + 10 + 1 = 35 entries.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    descriptor = []

    # Color clues: one 8-bin histogram per HSV channel.
    for channel, upper in ((0, 180), (1, 256), (2, 256)):
        channel_hist = cv2.calcHist([hsv_img], [channel], None, [8], [0, upper])
        descriptor.extend(channel_hist.flatten())

    # Texture clues: uniform LBP yields P + 2 distinct codes, one unit-width bin each.
    n_points, radius = 8, 1
    lbp_codes = local_binary_pattern(gray_img, P=n_points, R=radius, method='uniform')
    lbp_counts, _edges = np.histogram(
        lbp_codes.ravel(),
        bins=np.arange(0, n_points + 3),
        range=(0, n_points + 2),
    )
    descriptor.extend(lbp_counts)

    # Shape clue: area of the biggest outer contour.
    # NOTE(review): the grayscale image is passed without thresholding, so this
    # presumably relies on the preprocessed images having a zero background - confirm.
    contours, _hierarchy = cv2.findContours(
        gray_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    biggest_area = max((cv2.contourArea(contour) for contour in contours), default=0)
    descriptor.append(biggest_area)

    return np.asarray(descriptor, dtype=np.float32)

# %% [markdown]
# ## Step 3: Prepare Data Loaders for Train/Val

# %%
import glob

def load_data(image_folder):
    """Build a feature matrix and label vector from a class-per-folder image tree.

    Each sub-directory of ``image_folder`` is treated as one class. Class names
    are sorted and numbered from 0 in that order. Every ``*.jpg`` file in a
    class directory is read, resized to 224x224, has its green channel scaled,
    and is converted to a feature vector with ``extract_features``.

    Args:
        image_folder: path of the directory holding one sub-directory per class.

    Returns:
        Tuple ``(X, y, class_to_idx)`` where ``X`` is the array of feature
        vectors, ``y`` the array of integer class labels (same order as ``X``),
        and ``class_to_idx`` the dict mapping class name to label index.
        Files that cv2.imread cannot decode are skipped silently.
    """
    X, y = [], []
    # Only real sub-directories are classes; a stray file (e.g. .DS_Store)
    # would otherwise claim a label index and shift every later class.
    class_names = sorted(
        entry for entry in os.listdir(image_folder)
        if os.path.isdir(os.path.join(image_folder, entry))
    )
    class_to_idx = {name: idx for idx, name in enumerate(class_names)}
    for class_name in class_names:
        class_dir = os.path.join(image_folder, class_name)
        # glob returns files in filesystem order; sorting keeps the sample
        # order (and anything fitted on it with a fixed seed) reproducible.
        for img_file in sorted(glob.glob(os.path.join(class_dir, "*.jpg"))):
            img = cv2.imread(img_file)
            if img is None:
                continue
            img = cv2.resize(img, (224, 224))
            # Scale channel 1 (green in BGR order) by 0.6.
            # NOTE(review): this was labelled "emphasis", but a factor below 1
            # attenuates green - confirm the intended direction.
            img[:, :, 1] = img[:, :, 1] * 0.6
            feats = extract_features(img)
            X.append(feats)
            y.append(class_to_idx[class_name])
    return np.array(X), np.array(y), class_to_idx

# Load training data; its class-to-index mapping defines the labels the model learns.
X_train, y_train, class_to_idx = load_data("Dataset/split_preprocessed_images/train")
# Load validation data
X_val, y_val, val_class_to_idx = load_data("Dataset/split_preprocessed_images/val")
# Each folder is indexed independently, so a differing set of class folders
# would make validation labels refer to different classes than training labels.
if val_class_to_idx != class_to_idx:
    raise ValueError(
        "Train and validation class folders differ: "
        f"train={sorted(class_to_idx)}, val={sorted(val_class_to_idx)}"
    )

# %% [markdown]
# ## Step 4: Train Random Forest Model

# %%
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight

# Calculate class weights (like giving more attention to underrepresented players)
classes_present = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes_present, y=y_train)
# Key each weight by its actual label value rather than by its position in the
# array: the two only coincide when the labels are exactly 0..n-1 with no gaps.
class_weight_dict = {
    int(label): float(weight)
    for label, weight in zip(classes_present, class_weights)
}

# Fit a 150-tree forest on the training features, using all CPU cores and a
# fixed random_state.
rf = RandomForestClassifier(
    n_estimators=150,
    class_weight=class_weight_dict,
    n_jobs=-1,
    random_state=42,
    verbose=1
)
rf.fit(X_train, y_train)

# %% [markdown]
# ## Step 5: Evaluate Model

# %%
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the validation features and report per-class metrics.
y_pred = rf.predict(X_val)
idx_to_class = {v: k for k, v in class_to_idx.items()}
# Pass the full label list explicitly: without it, a class missing from both
# y_val and y_pred makes target_names disagree with the labels that are
# reported, and the confusion-matrix rows lose their fixed class order.
label_ids = sorted(idx_to_class)
print(classification_report(
    y_val,
    y_pred,
    labels=label_ids,
    target_names=[idx_to_class[i] for i in label_ids],
))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred, labels=label_ids))

# %% [markdown]
# ## Step 6: Save Model

# %%
import joblib
# Serialize the fitted forest to disk in the current working directory.
joblib.dump(value=rf, filename="hybrid_rf_model_training4.pkl")


Copying files: 41628 files [00:04, 9516.66 files/s] 
