# Lựa chọn feature


In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
import numpy as np


# Number of folds used by the KFold splitters in this notebook.
k_fold = 8
# Shared seed, passed as random_state to the models and KFold splitters.
seed = 42
# Also seed NumPy's global random generator.
np.random.seed(seed)

## 1. Load data


In [2]:
# Load the match table and keep only the columns needed for feature selection:
# the target ("result") followed by the candidate features.
matches_df = pd.read_csv("../training_data/arsenal.csv").loc[
    :,
    [
        "result",
        "venue",
        "win_before_team1",
        "win_before_team2",
        "draw_before_team1",
        "draw_before_team2",
        "opponent",
        "history_team1_win_team2",
        "history_team1_draw_team2",
        "history_team1_lose_team2",
        "is_opponent_big6",
    ],
]

In [3]:
# Sanity check: (rows, columns) of the frame after the column selection.
matches_df.shape

(380, 11)

In [4]:
# Count how many rows carry each value of the target column.
matches_df["result"].value_counts()

result
W    209
L     95
D     76
Name: count, dtype: int64

## 2. Dùng mô hình để lựa chọn feature


In [5]:
# Separate the target label from the candidate features.
X = matches_df.drop(columns="result")
y = matches_df["result"]

# Identify the numeric and the categorical columns.
# "number" matches every numeric dtype rather than only int64/float64, so a column
# stored as e.g. int32 or float32 is not silently left out of the preprocessing.
numerical_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(include=["object", "category"]).columns

numerical_cols, categorical_cols

(Index(['win_before_team1', 'win_before_team2', 'draw_before_team1',
        'draw_before_team2', 'history_team1_win_team2',
        'history_team1_draw_team2', 'history_team1_lose_team2',
        'is_opponent_big6'],
       dtype='object'),
 Index(['venue', 'opponent'], dtype='object'))

In [6]:
# Build the ColumnTransformer: the "num" branch applies StandardScaler to the
# numeric columns and the "cat" branch applies OneHotEncoder to the categorical ones.
# NOTE(review): handle_unknown="ignore" presumably keeps categories unseen during
# fit from raising at transform time — confirm against the scikit-learn docs.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

### Định nghĩa Pipelines


In [7]:
# Define the base models.
# `logit` is the "classifier" step of every pipeline defined in this cell.
logit = LogisticRegression(random_state=seed)

# Random forest; in this cell it is only wrapped by SelectFromModel (see pipe_rf).
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=seed,
)

# L1-penalised LinearSVC; in this cell it is only wrapped by SelectFromModel (see pipe_svc).
svc = LinearSVC(
    C=0.05,
    penalty="l1",
    dual=False,
    random_state=seed,
)

# Build the pipelines that chain the preprocessor with the model steps.
# NOTE(review): the two "*_without_fs" baselines are built from identical steps
# (preprocessor + logit) and contain neither rf nor svc, so they describe one and
# the same baseline — confirm this is intended.
pipe_rf_without_fs = Pipeline([("preprocessor", preprocessor), ("classifier", logit)])

pipe_svc_without_fs = Pipeline([("preprocessor", preprocessor), ("classifier", logit)])

# preprocessor -> SelectFromModel(rf) -> logit
pipe_rf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("feature_selection", SelectFromModel(rf)),
        ("classifier", logit),
    ]
)

# preprocessor -> SelectFromModel(svc) -> logit
pipe_svc = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("feature_selection", SelectFromModel(svc)),
        ("classifier", logit),
    ]
)

### Huấn luyện và Đánh giá các Pipelines


In [8]:
def _cv_accuracy(pipeline):
    """Return the per-fold accuracy scores of ``pipeline`` on the global (X, y).

    Every call uses a fresh shuffled KFold with ``k_fold`` splits and
    ``random_state=seed``; ``error_score="raise"`` is passed to cross_val_score.
    """
    return cross_val_score(
        pipeline,
        X,
        y,
        cv=KFold(n_splits=k_fold, shuffle=True, random_state=seed),
        scoring="accuracy",
        error_score="raise",
    )


# Train and evaluate the RandomForest-selection pipeline and its baseline.
scores_rf_without_fs = _cv_accuracy(pipe_rf_without_fs)
scores_rf = _cv_accuracy(pipe_rf)

# Train and evaluate the SVC-selection pipeline and its baseline.
scores_svc_without_fs = _cv_accuracy(pipe_svc_without_fs)
scores_svc = _cv_accuracy(pipe_svc)

In [9]:
# Report the mean cross-validated accuracy of each pipeline, one line per pipeline.
print(
    "\n".join(
        f"{label} {fold_scores.mean()!s}"
        for label, fold_scores in (
            (
                "Độ chính xác trung bình của RandomForest pipeline không sử dụng feature selection:",
                scores_rf_without_fs,
            ),
            (
                "Độ chính xác trung bình của RandomForest pipeline có sử dụng feature selection:",
                scores_rf,
            ),
            (
                "Độ chính xác trung bình của SVC pipeline không sử dụng feature selection:",
                scores_svc_without_fs,
            ),
            (
                "Độ chính xác trung bình của SVC pipeline có sử dụng feature selection:",
                scores_svc,
            ),
        )
    )
)

Độ chính xác trung bình của RandomForest pipeline không sử dụng feature selection: 0.5686502659574468
Độ chính xác trung bình của RandomForest pipeline có sử dụng feature selection: 0.5844414893617021
Độ chính xác trung bình của SVC pipeline không sử dụng feature selection: 0.5686502659574468
Độ chính xác trung bình của SVC pipeline có sử dụng feature selection: 0.5924202127659575


In [10]:
def clean_feature_names(feature_names, categorical_columns=None):
    """Map transformed feature names back to original column names.

    Each name is split at its first ``"__"`` into ``<prefix>__<base>``:

    * no ``"__"``: the name is kept unchanged;
    * prefix ``"num"``: ``<base>`` is kept unchanged;
    * any other prefix: ``<base>`` is reduced to a column name by, in order,
      (1) the longest entry of ``categorical_columns`` that equals ``<base>`` or
      is followed by ``"_"`` in it, (2) the first of ``"date_time"``,
      ``"formation_team1"``, ``"formation_team2"`` contained in ``<base>``,
      (3) the text before the first ``"_"`` of ``<base>`` (all of ``<base>``
      when it has no ``"_"``).

    Args:
        feature_names: Iterable of feature-name strings.
        categorical_columns: Optional iterable of original categorical column
            names. Pass it when such a column has an underscore in its own
            name; without it rule (3) would cut the name at that underscore.

    Returns:
        list[str]: The distinct cleaned names, in order of first appearance.
    """
    # Longest names first, so "home_team" wins over a shorter column "home".
    known_columns = (
        []
        if categorical_columns is None
        else sorted((str(col) for col in categorical_columns), key=len, reverse=True)
    )
    special_columns = ("date_time", "formation_team1", "formation_team2")

    # Dict keys de-duplicate while keeping insertion order, which makes the
    # returned list deterministic (a set would not guarantee any order).
    cleaned_names = {}
    for name in feature_names:
        # partition() splits on the first "__" only, so a base name that itself
        # contains "__" is not truncated.
        prefix, separator, base_name = name.partition("__")
        if not separator:
            # No prefix: keep the name as is.
            cleaned = name
        elif prefix == "num":
            # Numeric columns are passed through under their own name.
            cleaned = base_name
        else:
            cleaned = next(
                (
                    col
                    for col in known_columns
                    if base_name == col or base_name.startswith(col + "_")
                ),
                None,
            )
            if cleaned is None:
                cleaned = next(
                    (col for col in special_columns if col in base_name), None
                )
            if cleaned is None:
                # Keep only the part before the first "_"; split() returns the
                # whole string when there is no "_".
                cleaned = base_name.split("_")[0]
        cleaned_names[cleaned] = None
    return list(cleaned_names)

In [11]:
def find_best_list_features(estimator, n_splits=k_fold):
    """Fit ``estimator`` on each KFold split of the global (X, y) and report the
    features kept by its "feature_selection" step.

    The estimator is fitted in place on every training fold and scored on the
    matching test fold. Per-fold accuracies, their mean, and the selected
    feature names of the highest-accuracy fold are printed.

    Args:
        estimator: A pipeline exposing ``named_steps["preprocessor"]`` (with
            ``get_feature_names_out()``) and ``named_steps["feature_selection"]``
            (with ``get_support()``).
        n_splits: Number of KFold splits (shuffled, ``random_state=seed``).

    Returns:
        list[dict]: One entry per fold with keys ``"feature_names"`` (all
        transformed feature names of that fold) and ``"best_features"`` (the
        names selected in that fold, as a list).
    """
    kf = KFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=seed,
    )

    best_accuracy = 0
    best_feature_names = None

    results = []
    fold_accuracies = []

    for loop_idx, (train_index, test_index) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        estimator.fit(X_train, y_train)
        accuracy = estimator.score(X_test, y_test)
        fold_accuracies.append(accuracy)

        print(f"Accuracy at loop {loop_idx}: {accuracy}")

        transformed_feature_names = estimator.named_steps[
            "preprocessor"
        ].get_feature_names_out()
        support_mask = estimator.named_steps["feature_selection"].get_support()
        # Resolve the mask against THIS fold's feature names. The preprocessor is
        # refitted per fold, so a mask must not be applied to another fold's names.
        selected_feature_names = transformed_feature_names[support_mask]

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_feature_names = selected_feature_names

        results.append(
            {
                "feature_names": transformed_feature_names,
                "best_features": list(selected_feature_names),
            }
        )

    accuracy_mean = sum(fold_accuracies) / n_splits
    print(f"Accuracy mean: {accuracy_mean}")
    print(f"Best features with max accuracy: {best_feature_names}")

    return results

In [12]:
# Run the per-fold feature-selection report for the pipeline that selects via RandomForest.
rf_list_best_features = find_best_list_features(pipe_rf)

Accuracy at loop 1: 0.4583333333333333
Accuracy at loop 2: 0.6458333333333334
Accuracy at loop 3: 0.5625
Accuracy at loop 4: 0.5833333333333334
Accuracy at loop 5: 0.46808510638297873
Accuracy at loop 6: 0.6170212765957447
Accuracy at loop 7: 0.6808510638297872
Accuracy at loop 8: 0.6595744680851063
Accuracy mean: 0.584441489361702
Best features with max accuracy: ['num__win_before_team1' 'num__win_before_team2' 'num__draw_before_team1'
 'num__draw_before_team2' 'num__history_team1_win_team2'
 'num__history_team1_draw_team2' 'num__history_team1_lose_team2'
 'num__is_opponent_big6' 'cat__venue_Away' 'cat__venue_Home']


In [13]:
# Print the features selected in each fold. A plain loop is used instead of a list
# comprehension so the cell does not build and display a list of None values.
for fold_result in rf_list_best_features:
    print(fold_result["best_features"])

['num__win_before_team1', 'num__win_before_team2', 'num__draw_before_team1', 'num__draw_before_team2', 'num__history_team1_win_team2', 'num__history_team1_draw_team2', 'num__history_team1_lose_team2', 'num__is_opponent_big6', 'cat__venue_Away', 'cat__venue_Home']
['num__win_before_team1', 'num__win_before_team2', 'num__draw_before_team1', 'num__draw_before_team2', 'num__history_team1_win_team2', 'num__history_team1_draw_team2', 'num__history_team1_lose_team2', 'num__is_opponent_big6', 'cat__venue_Away', 'cat__venue_Home']
['num__win_before_team1', 'num__win_before_team2', 'num__draw_before_team1', 'num__draw_before_team2', 'num__history_team1_win_team2', 'num__history_team1_draw_team2', 'num__history_team1_lose_team2', 'num__is_opponent_big6', 'cat__venue_Away', 'cat__venue_Home', 'cat__opponent_Manchester City']
['num__win_before_team1', 'num__win_before_team2', 'num__draw_before_team1', 'num__draw_before_team2', 'num__history_team1_win_team2', 'num__history_team1_draw_team2', 'num__h

[None, None, None, None, None, None, None, None]

In [14]:
# Run the per-fold feature-selection report for the pipeline that selects via LinearSVC.
svc_list_best_features = find_best_list_features(pipe_svc)

Accuracy at loop 1: 0.4583333333333333
Accuracy at loop 2: 0.6458333333333334
Accuracy at loop 3: 0.5625
Accuracy at loop 4: 0.5833333333333334
Accuracy at loop 5: 0.48936170212765956
Accuracy at loop 6: 0.6170212765957447
Accuracy at loop 7: 0.6808510638297872
Accuracy at loop 8: 0.7021276595744681
Accuracy mean: 0.5924202127659575
Best features with max accuracy: ['num__win_before_team1' 'num__draw_before_team1' 'num__draw_before_team2'
 'num__history_team1_draw_team2' 'num__history_team1_lose_team2'
 'num__is_opponent_big6' 'cat__venue_Away' 'cat__venue_Home']


In [15]:
# Print the features selected in each fold, one list per line.
for fold_result in svc_list_best_features:
    print(fold_result["best_features"])

['num__win_before_team1', 'num__win_before_team2', 'num__draw_before_team1', 'num__draw_before_team2', 'num__history_team1_win_team2', 'num__history_team1_draw_team2', 'num__history_team1_lose_team2', 'num__is_opponent_big6', 'cat__venue_Away', 'cat__venue_Home']
['num__win_before_team1', 'num__win_before_team2', 'num__draw_before_team1', 'num__draw_before_team2', 'num__history_team1_win_team2', 'num__history_team1_draw_team2', 'num__history_team1_lose_team2', 'num__is_opponent_big6', 'cat__venue_Away', 'cat__venue_Home']
['num__win_before_team1', 'num__draw_before_team1', 'num__draw_before_team2', 'num__history_team1_win_team2', 'num__history_team1_draw_team2', 'num__history_team1_lose_team2', 'num__is_opponent_big6', 'cat__venue_Away', 'cat__venue_Home', 'cat__opponent_Manchester City']
['num__win_before_team1', 'num__draw_before_team1', 'num__draw_before_team2', 'num__history_team1_draw_team2', 'num__history_team1_lose_team2', 'num__is_opponent_big6', 'cat__venue_Away', 'cat__venue_

In [16]:
def show_list_features_removed(list_features):
    """Print the columns of the global ``X`` that are missing from ``list_features``."""
    features_not_in_list = []
    for column in X.columns:
        if column in list_features:
            continue
        features_not_in_list.append(column)

    # Print the features that are not in the given list.
    print("Features bị loại bỏ:", features_not_in_list)

### Hiển thị các đặc trưng được chọn bởi Random forest và SVC sau khi làm sạch tiền tố num, cat trong đặc trưng

In [17]:
print("Các đặc trưng được lựa chọn bởi Random forest sau khi làm sạch:")

# enumerate() supplies the fold number. A dedicated name is used so this cell does
# not overwrite the module-level `idx` counter that evaluate_svc() increments.
for fold_idx, fold_result in enumerate(rf_list_best_features):
    print(f"Loop {fold_idx}")
    cleaned_features_rf = clean_feature_names(fold_result["best_features"])
    print(cleaned_features_rf)
    show_list_features_removed(cleaned_features_rf)
    print("=====================================")

Các đặc trưng được lựa chọn bởi Random forest sau khi làm sạch:
Loop 0
['is_opponent_big6', 'draw_before_team2', 'venue', 'draw_before_team1', 'win_before_team2', 'history_team1_win_team2', 'history_team1_draw_team2', 'win_before_team1', 'history_team1_lose_team2']
Features bị loại bỏ: ['opponent']
Loop 1
['is_opponent_big6', 'draw_before_team2', 'venue', 'draw_before_team1', 'win_before_team2', 'history_team1_win_team2', 'history_team1_draw_team2', 'win_before_team1', 'history_team1_lose_team2']
Features bị loại bỏ: ['opponent']
Loop 2
['is_opponent_big6', 'draw_before_team2', 'venue', 'draw_before_team1', 'win_before_team2', 'opponent', 'history_team1_win_team2', 'history_team1_draw_team2', 'win_before_team1', 'history_team1_lose_team2']
Features bị loại bỏ: []
Loop 3
['is_opponent_big6', 'draw_before_team2', 'venue', 'draw_before_team1', 'win_before_team2', 'opponent', 'history_team1_win_team2', 'history_team1_draw_team2', 'win_before_team1', 'history_team1_lose_team2']
Features bị 

In [18]:
print("Các đặc trưng được lựa chọn bởi SVC sau khi làm sạch:")

# enumerate() supplies the fold number. A dedicated name is used so this cell does
# not overwrite the module-level `idx` counter that evaluate_svc() increments.
for fold_idx, fold_result in enumerate(svc_list_best_features):
    print(f"Loop {fold_idx}")
    cleaned_features_svc = clean_feature_names(fold_result["best_features"])
    print(cleaned_features_svc)
    show_list_features_removed(cleaned_features_svc)
    print("=====================================")

Các đặc trưng được lựa chọn bởi SVC sau khi làm sạch:
Loop 0
['is_opponent_big6', 'draw_before_team2', 'venue', 'draw_before_team1', 'win_before_team2', 'history_team1_win_team2', 'history_team1_draw_team2', 'win_before_team1', 'history_team1_lose_team2']
Features bị loại bỏ: ['opponent']
Loop 1
['is_opponent_big6', 'draw_before_team2', 'venue', 'draw_before_team1', 'win_before_team2', 'history_team1_win_team2', 'history_team1_draw_team2', 'win_before_team1', 'history_team1_lose_team2']
Features bị loại bỏ: ['opponent']
Loop 2
['is_opponent_big6', 'draw_before_team2', 'venue', 'draw_before_team1', 'opponent', 'history_team1_win_team2', 'history_team1_draw_team2', 'win_before_team1', 'history_team1_lose_team2']
Features bị loại bỏ: ['win_before_team2']
Loop 3
['is_opponent_big6', 'draw_before_team2', 'venue', 'draw_before_team1', 'opponent', 'history_team1_draw_team2', 'win_before_team1', 'history_team1_lose_team2']
Features bị loại bỏ: ['win_before_team2', 'history_team1_win_team2']
Lo

### Thử nghiệm lại với các cột đã chọn


### 1. Random Forest

In [19]:
# Feature list for the RandomForest re-test: every column of X except "opponent".
cleaned_features_rf = X.columns.drop("opponent").tolist()
cleaned_features_rf

['venue',
 'win_before_team1',
 'win_before_team2',
 'draw_before_team1',
 'draw_before_team2',
 'history_team1_win_team2',
 'history_team1_draw_team2',
 'history_team1_lose_team2',
 'is_opponent_big6']

In [20]:
# Re-run the RandomForest-selection pipeline on the reduced feature set.
X_new = X[cleaned_features_rf]
# Keep only the numeric / categorical column names that are still present in X_new.
numerical_cols_new = numerical_cols[numerical_cols.isin(X_new.columns)]
categorical_cols_new = categorical_cols[categorical_cols.isin(X_new.columns)]

# Same two-branch preprocessing as before, restricted to the remaining columns.
preprocessor_new = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols_new),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols_new),
    ]
)

# preprocessor_new -> SelectFromModel(rf) -> logit
pipe_rf_new = Pipeline(
    [
        ("preprocessor", preprocessor_new),
        ("feature_selection", SelectFromModel(rf)),
        ("classifier", logit),
    ]
)

# NOTE(review): this rebinds `scores_rf`, which the earlier cross-validation cell
# also assigns; re-running the earlier print cell afterwards would show these
# scores instead — consider a distinct name.
scores_rf = cross_val_score(
    pipe_rf_new,
    X_new,
    y,
    cv=KFold(n_splits=k_fold, shuffle=True, random_state=seed),
    scoring="accuracy",
    error_score="raise",
)

# Show the per-fold scores, then display their mean as the cell output.
print(scores_rf)
scores_rf.mean()

[0.47916667 0.5625     0.54166667 0.5625     0.55319149 0.57446809
 0.55319149 0.59574468]


0.5528036347517731

### 2. SVC

In [21]:
idx = 0  # module-level run counter, incremented by every evaluate_svc() call


def evaluate_svc(cleaned_features) -> float:
    """Cross-validate the SVC-selection pipeline on a subset of the columns of X.

    Builds a preprocessor restricted to ``cleaned_features``, chains it with
    ``SelectFromModel(svc)`` and ``logit``, and scores the pipeline with a
    shuffled KFold (``k_fold`` splits, ``random_state=seed``). The run number,
    the per-fold scores and their mean are printed.

    Args:
        cleaned_features: Column names of the global ``X`` to evaluate.

    Returns:
        float: Mean accuracy over the folds.

    Side effects:
        Increments the module-level counter ``idx`` by one per call.
    """
    global idx
    X_new = X[cleaned_features]
    numerical_cols_new = numerical_cols[numerical_cols.isin(X_new.columns)]
    categorical_cols_new = categorical_cols[categorical_cols.isin(X_new.columns)]

    # Local names differ from the module-level `preprocessor` / `pipe_svc` on
    # purpose, so the function does not shadow them.
    subset_preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numerical_cols_new),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols_new),
        ]
    )

    subset_pipeline = Pipeline(
        [
            ("preprocessor", subset_preprocessor),
            ("feature_selection", SelectFromModel(svc)),
            ("classifier", logit),
        ]
    )

    fold_scores = cross_val_score(
        subset_pipeline,
        X_new,
        y,
        cv=KFold(
            n_splits=k_fold,
            shuffle=True,
            random_state=seed,
        ),
        scoring="accuracy",
        error_score="raise",
    )
    mean_score = fold_scores.mean()

    print("Loop:", idx)
    idx += 1
    print(fold_scores)
    print("Score mean:", mean_score)
    print("==========================================================================")
    return mean_score

In [22]:
max_score = 0
list_best_features = []
# Keys of the feature sets already recorded for the current best score.
unique_list = set()

for best_features in svc_list_best_features:
    cleaned_features_svc = clean_feature_names(best_features["best_features"])
    score = evaluate_svc(cleaned_features_svc)
    # Sort before building the key: the same set of features listed in a different
    # order must count as a duplicate.
    feature_key = tuple(sorted(cleaned_features_svc))
    if score > max_score:
        # New best score: restart the collection with this feature set.
        max_score = score
        list_best_features = [cleaned_features_svc]
        unique_list = {feature_key}
    elif score == max_score and feature_key not in unique_list:
        # Tie with the best score: keep it only if this feature set is new.
        unique_list.add(feature_key)
        list_best_features.append(cleaned_features_svc)

print("Accuracy mean:", max_score)

Loop: 0
[0.45833333 0.64583333 0.5625     0.58333333 0.4893617  0.61702128
 0.68085106 0.70212766]
Score mean: 0.5924202127659575
Loop: 1
[0.45833333 0.64583333 0.5625     0.58333333 0.4893617  0.61702128
 0.68085106 0.70212766]
Score mean: 0.5924202127659575
Loop: 2
[0.41666667 0.64583333 0.5625     0.58333333 0.4893617  0.61702128
 0.70212766 0.70212766]
Score mean: 0.5898714539007093
Loop: 3
[0.41666667 0.64583333 0.5625     0.58333333 0.5106383  0.61702128
 0.68085106 0.70212766]
Score mean: 0.5898714539007093
Loop: 4
[0.41666667 0.64583333 0.5625     0.58333333 0.4893617  0.61702128
 0.70212766 0.70212766]
Score mean: 0.5898714539007093
Loop: 5
[0.41666667 0.64583333 0.5625     0.58333333 0.5106383  0.61702128
 0.68085106 0.70212766]
Score mean: 0.5898714539007093
Loop: 6
[0.45833333 0.64583333 0.5625     0.58333333 0.4893617  0.61702128
 0.68085106 0.70212766]
Score mean: 0.5924202127659575
Loop: 7
[0.41666667 0.64583333 0.5625     0.58333333 0.5106383  0.61702128
 0.68085106 0.7

In [23]:
# Show every feature list that reached the best mean score.
print("List best features of SVC:")
print(list_best_features)

# For each of those lists, print the columns of X it leaves out.
for best_features in list_best_features:
    show_list_features_removed(best_features)

List best features of SVC:
[['is_opponent_big6', 'draw_before_team2', 'venue', 'draw_before_team1', 'win_before_team2', 'history_team1_win_team2', 'history_team1_draw_team2', 'win_before_team1', 'history_team1_lose_team2']]
Features bị loại bỏ: ['opponent']


### Kết luận
Vậy sau bước lựa chọn feature:
- Random Forest: 
    + Ở một số fold, mô hình loại bỏ toàn bộ cột Opponent; ở một số fold khác, mô hình loại bỏ gần hết các giá trị one-hot của cột Opponent và chỉ giữ lại cột Opponent_Manchester City.
    Với trường hợp chỉ giữ lại cột Opponent_Manchester City thì có thể xem xét việc `loại luôn cột Opponent này`, vì khi đó Opponent là biến category chỉ còn 1 giá trị duy nhất; hơn nữa,
    thông tin của nó đã có thể được thay thế bằng cột is_opponent_big6. Vì vậy sẽ loại bỏ cột Opponent.
- SVC: cũng loại bỏ cột Opponent
