# Lựa chọn feature


In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC

warnings.filterwarnings("ignore")

k_fold = 10  # number of cross-validation folds (used as n_splits by the splitters below)
seed = 42  # shared random_state for the splitters and estimators in this notebook
np.random.seed(seed)  # also seed NumPy's legacy global RNG

## 1. Load data


In [2]:
# Columns kept for the feature-selection experiments (target column first).
selected_columns = [
    "result",
    "venue",
    "win_before_team2",
    "draw_before_team1",
    "lose_before_team2",
    "opponent",
    "history_team1_win_team2",
    "history_team1_draw_team2",
    "history_team1_lose_team2",
    "is_opponent_big6",
]

matches_df = pd.read_csv("../../../feature_engineering_data/manchester_united.csv")
matches_df = matches_df[selected_columns]

# Settings shared by the shuffled, seeded splitters.
cv_info = dict(n_splits=k_fold, shuffle=True, random_state=seed)

kfold_cv = KFold(**cv_info)
stratify_cv = StratifiedKFold(**cv_info)
repeated_stratify_cv = RepeatedStratifiedKFold(
    n_splits=k_fold,
    n_repeats=3,
    random_state=seed,
)

# Splitter picked up by the later cross-validation cells.
applied_cv = repeated_stratify_cv

In [3]:
# Sanity check: (rows, columns) of the frame after the column selection.
matches_df.shape

(874, 10)

In [4]:
# Class distribution of the target column.
matches_df["result"].value_counts()

result
W    530
D    183
L    161
Name: count, dtype: int64

## 2. Dùng mô hình để lựa chọn feature


In [5]:
# Separate the target from the predictors.
y = matches_df["result"]
X = matches_df.drop(columns=["result"])

# Column groups by dtype: numeric columns vs. categorical columns.
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object", "category"]).columns

numerical_cols, categorical_cols

(Index(['win_before_team2', 'draw_before_team1', 'lose_before_team2',
        'history_team1_win_team2', 'history_team1_draw_team2',
        'history_team1_lose_team2', 'is_opponent_big6'],
       dtype='object'),
 Index(['venue', 'opponent'], dtype='object'))

In [6]:
# Standardise the numeric columns and one-hot encode the categorical ones.
# The step names "num" / "cat" become the "num__" / "cat__" prefixes of the
# encoded feature names and are looked up by name in later cells.
column_steps = [
    ("num", StandardScaler(), numerical_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

### Định nghĩa Pipelines


In [7]:
# Base estimators. `logit` is the final classifier of every pipeline; `rf` and
# `svc` are only used inside SelectFromModel to rank/select features.
logit = LogisticRegression(random_state=seed)

rf = RandomForestClassifier(
    n_estimators=100,
    random_state=seed,
)

svc = LinearSVC(
    C=0.05,
    penalty="l1",
    dual=False,
    random_state=seed,
)

# Pipeline does not clone its steps, so passing the same `preprocessor` / `logit`
# objects to several pipelines would make them share fitted state (fitting one
# pipeline would silently refit the others' steps). Each pipeline therefore gets
# its own unfitted clones.

# Baselines without a feature-selection step. Both are preprocessor + logistic
# regression; the two names are kept so the RF and SVC comparisons read in parallel.
pipe_rf_without_fs = Pipeline(
    [
        ("preprocessor", clone(preprocessor)),
        ("classifier", clone(logit)),
    ]
)

pipe_svc_without_fs = Pipeline(
    [
        ("preprocessor", clone(preprocessor)),
        ("classifier", clone(logit)),
    ]
)

# Same classifier, with a model-based selector between preprocessing and fitting.
pipe_rf = Pipeline(
    [
        ("preprocessor", clone(preprocessor)),
        ("feature_selection", SelectFromModel(rf)),
        ("classifier", clone(logit)),
    ]
)

pipe_svc = Pipeline(
    [
        ("preprocessor", clone(preprocessor)),
        ("feature_selection", SelectFromModel(svc)),
        ("classifier", clone(logit)),
    ]
)

### Huấn luyện và Đánh giá các Pipelines


In [8]:
def _shuffled_kfold_accuracy(pipeline):
    """Return the per-fold accuracies of ``pipeline`` on (X, y) with a seeded, shuffled K-fold."""
    splitter = KFold(
        n_splits=k_fold,
        shuffle=True,
        random_state=seed,
    )
    return cross_val_score(
        pipeline,
        X,
        y,
        cv=splitter,
        scoring="accuracy",
        error_score="raise",
    )


# Baseline vs. RandomForest-based feature selection.
scores_rf_without_fs = _shuffled_kfold_accuracy(pipe_rf_without_fs)
scores_rf = _shuffled_kfold_accuracy(pipe_rf)

# Baseline vs. LinearSVC-based feature selection.
scores_svc_without_fs = _shuffled_kfold_accuracy(pipe_svc_without_fs)
scores_svc = _shuffled_kfold_accuracy(pipe_svc)

In [9]:
# Mean accuracy of each pipeline, printed in the same order they were evaluated.
summary_rows = (
    ("Độ chính xác trung bình của RandomForest pipeline không sử dụng feature selection:", scores_rf_without_fs),
    ("Độ chính xác trung bình của RandomForest pipeline có sử dụng feature selection:", scores_rf),
    ("Độ chính xác trung bình của SVC pipeline không sử dụng feature selection:", scores_svc_without_fs),
    ("Độ chính xác trung bình của SVC pipeline có sử dụng feature selection:", scores_svc),
)
for label, fold_scores in summary_rows:
    print(label, fold_scores.mean())

Độ chính xác trung bình của RandomForest pipeline không sử dụng feature selection: 0.6099529780564263
Độ chính xác trung bình của RandomForest pipeline có sử dụng feature selection: 0.6203369905956113
Độ chính xác trung bình của SVC pipeline không sử dụng feature selection: 0.6099529780564263
Độ chính xác trung bình của SVC pipeline có sử dụng feature selection: 0.6168364681295716


In [10]:
def _applied_cv_accuracy(pipeline):
    """Return the per-fold accuracies of ``pipeline`` on (X, y) under ``applied_cv``."""
    return cross_val_score(
        pipeline,
        X,
        y,
        cv=applied_cv,
        scoring="accuracy",
        error_score="raise",
    )


# Baseline vs. RandomForest-based feature selection.
scores_rf_without_fs = _applied_cv_accuracy(pipe_rf_without_fs)
scores_rf = _applied_cv_accuracy(pipe_rf)

# Baseline vs. LinearSVC-based feature selection.
scores_svc_without_fs = _applied_cv_accuracy(pipe_svc_without_fs)
scores_svc = _applied_cv_accuracy(pipe_svc)

# Report the mean accuracy of each pipeline.
for label, fold_scores in (
    (
        "Độ chính xác trung bình của RandomForest pipeline không sử dụng feature selection:",
        scores_rf_without_fs,
    ),
    (
        "Độ chính xác trung bình của RandomForest pipeline có sử dụng feature selection:",
        scores_rf,
    ),
    (
        "Độ chính xác trung bình của SVC pipeline không sử dụng feature selection:",
        scores_svc_without_fs,
    ),
    (
        "Độ chính xác trung bình của SVC pipeline có sử dụng feature selection:",
        scores_svc,
    ),
):
    print(label, fold_scores.mean())

Độ chính xác trung bình của RandomForest pipeline không sử dụng feature selection: 0.6121560431905259
Độ chính xác trung bình của RandomForest pipeline có sử dụng feature selection: 0.6144287704632532
Độ chính xác trung bình của SVC pipeline không sử dụng feature selection: 0.6121560431905259
Độ chính xác trung bình của SVC pipeline có sử dụng feature selection: 0.6155825496342737


#### Hàm dùng để làm sạch tên feature sau bước encode

In [11]:
def clean_feature_names(feature_names):
    """Strip the transformer prefix from encoded feature names and de-duplicate them.

    Names are expected in the ``"<transformer>__<feature>"`` form. Everything
    before the first ``"__"`` is dropped. For non-``"num"`` transformers, any
    name containing ``date_time``, ``formation_team1`` or ``formation_team2``
    is collapsed to that group name; other names (e.g. ``opponent_Arsenal``)
    are kept in full. Names without a ``"__"`` separator are returned unchanged.

    Args:
        feature_names: Iterable of encoded feature-name strings.

    Returns:
        list: Unique cleaned names in first-seen order, so the result is
        reproducible across kernel restarts (a plain ``set`` would not be).
    """
    collapsed_groups = ("date_time", "formation_team1", "formation_team2")

    # A dict is used as an insertion-ordered set.
    cleaned_names = {}
    for name in feature_names:
        # Split on the FIRST "__" only, so a feature whose own name contains
        # "__" is not truncated.
        prefix, separator, base_name = name.partition("__")
        if not separator:
            # No transformer prefix: keep the name as-is.
            cleaned_names[name] = None
            continue

        if prefix != "num":
            # Collapse one-hot columns of these groups to a single entry.
            for group in collapsed_groups:
                if group in base_name:
                    base_name = group
                    break

        cleaned_names[base_name] = None

    return list(cleaned_names)

#### Hàm xác định danh sách feature được chọn bằng Model Selection kết hợp cross validation

In [30]:
def find_best_list_features(estimator, n_splits=k_fold):
    """Fit ``estimator`` on each K-fold split and record the features it selects.

    The notebook-level ``X`` and ``y`` are split with a seeded, shuffled
    ``KFold``. For every fold the pipeline is fitted on the training part and
    scored on the held-out part; the per-fold accuracy, the mean accuracy, and
    the feature list of the best-scoring fold are printed.

    Note: ``estimator`` is fitted in place (it is not cloned), so after the call
    it holds the state of the last fold.

    Args:
        estimator: Pipeline with steps named ``"preprocessor"`` (must provide
            ``get_feature_names_out``) and ``"feature_selection"`` (must provide
            ``get_support``).
        n_splits: Number of folds.

    Returns:
        list[dict]: One entry per fold with keys ``"feature_names"`` (all
        encoded feature names of that fold) and ``"best_features"`` (cleaned
        names of the features selected in that fold).
    """
    kf = KFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=seed,
    )

    # Start below any achievable accuracy so the first fold always becomes the
    # current best (even if it scores 0).
    best_accuracy = float("-inf")
    best_feature_names = None

    results = []
    accuracy_mean = 0

    for loop_idx, (train_index, test_index) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        estimator.fit(X_train, y_train)
        accuracy = estimator.score(X_test, y_test)
        accuracy_mean += accuracy

        print(f"Accuracy at loop {loop_idx}: {accuracy}")

        fold_feature_names = estimator.named_steps[
            "preprocessor"
        ].get_feature_names_out()
        fold_mask = estimator.named_steps["feature_selection"].get_support()
        fold_selected_names = fold_feature_names[fold_mask]

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            # Keep the NAMES of the best fold, not just its mask: the encoder is
            # refitted per fold, so a mask from one fold is not guaranteed to
            # line up with the feature names of another fold.
            best_feature_names = fold_selected_names

        results.append(
            {
                "feature_names": fold_feature_names,
                "best_features": clean_feature_names(fold_selected_names),
            }
        )

    accuracy_mean /= n_splits
    print(f"Accuracy mean: {accuracy_mean}")
    print(
        "=> Best features with max accuracy: \n",
        "\n".join(
            str(feature)
            for feature in sorted(clean_feature_names(best_feature_names))
        ),
        sep=""
    )

    return results

In [31]:
# Per-fold feature selection with the RandomForest-based selector pipeline.
rf_list_best_features = find_best_list_features(pipe_rf)

Accuracy at loop 1: 0.5454545454545454
Accuracy at loop 2: 0.6022727272727273
Accuracy at loop 3: 0.6022727272727273
Accuracy at loop 4: 0.5568181818181818
Accuracy at loop 5: 0.632183908045977
Accuracy at loop 6: 0.6551724137931034
Accuracy at loop 7: 0.6436781609195402
Accuracy at loop 8: 0.6436781609195402
Accuracy at loop 9: 0.6206896551724138
Accuracy at loop 10: 0.7011494252873564
Accuracy mean: 0.6203369905956113
=> Best features with max accuracy: 
draw_before_team1
history_team1_draw_team2
history_team1_lose_team2
history_team1_win_team2
is_opponent_big6
lose_before_team2
venue_Home
win_before_team2


#### In danh sách các feature được chọn

In [14]:
# One line per fold: the cleaned names of the features selected in that fold.
for fold_result in rf_list_best_features:
    print(fold_result["best_features"])

['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'history_team1_win_team2', 'venue_Home', 'history_team1_draw_team2', 'is_opponent_big6', 'history_team1_lose_team2']
['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'history_team1_win_team2', 'venue_Away', 'history_team1_draw_team2', 'is_opponent_big6', 'history_team1_lose_team2']
['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'history_team1_win_team2', 'venue_Away', 'venue_Home', 'history_team1_draw_team2', 'is_opponent_big6', 'history_team1_lose_team2']
['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'history_team1_win_team2', 'venue_Away', 'history_team1_draw_team2', 'is_opponent_big6', 'history_team1_lose_team2']
['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'history_team1_win_team2', 'venue_Away', 'venue_Home', 'history_team1_draw_team2', 'is_opponent_big6', 'history_team1_lose_team2']
['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'history_

#### Tìm danh sách đặc trưng được chọn bởi SVC

In [15]:
# Per-fold feature selection with the LinearSVC-based selector pipeline.
svc_list_best_features = find_best_list_features(pipe_svc)

Accuracy at loop 1: 0.5340909090909091
Accuracy at loop 2: 0.6136363636363636
Accuracy at loop 3: 0.6363636363636364
Accuracy at loop 4: 0.5681818181818182
Accuracy at loop 5: 0.6436781609195402
Accuracy at loop 6: 0.6551724137931034
Accuracy at loop 7: 0.5402298850574713
Accuracy at loop 8: 0.6551724137931034
Accuracy at loop 9: 0.6206896551724138
Accuracy at loop 10: 0.7011494252873564
Accuracy mean: 0.6168364681295715
Best features with max accuracy:  draw_before_team1
history_team1_draw_team2
history_team1_win_team2
is_opponent_big6
lose_before_team2
opponent_Chelsea
opponent_Tottenham Hotspur
venue_Home
win_before_team2


In [16]:
# One line per fold: the cleaned names of the features selected in that fold.
for fold_result in svc_list_best_features:
    print(fold_result["best_features"])

['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'opponent_Tottenham Hotspur', 'history_team1_win_team2', 'venue_Home', 'is_opponent_big6', 'opponent_Chelsea']
['draw_before_team1', 'win_before_team2', 'opponent_Tottenham Hotspur', 'venue_Home', 'history_team1_draw_team2', 'is_opponent_big6', 'opponent_Chelsea']
['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'opponent_Tottenham Hotspur', 'history_team1_win_team2', 'venue_Home', 'history_team1_draw_team2', 'is_opponent_big6', 'opponent_Chelsea', 'history_team1_lose_team2']
['draw_before_team1', 'win_before_team2', 'opponent_Tottenham Hotspur', 'history_team1_win_team2', 'venue_Home', 'history_team1_draw_team2', 'is_opponent_big6', 'opponent_Chelsea']
['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'opponent_Tottenham Hotspur', 'venue_Home', 'is_opponent_big6', 'opponent_Chelsea', 'history_team1_lose_team2']
['draw_before_team1', 'lose_before_team2', 'win_before_team2', 'opponent_Tottenham Hot

In [17]:
def show_list_features_removed(list_features, data=None):
    """Print the raw input columns that have no representative in ``list_features``.

    A raw column counts as kept when its own name is in ``list_features`` or,
    for non-numeric columns, when at least one selected name starts with
    ``"<column>_"`` (the naming of its one-hot columns, e.g. ``venue_Home`` for
    ``venue``). Without the second rule every categorical column would always
    be reported as removed.

    Args:
        list_features: Selected (cleaned) feature names.
        data: DataFrame whose columns are checked. Defaults to the
            notebook-level ``X``.
    """
    frame = X if data is None else data
    selected = set(list_features)

    def _is_kept(column):
        if column in selected:
            return True
        if pd.api.types.is_numeric_dtype(frame[column]):
            # Numeric columns are never expanded, so only an exact match counts.
            return False
        one_hot_prefix = f"{column}_"
        return any(str(feature).startswith(one_hot_prefix) for feature in selected)

    features_not_in_list = [
        column for column in frame.columns if not _is_kept(column)
    ]

    # Report the raw columns that were dropped entirely.
    print("Features bị loại bỏ:", features_not_in_list)

### Thử nghiệm lại với các cột đã chọn


In [18]:
# NOTE(review): the preprocessor is fitted on ALL rows here, and the resulting
# frame is later cross-validated in evaluate_rf / evaluate_svc, so the scaler
# statistics include the held-out folds — confirm this is acceptable.
encoded_matrix = preprocessor.fit_transform(X)

# ColumnTransformer only returns a sparse matrix when the stacked output is
# sparse enough (see its `sparse_threshold`); densify only when needed instead
# of assuming `.toarray()` exists.
if hasattr(encoded_matrix, "toarray"):
    encoded_matrix = encoded_matrix.toarray()

# Unprefixed column names, in the transformer order: numeric first, then one-hot.
new_columns = numerical_cols.to_list() + list(
    preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)
)

# Reuse X's index so rows stay aligned with y by label as well as by position.
transformed_X = pd.DataFrame(encoded_matrix, columns=new_columns, index=X.index)
transformed_X

Unnamed: 0,win_before_team2,draw_before_team1,lose_before_team2,history_team1_win_team2,history_team1_draw_team2,history_team1_lose_team2,is_opponent_big6,venue_Away,venue_Home,opponent_Arsenal,...,opponent_Southampton,opponent_Stoke City,opponent_Sunderland,opponent_Swansea City,opponent_Tottenham Hotspur,opponent_Watford,opponent_West Bromwich Albion,opponent_West Ham United,opponent_Wigan Athletic,opponent_Wolverhampton Wanderers
0,-1.400086,-1.047219,-1.415549,-2.380529,-1.152226,-0.992415,-0.594089,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.400086,-1.047219,2.477230,-2.380529,-1.152226,-0.992415,-0.594089,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.400086,1.587155,2.477230,-2.380529,-1.152226,-0.992415,-0.594089,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.112210,2.465279,-0.117956,-2.380529,-1.152226,-0.992415,-0.594089,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.434179,1.587155,0.530840,-2.380529,-1.152226,-0.992415,-0.594089,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
869,-0.627361,0.006531,0.920118,0.056115,0.181689,0.047699,-0.594089,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
870,0.918090,0.006531,0.141562,-0.051384,-0.004667,0.371096,-0.594089,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
871,0.145365,0.006531,0.920118,0.499141,-0.561059,0.061207,-0.594089,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
872,-0.627361,-1.047219,0.920118,-1.324650,1.593415,0.810450,1.683251,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Tạo ra một hàm để đánh giá accuracy của mô hình trên danh sách các đặc trưng đã chọn

In [19]:
def evaluate_selected_features(model_list_best_features, evaluate_func):
    """Score every per-fold feature list and report the best-scoring one(s).

    Args:
        model_list_best_features: Sequence of dicts, each with a
            ``"best_features"`` list of feature names.
        evaluate_func: Callable taking a feature list and returning a score;
            higher is better.

    Prints the best score with the index of the first entry that reached it,
    every distinct feature list tied at that score, the raw columns each of
    them drops, and the sorted features of the best entry. Returns ``None``.
    """
    if len(model_list_best_features) == 0:
        # Nothing to score; the final report below would otherwise index an
        # empty sequence.
        print("No feature lists to evaluate.")
        return

    max_score = 0
    list_best_features = []
    # Tied lists are de-duplicated by CONTENT (frozenset), so the same features
    # in a different order are not reported twice.
    seen_feature_sets = set()
    loop_index = 0

    for index, fold_result in enumerate(model_list_best_features):
        candidate_features = fold_result["best_features"]
        score = evaluate_func(candidate_features)
        if score == max_score:
            key = frozenset(candidate_features)
            if key not in seen_feature_sets:
                seen_feature_sets.add(key)
                list_best_features.append(candidate_features)
        elif score > max_score:
            # New best: restart the list of tied candidates.
            loop_index = index
            max_score = score
            list_best_features = [candidate_features]
            seen_feature_sets = {frozenset(candidate_features)}

    print(f"Accuracy mean: {max_score} at loop: {loop_index}")

    print("List best features of:")
    print(list_best_features)

    for best_features in list_best_features:
        show_list_features_removed(best_features)

    print("\nRaw features: ")
    print(
        "\n".join(
            str(feature)
            for feature in sorted(model_list_best_features[loop_index]["best_features"])
        )
    )

### 1. Random Forest

In [20]:
# Every predictor column except the raw `opponent` column.
cleaned_features_rf = X.drop(columns="opponent").columns.tolist()
cleaned_features_rf

['venue',
 'win_before_team2',
 'draw_before_team1',
 'lose_before_team2',
 'history_team1_win_team2',
 'history_team1_draw_team2',
 'history_team1_lose_team2',
 'is_opponent_big6']

In [21]:
# Preview the encoded frame restricted to these eight columns.
rf_preview_columns = [
    "win_before_team2",
    "lose_before_team2",
    "draw_before_team1",
    "history_team1_lose_team2",
    "venue_Home",
    "history_team1_win_team2",
    "is_opponent_big6",
    "history_team1_draw_team2",
]
transformed_X[rf_preview_columns]

Unnamed: 0,win_before_team2,lose_before_team2,draw_before_team1,history_team1_lose_team2,venue_Home,history_team1_win_team2,is_opponent_big6,history_team1_draw_team2
0,-1.400086,-1.415549,-1.047219,-0.992415,1.0,-2.380529,-0.594089,-1.152226
1,-1.400086,2.477230,-1.047219,-0.992415,0.0,-2.380529,-0.594089,-1.152226
2,-1.400086,2.477230,1.587155,-0.992415,0.0,-2.380529,-0.594089,-1.152226
3,-0.112210,-0.117956,2.465279,-0.992415,1.0,-2.380529,-0.594089,-1.152226
4,-0.434179,0.530840,1.587155,-0.992415,1.0,-2.380529,-0.594089,-1.152226
...,...,...,...,...,...,...,...,...
869,-0.627361,0.920118,0.006531,0.047699,0.0,0.056115,-0.594089,0.181689
870,0.918090,0.141562,0.006531,0.371096,1.0,-0.051384,-0.594089,-0.004667
871,0.145365,0.920118,0.006531,0.061207,0.0,0.499141,-0.594089,-0.561059
872,-0.627361,0.920118,-1.047219,0.810450,1.0,-1.324650,1.683251,1.593415


In [22]:
# Call counter printed by evaluate_rf; reset whenever this cell is re-run.
idx = 0


def evaluate_rf(cleaned_features) -> float:
    """Cross-validate a logistic regression on the given columns of ``transformed_X``.

    Prints the running call index, the per-fold accuracies and their mean, and
    increments the module-level ``idx`` counter.

    Args:
        cleaned_features: Column names to select from ``transformed_X``.

    Returns:
        float: Mean accuracy over the folds of ``applied_cv``.
    """
    global idx
    X_new = transformed_X[cleaned_features]

    # Local names deliberately differ from the notebook-level `logit` and
    # `scores_rf` so they are not shadowed inside the function.
    classifier = LogisticRegression(random_state=seed)
    fold_scores = cross_val_score(
        classifier,
        X_new,
        y,
        cv=applied_cv,
        scoring="accuracy",
        error_score="raise",
    )
    mean_score = fold_scores.mean()

    print("Loop:", idx)
    idx += 1
    print(fold_scores)
    print("Score mean:", mean_score)
    print("==========================================================================")
    return mean_score

In [23]:
# Re-score each feature list selected by the RandomForest pipeline and report the best.
evaluate_selected_features(rf_list_best_features, evaluate_rf)

Loop: 0
[0.57954545 0.60227273 0.625      0.59090909 0.63218391 0.59770115
 0.62068966 0.63218391 0.65517241 0.59770115 0.625      0.67045455
 0.625      0.61363636 0.59770115 0.5862069  0.56321839 0.64367816
 0.59770115 0.63218391 0.55681818 0.57954545 0.64772727 0.625
 0.59770115 0.63218391 0.59770115 0.65517241 0.62068966 0.63218391]
Score mean: 0.6144287704632532
Loop: 1
[0.57954545 0.60227273 0.625      0.59090909 0.63218391 0.59770115
 0.62068966 0.63218391 0.65517241 0.59770115 0.625      0.67045455
 0.625      0.61363636 0.59770115 0.5862069  0.56321839 0.64367816
 0.59770115 0.63218391 0.55681818 0.57954545 0.64772727 0.625
 0.59770115 0.63218391 0.59770115 0.65517241 0.62068966 0.63218391]
Score mean: 0.6144287704632532
Loop: 2
[0.57954545 0.60227273 0.625      0.59090909 0.63218391 0.59770115
 0.62068966 0.63218391 0.65517241 0.59770115 0.625      0.67045455
 0.625      0.61363636 0.59770115 0.5862069  0.56321839 0.64367816
 0.59770115 0.63218391 0.55681818 0.57954545 0.6477

### 2. SVC

In [24]:
# Call counter printed by evaluate_svc; reset whenever this cell is re-run.
idx = 0


def evaluate_svc(cleaned_features) -> float:
    """Cross-validate a logistic regression on the given columns of ``transformed_X``.

    Prints the running call index, the per-fold accuracies and their mean, and
    increments the module-level ``idx`` counter.

    Args:
        cleaned_features: Column names to select from ``transformed_X``.

    Returns:
        float: Mean accuracy over the folds of ``applied_cv``.
    """
    global idx
    X_new = transformed_X[cleaned_features]

    # Local names deliberately differ from the notebook-level `logit` and
    # `scores_svc` so they are not shadowed inside the function.
    classifier = LogisticRegression(random_state=seed)
    fold_scores = cross_val_score(
        classifier,
        X_new,
        y,
        cv=applied_cv,
        scoring="accuracy",
        error_score="raise",
    )
    mean_score = fold_scores.mean()

    print("Loop:", idx)
    idx += 1
    print(fold_scores)
    print("Score mean:", mean_score)
    print("==========================================================================")
    return mean_score

In [25]:
# Re-score each feature list selected by the LinearSVC pipeline and report the best.
evaluate_selected_features(svc_list_best_features, evaluate_svc)

Loop: 0
[0.61363636 0.60227273 0.61363636 0.60227273 0.63218391 0.6091954
 0.64367816 0.65517241 0.62068966 0.59770115 0.65909091 0.625
 0.625      0.60227273 0.6091954  0.63218391 0.56321839 0.63218391
 0.59770115 0.64367816 0.55681818 0.57954545 0.625      0.63636364
 0.5862069  0.66666667 0.59770115 0.66666667 0.63218391 0.62068966]
Score mean: 0.6182601880877743
Loop: 1
[0.61363636 0.59090909 0.61363636 0.60227273 0.63218391 0.6091954
 0.65517241 0.67816092 0.62068966 0.59770115 0.64772727 0.63636364
 0.625      0.60227273 0.59770115 0.63218391 0.56321839 0.63218391
 0.59770115 0.64367816 0.55681818 0.57954545 0.625      0.63636364
 0.5862069  0.66666667 0.59770115 0.67816092 0.63218391 0.62068966]
Score mean: 0.6190308254963426
Loop: 2
[0.61363636 0.60227273 0.61363636 0.61363636 0.63218391 0.6091954
 0.63218391 0.65517241 0.62068966 0.59770115 0.64772727 0.63636364
 0.63636364 0.59090909 0.6091954  0.63218391 0.57471264 0.63218391
 0.59770115 0.63218391 0.55681818 0.57954545 0.62

### Kết luận
Vậy sau bước lựa chọn feature:
- Random Forest: danh sách đặc trưng được chọn:
    + 'win_before_team2'
    + 'lose_before_team2'
    + 'draw_before_team1'
    + 'history_team1_win_team2'
    + 'history_team1_lose_team2'
    + 'history_team1_draw_team2'
    + 'is_opponent_big6'
    + 'venue_Home'
    
- SVC: danh sách đặc trưng được chọn:
    + 'win_before_team2'
    + 'lose_before_team2'
    + 'history_team1_draw_team2'
    + 'history_team1_lose_team2'
    + 'is_opponent_big6'
    + 'venue_Home'
    + 'opponent_Chelsea'
    + 'opponent_Tottenham Hotspur'

In [26]:
# Target plus the numeric features kept for the RandomForest-selected dataset.
rf_columns = [
    "result",
    "history_team1_draw_team2",
    "is_opponent_big6",
    "draw_before_team1",
    "history_team1_win_team2",
    "history_team1_lose_team2",
    "lose_before_team2",
    "win_before_team2",
]

# .copy() makes this an independent frame, so adding a column below is not a
# chained assignment on a slice of matches_df (the resulting warning would be
# hidden by the global warnings filter).
data_rf = matches_df[rf_columns].copy()

# Only the Home indicator of `venue` is kept: 1 for "Home", 0 otherwise.
data_rf["venue_Home"] = matches_df["venue"].eq("Home").astype("int64")
# data_rf.to_csv("../../train_model/MU/data_rf.csv", index=False)

In [27]:
# Target plus the numeric features kept for the SVC-selected dataset.
# NOTE(review): the conclusion cell lists 'history_team1_draw_team2' for SVC,
# while this frame uses 'draw_before_team1' — confirm which one is intended.
svc_columns = [
    "result",
    "is_opponent_big6",
    "draw_before_team1",
    "history_team1_lose_team2",
    "lose_before_team2",
    "win_before_team2",
]

# .copy() makes this an independent frame, so adding columns below is not a
# chained assignment on a slice of matches_df (the resulting warning would be
# hidden by the global warnings filter).
data_svc = matches_df[svc_columns].copy()

# The selector only kept venue_Home, opponent_Chelsea and opponent_Tottenham Hotspur,
# so only those indicator columns are built instead of keeping the full
# categorical `venue` and `opponent` columns of the original data.
data_svc["venue_Home"] = matches_df["venue"].eq("Home").astype("int64")
data_svc["opponent_Chelsea"] = matches_df["opponent"].eq("Chelsea").astype("int64")
data_svc["opponent_Tottenham Hotspur"] = (
    matches_df["opponent"].eq("Tottenham Hotspur").astype("int64")
)

# data_svc.to_csv("../../train_model/MU/data_svc.csv", index=False)