In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SequentialFeatureSelector as SFS
import warnings
warnings.filterwarnings('ignore')

In [2]:
matches_df = pd.read_csv('/kaggle/input/raw-data/arsenal.csv')

# Chọn các feature cần thiết
matches_df = matches_df[['result', 'venue', 'win_before_team1', 'win_before_team2', 'draw_before_team1', 'draw_before_team2', 'opponent', 'history_team1_win_team2', 'history_team1_draw_team2', 'history_team1_lose_team2', 'is_opponent_big6']]

matches_df.head()

Unnamed: 0,result,venue,win_before_team1,win_before_team2,draw_before_team1,draw_before_team2,opponent,history_team1_win_team2,history_team1_draw_team2,history_team1_lose_team2,is_opponent_big6
0,L,Away,0.0,0.0,0.0,0.0,Sunderland,0.0,0.0,0.0,0
1,W,Home,0.0,1.0,0.0,0.0,Liverpool,0.0,0.0,0.0,1
2,W,Home,0.5,0.5,0.0,0.0,Charlton Athletic,0.0,0.0,0.0,0
3,D,Away,0.666667,0.333333,0.0,0.333333,Chelsea,0.0,0.0,0.0,1
4,D,Away,0.5,0.25,0.25,0.25,Bradford City,0.0,0.0,0.0,0


In [3]:
X = matches_df.drop('result', axis=1)
y = matches_df['result']

In [4]:
# Xác định các cột số và phân loại
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

In [5]:
# Tạo ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [6]:
# Định nghĩa mô hình cơ bản
logit = LogisticRegression()

# Tạo Sequential Feature Selector
sfs = SFS(logit,
          n_features_to_select="auto",  # Số lượng đặc trưng mong muốn (min, max) hoặc 'best'
          direction='forward',          # 'forward' hoặc 'backward'
          scoring='accuracy',
          tol=0.01,
          cv=8)


In [7]:
def clean_feature_names(feature_names):
    cleaned_names = set()  # Sử dụng set để tránh trùng lặp
    for name in feature_names:
        # Phân tách tên dựa trên tiền tố '__'
        parts = name.split('__')
        if len(parts) > 1:
            prefix = parts[0]
            base_name = parts[1]
            if prefix == 'num':
                # Nếu tiền tố là 'num__', giữ nguyên base_name
                cleaned_names.add(base_name)
            elif 'date_time' in base_name:
                # Đối với 'date_time', chỉ giữ 'date_time'
                cleaned_names.add('date_time')
            elif 'formation_team1' in base_name:
                # Đối với 'team', chỉ giữ 'team'
                cleaned_names.add('formation_team1')
            elif 'formation_team2' in base_name:
                # Đối với 'team', chỉ giữ 'team'
                cleaned_names.add('formation_team2')
            elif '_' in base_name:
                # Chỉ lấy phần đầu tiên trước dấu '_' đầu tiên
                feature_key = base_name.split('_')[0]
                cleaned_names.add(feature_key)
            else:
                cleaned_names.add(base_name)
        else:
            # Không có tiền tố, giữ nguyên tên
            cleaned_names.add(name)
    # return list(cleaned_names)
    return cleaned_names


In [8]:
import time as t
logit = LogisticRegression()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

for tol in [-1e-2, -1e-3, -1e-4]:
    start = t.time()
    feature_selector = SFS(
        logit,
        n_features_to_select="auto",
        direction="backward",
        scoring="accuracy",
        tol=tol,
        n_jobs=-1,
        cv=KFold(n_splits=8, shuffle=True, random_state=42)
    )
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('sfs', feature_selector),
        ('classifier', logit)
    ])
    model.fit(X, y)
    
    print(f"\ntol: {tol}")
    transformed_feature_names = model.named_steps['preprocessor'].get_feature_names_out()
    best_features = model.named_steps["sfs"].get_support()
    print(f"Features selected: {transformed_feature_names[best_features]}")
    accuracy = model.score(X, y)  # Calculate accuracy on the training set
    accuray_mean = cross_val_score(model, X, y, cv=KFold(n_splits=8, shuffle=True, random_state=42), scoring='accuracy', error_score='raise', n_jobs=-1)
    print(f"Accuracy trên tập huấn luyện: {accuracy}")
    print(f"Accuracy cross_val: {accuray_mean.mean()}")
    
    end = t.time()
    print(f"Done in {end - start:.3f}s")
    print("===================================================")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


tol: -0.01
Features selected: ['num__is_opponent_big6' 'cat__venue_Home']
Accuracy trên tập huấn luyện: 0.6011428571428571
Accuracy cross_val: 0.6011676396997497
Done in 1719.417s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


tol: -0.001
Features selected: ['num__win_before_team1' 'num__win_before_team2'
 'num__history_team1_draw_team2' 'num__history_team1_lose_team2'
 'num__is_opponent_big6' 'cat__venue_Home'
 'cat__opponent_Bolton Wanderers' 'cat__opponent_Charlton Athletic'
 'cat__opponent_Chelsea' 'cat__opponent_Crystal Palace'
 'cat__opponent_Everton' 'cat__opponent_Fulham' 'cat__opponent_Liverpool'
 'cat__opponent_Manchester City' 'cat__opponent_Norwich City'
 'cat__opponent_Portsmouth' 'cat__opponent_Queens Park Rangers'
 'cat__opponent_Reading' 'cat__opponent_Tottenham Hotspur'
 'cat__opponent_Watford' 'cat__opponent_Wigan Athletic'
 'cat__opponent_Wolverhampton Wanderers']
Accuracy trên tập huấn luyện: 0.6091428571428571
Accuracy cross_val: 0.5736968306922435
Done in 1421.253s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


tol: -0.0001
Features selected: ['num__win_before_team1' 'num__win_before_team2'
 'num__history_team1_draw_team2' 'num__history_team1_lose_team2'
 'num__is_opponent_big6' 'cat__venue_Home'
 'cat__opponent_Bolton Wanderers' 'cat__opponent_Charlton Athletic'
 'cat__opponent_Chelsea' 'cat__opponent_Crystal Palace'
 'cat__opponent_Everton' 'cat__opponent_Fulham' 'cat__opponent_Liverpool'
 'cat__opponent_Manchester City' 'cat__opponent_Norwich City'
 'cat__opponent_Portsmouth' 'cat__opponent_Queens Park Rangers'
 'cat__opponent_Reading' 'cat__opponent_Tottenham Hotspur'
 'cat__opponent_Watford' 'cat__opponent_Wigan Athletic'
 'cat__opponent_Wolverhampton Wanderers']
Accuracy trên tập huấn luyện: 0.6091428571428571
Accuracy cross_val: 0.5736968306922435
Done in 1429.562s


In [10]:
import time as t
logit = LogisticRegression()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

for tol in [-1e-2, -1e-3, -1e-4]:
    start = t.time()
    feature_selector = SFS(
        logit,
        n_features_to_select="auto",
        direction="backward",
        scoring="accuracy",
        tol=tol,
        n_jobs=-1,
        cv=KFold(n_splits=5, shuffle=True, random_state=42)
    )
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('sfs', feature_selector),
        ('classifier', logit)
    ])
    
    model.fit(X, y)
    print(f"\ntol: {tol}")
    
    transformed_feature_names = model.named_steps['preprocessor'].get_feature_names_out()
    best_features = model.named_steps["sfs"].get_support()
    
    print(f"Features selected: {transformed_feature_names[best_features]}")
    accuracy = model.score(X, y)  # Calculate accuracy on the training set
    accuray_mean = cross_val_score(model, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring='accuracy', error_score='raise', n_jobs=-1)
    
    print(f"Accuracy trên tập huấn luyện: {accuracy}")
    print(f"Accuracy cross_val: {accuray_mean.mean()}")
    
    end = t.time()
    print(f"Done in {end - start:.3f}s")
    print("===================================================")


tol: -0.01
Features selected: ['num__is_opponent_big6' 'cat__venue_Home']
Accuracy trên tập huấn luyện: 0.6011428571428571
Accuracy cross_val: 0.5977142857142858
Done in 864.405s

tol: -0.001
Features selected: ['num__draw_before_team2' 'num__history_team1_win_team2'
 'num__history_team1_draw_team2' 'num__is_opponent_big6' 'cat__venue_Home'
 'cat__opponent_Cardiff City' 'cat__opponent_Charlton Athletic'
 'cat__opponent_Coventry City' 'cat__opponent_Derby County'
 'cat__opponent_Everton' 'cat__opponent_Fulham'
 'cat__opponent_Huddersfield Town' 'cat__opponent_Hull City'
 'cat__opponent_Ipswich Town' 'cat__opponent_Leicester City'
 'cat__opponent_Liverpool' 'cat__opponent_Manchester City'
 'cat__opponent_Manchester United' 'cat__opponent_Newcastle United'
 'cat__opponent_Nottingham Forest' 'cat__opponent_Portsmouth'
 'cat__opponent_Queens Park Rangers' 'cat__opponent_Reading'
 'cat__opponent_Southampton' 'cat__opponent_Stoke City'
 'cat__opponent_Sunderland' 'cat__opponent_Tottenham Hot