### Biến đổi đặc trưng cho tập test

In [1]:
import pandas as pd

### 1. Load dữ liệu

In [2]:
# Load the train set as well: it is combined with the test set later on to compute history-based features
train_df = pd.read_csv("../../feature_engineering_data/train/all_clubs.csv")
test_df = pd.read_csv("../../raw_data/clean_data_test.csv")

In [3]:
test_df.columns

Index(['date', 'time', 'round', 'venue', 'result', 'gf', 'ga', 'opponent',
       'attendance', 'formation', 'referee', 'season', 'team'],
      dtype='object')

In [4]:
test_df.shape

(760, 13)

In [5]:
test_df.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation,referee,season,team
0,2023-08-11,20:00,Matchweek 1,Away,W,3,0,Burnley,21572.0,4-2-3-1,Craig Pawson,2023-2024,Manchester City
1,2023-08-19,20:00,Matchweek 2,Home,W,1,0,Newcastle United,53419.0,4-2-3-1,Robert Jones,2023-2024,Manchester City
2,2023-08-27,14:00,Matchweek 3,Away,W,2,1,Sheffield United,31336.0,4-2-3-1,Jarred Gillett,2023-2024,Manchester City
3,2023-09-02,15:00,Matchweek 4,Home,W,5,1,Fulham,52899.0,4-2-3-1,Michael Oliver,2023-2024,Manchester City
4,2023-09-16,15:00,Matchweek 5,Away,W,3,1,West Ham United,62475.0,4-2-3-1,Andy Madley,2023-2024,Manchester City


In [7]:
# Build a kick-off timestamp ("date_time") on both frames and order the matches
# chronologically; the referee column breaks ties between identical timestamps.
train_df = train_df.assign(
    date_time=pd.to_datetime(train_df["date"] + " " + train_df["time"])
).sort_values(by=["date_time", "referee"])

test_df = test_df.assign(
    date_time=pd.to_datetime(test_df["date"] + " " + test_df["time"])
).sort_values(by=["date_time", "referee"])

### I. Tính toán phong độ và số bàn thắng, bàn thua của 2 đội trong 5 trận gần nhất

In [8]:
# Recent-form features: for every row, summarise the same team's (at most) five
# previous matches of the same season. The tail of the filtered frame is only the
# "most recent" matches because test_df was sorted chronologically beforehand.
for float_feature in (
    "average_score_before_match",
    "win_percent_before",
    "lose_percent_before",
    "draw_percent_before",
):
    test_df[float_feature] = 0.0

for int_feature in ("total_goals_before", "total_lost_goals_before"):
    test_df[int_feature] = 0

for row_label, fixture in test_df.iterrows():
    same_team = test_df["team"] == fixture["team"]
    earlier = test_df["date_time"] < fixture["date_time"]
    same_season = test_df["season"] == fixture["season"]

    # Keep only the five most recent earlier matches of this team.
    recent = test_df[same_team & earlier & same_season].tail(5)
    outcomes = recent["result"]

    # With no history the denominator is forced to 1 so every ratio stays at 0.
    n_recent = len(recent)
    match_total = n_recent if n_recent else 1

    # Share of wins / losses / draws among the recent matches.
    test_df.at[row_label, "win_percent_before"] = int((outcomes == "W").sum()) / match_total
    test_df.at[row_label, "lose_percent_before"] = int((outcomes == "L").sum()) / match_total
    test_df.at[row_label, "draw_percent_before"] = int((outcomes == "D").sum()) / match_total

    # Goals scored and conceded over the recent matches.
    test_df.at[row_label, "total_goals_before"] = recent["gf"].sum()
    test_df.at[row_label, "total_lost_goals_before"] = recent["ga"].sum()

    # Average league points per match (win = 3, draw = 1, loss = 0).
    points = outcomes.map({"W": 3.0, "D": 1.0, "L": 0.0})
    test_df.at[row_label, "average_score_before_match"] = points.sum() / match_total

In [9]:
test_df.tail()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation,referee,season,team,date_time,average_score_before_match,win_percent_before,lose_percent_before,draw_percent_before,total_goals_before,total_lost_goals_before
683,2024-05-19,16:00,Matchweek 38,Home,L,2,4,Fulham,12027.0,3-4-3,Matt Donohue,2023-2024,Luton Town,2024-05-19 16:00:00,0.2,0.0,0.8,0.2,5,16
75,2024-05-19,16:00,Matchweek 38,Home,W,2,1,Everton,60312.0,4-3-3,Michael Oliver,2023-2024,Arsenal,2024-05-19 16:00:00,3.0,1.0,0.0,0.0,14,2
569,2024-05-19,16:00,Matchweek 38,Away,L,1,2,Arsenal,60312.0,4-4-1-1,Michael Oliver,2023-2024,Everton,2024-05-19 16:00:00,2.6,0.8,0.0,0.2,7,1
265,2024-05-19,16:00,Matchweek 38,Away,W,4,2,Brentford,17124.0,4-3-3,Simon Hooper,2023-2024,Newcastle United,2024-05-19 16:00:00,1.4,0.4,0.4,0.2,12,8
607,2024-05-19,16:00,Matchweek 38,Home,L,2,4,Newcastle United,17124.0,4-3-3,Simon Hooper,2023-2024,Brentford,2024-05-19 16:00:00,2.0,0.6,0.2,0.2,9,3


### II. Tính toán chỉ số ELO của 2 đội

In [10]:
def get_game_different(score_different):
    """Return the K-factor multiplier bonus for a given goal margin.

    Margins 0, 1, 2 and 3 map to 0, 0.25, 0.5 and 0.75 respectively; any other
    value falls through to ``0.75 + (score_different - 3) / 8``, i.e. 0.125 per
    goal beyond three.

    Args:
        score_different: goal margin of the match (callers pass an absolute value).

    Returns:
        The bonus that is multiplied by ``k`` and added to it by the caller.
    """
    fixed_bonus = {0: 0, 1: 0.25, 2: 0.5, 3: 0.75}
    if score_different in fixed_bonus:
        return fixed_bonus[score_different]
    return 0.75 + (score_different - 3) / 8


def get_result(result):
    """Convert a match result code into its Elo score.

    Args:
        result: "W" for a win, "D" for a draw; anything else counts as a loss.

    Returns:
        1 for "W", 0.5 for "D" and 0 otherwise.
    """
    return {"W": 1, "D": 0.5}.get(result, 0)


def get_expected_result(elo_diff):
    """Return the expected score for a rating difference on the logistic Elo curve.

    Args:
        elo_diff: rating difference; 0 yields 0.5 and larger values yield a
            smaller expected score (400 points correspond to a factor of 10).

    Returns:
        ``1 / (1 + 10 ** (elo_diff / 400))``, a value between 0 and 1.
    """
    exponent = elo_diff / 400
    odds_against = 10 ** exponent
    return 1 / (1 + odds_against)

In [11]:
# Season-long Elo ratings. Rows are visited in chronological order, so the most
# recent "ELO_after_match" of either side is already filled in when a row is processed.
k = 50  # base K-factor; get_game_different scales it up with the goal margin
initial_elo = 1500  # rating of a side that has no earlier match this season

for index, row in test_df.iterrows():
    earlier_this_season = (test_df["date_time"] < row["date_time"]) & (
        test_df["season"] == row["season"]
    )

    # Latest earlier match (if any) of each side within the same season.
    previous_matches_of_team = test_df[
        earlier_this_season & (test_df["team"] == row["team"])
    ][-1:]
    previous_matches_of_opponent = test_df[
        earlier_this_season & (test_df["team"] == row["opponent"])
    ][-1:]

    # Fall back to the initial rating independently for each side. Previously a
    # missing history on EITHER side reset this team's rating to 1500 and
    # ignored the rating gap, throwing away the team's own accumulated rating.
    # The conditional also avoids touching "ELO_after_match" before the column
    # exists (it is created by the first assignment below).
    team_elo = (
        previous_matches_of_team["ELO_after_match"].values[0]
        if previous_matches_of_team.shape[0] > 0
        else initial_elo
    )
    opponent_elo = (
        previous_matches_of_opponent["ELO_after_match"].values[0]
        if previous_matches_of_opponent.shape[0] > 0
        else initial_elo
    )

    elo_different = opponent_elo - team_elo

    # NOTE(review): flipping the sign on away rows makes the away side use the
    # home side's expected score instead of its own, so rating changes are not
    # zero-sum. Kept unchanged so these features stay consistent with however
    # the train features were generated - confirm against the train notebook
    # and fix both together.
    if row["venue"] == "Away":
        elo_different *= -1

    score_different = abs(row["gf"] - row["ga"])

    test_df.at[index, "ELO_before_match"] = team_elo
    test_df.at[index, "ELO_after_match"] = team_elo + (
        k + k * get_game_different(score_different)
    ) * (get_result(row["result"]) - get_expected_result(elo_different))


Gộp các trận đấu đối xứng

In [12]:
# Make sure the rows are ordered by kick-off time; rows with identical timestamps are ordered by referee
test_df = test_df.sort_values(by=["date_time", "referee"])
test_df.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation,...,team,date_time,average_score_before_match,win_percent_before,lose_percent_before,draw_percent_before,total_goals_before,total_lost_goals_before,ELO_before_match,ELO_after_match
0,2023-08-11,20:00,Matchweek 1,Away,W,3,0,Burnley,21572.0,4-2-3-1,...,Manchester City,2023-08-11 20:00:00,0.0,0.0,0.0,0.0,0,0,1500.0,1543.75
684,2023-08-11,20:00,Matchweek 1,Home,L,0,3,Manchester City,21572.0,5-4-1,...,Burnley,2023-08-11 20:00:00,0.0,0.0,0.0,0.0,0,0,1500.0,1456.25
38,2023-08-12,12:30,Matchweek 1,Home,W,2,1,Nottingham Forest,59984.0,4-3-3,...,Arsenal,2023-08-12 12:30:00,0.0,0.0,0.0,0.0,0,0,1500.0,1531.25
608,2023-08-12,12:30,Matchweek 1,Away,L,1,2,Arsenal,59984.0,3-4-3,...,Nottingham Forest,2023-08-12 12:30:00,0.0,0.0,0.0,0.0,0,0,1500.0,1468.75
380,2023-08-12,15:00,Matchweek 1,Home,W,4,1,Luton Town,31872.0,4-2-3-1,...,Brighton and Hove Albion,2023-08-12 15:00:00,0.0,0.0,0.0,0.0,0,0,1500.0,1543.75


In [13]:
# Split the sorted frame into alternating rows: positions 0, 2, 4, ... and 1, 3, 5, ...
# NOTE(review): this presumably relies on the two rows describing one match being
# adjacent after sorting by date_time and referee - confirm (e.g. missing referees).
test_df_sorted_odd = test_df.iloc[::2]
test_df_sorted_even = test_df.iloc[1::2]

In [14]:
# Tag every per-team feature of the first row of each pair with a "_team1" suffix;
# all other columns keep their names.
team1_feature_cols = [
    "formation",
    "win_percent_before",
    "lose_percent_before",
    "draw_percent_before",
    "total_goals_before",
    "total_lost_goals_before",
    "ELO_before_match",
    "ELO_after_match",
    "average_score_before_match",
]
test_df_sorted_odd = test_df_sorted_odd.rename(
    columns={col: f"{col}_team1" for col in team1_feature_cols}
)
test_df_sorted_odd.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation_team1,...,team,date_time,average_score_before_match_team1,win_percent_before_team1,lose_percent_before_team1,draw_percent_before_team1,total_goals_before_team1,total_lost_goals_before_team1,ELO_before_match_team1,ELO_after_match_team1
0,2023-08-11,20:00,Matchweek 1,Away,W,3,0,Burnley,21572.0,4-2-3-1,...,Manchester City,2023-08-11 20:00:00,0.0,0.0,0.0,0.0,0,0,1500.0,1543.75
38,2023-08-12,12:30,Matchweek 1,Home,W,2,1,Nottingham Forest,59984.0,4-3-3,...,Arsenal,2023-08-12 12:30:00,0.0,0.0,0.0,0.0,0,0,1500.0,1531.25
380,2023-08-12,15:00,Matchweek 1,Home,W,4,1,Luton Town,31872.0,4-2-3-1,...,Brighton and Hove Albion,2023-08-12 15:00:00,0.0,0.0,0.0,0.0,0,0,1500.0,1543.75
342,2023-08-12,15:00,Matchweek 1,Away,W,1,0,Sheffield United,31194.0,4-2-3-1,...,Crystal Palace,2023-08-12 15:00:00,0.0,0.0,0.0,0.0,0,0,1500.0,1531.25
304,2023-08-12,15:00,Matchweek 1,Away,D,1,1,Bournemouth,11245.0,4-2-3-1,...,West Ham United,2023-08-12 15:00:00,0.0,0.0,0.0,0.0,0,0,1500.0,1500.0


In [15]:
# From the second row of each pair keep only the per-team features and tag
# each of them with a "_team2" suffix.
team2_feature_cols = [
    "formation",
    "win_percent_before",
    "lose_percent_before",
    "draw_percent_before",
    "total_goals_before",
    "total_lost_goals_before",
    "ELO_before_match",
    "ELO_after_match",
    "average_score_before_match",
]
test_df_sorted_even = test_df_sorted_even[team2_feature_cols].add_suffix("_team2")

test_df_sorted_even.head()

Unnamed: 0,formation_team2,win_percent_before_team2,lose_percent_before_team2,draw_percent_before_team2,total_goals_before_team2,total_lost_goals_before_team2,ELO_before_match_team2,ELO_after_match_team2,average_score_before_match_team2
684,5-4-1,0.0,0.0,0.0,0,0,1500.0,1456.25,0.0
608,3-4-3,0.0,0.0,0.0,0,0,1500.0,1468.75,0.0
646,3-5-2,0.0,0.0,0.0,0,0,1500.0,1456.25,0.0
722,3-4-3,0.0,0.0,0.0,0,0,1500.0,1468.75,0.0
418,4-2-3-1,0.0,0.0,0.0,0,0,1500.0,1500.0,0.0


Thay đổi lại giá trị index của các hàng chẵn để có thể nối thêm vào các cột của hàng lẻ

In [16]:
# Reuse the index labels of the "_team1" frame so both frames line up row by row
# when they are concatenated column-wise
test_df_sorted_even.index = test_df_sorted_odd.index
test_df_sorted_even.head()

Unnamed: 0,formation_team2,win_percent_before_team2,lose_percent_before_team2,draw_percent_before_team2,total_goals_before_team2,total_lost_goals_before_team2,ELO_before_match_team2,ELO_after_match_team2,average_score_before_match_team2
0,5-4-1,0.0,0.0,0.0,0,0,1500.0,1456.25,0.0
38,3-4-3,0.0,0.0,0.0,0,0,1500.0,1468.75,0.0
380,3-5-2,0.0,0.0,0.0,0,0,1500.0,1456.25,0.0
342,3-4-3,0.0,0.0,0.0,0,0,1500.0,1468.75,0.0
304,4-2-3-1,0.0,0.0,0.0,0,0,1500.0,1500.0,0.0


In [17]:
# Put both halves side by side (axis=1); rows are matched on their index labels
matches_groupby = pd.concat([test_df_sorted_odd, test_df_sorted_even], axis=1)
matches_groupby.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation_team1,...,ELO_after_match_team1,formation_team2,win_percent_before_team2,lose_percent_before_team2,draw_percent_before_team2,total_goals_before_team2,total_lost_goals_before_team2,ELO_before_match_team2,ELO_after_match_team2,average_score_before_match_team2
0,2023-08-11,20:00,Matchweek 1,Away,W,3,0,Burnley,21572.0,4-2-3-1,...,1543.75,5-4-1,0.0,0.0,0.0,0,0,1500.0,1456.25,0.0
38,2023-08-12,12:30,Matchweek 1,Home,W,2,1,Nottingham Forest,59984.0,4-3-3,...,1531.25,3-4-3,0.0,0.0,0.0,0,0,1500.0,1468.75,0.0
380,2023-08-12,15:00,Matchweek 1,Home,W,4,1,Luton Town,31872.0,4-2-3-1,...,1543.75,3-5-2,0.0,0.0,0.0,0,0,1500.0,1456.25,0.0
342,2023-08-12,15:00,Matchweek 1,Away,W,1,0,Sheffield United,31194.0,4-2-3-1,...,1531.25,3-4-3,0.0,0.0,0.0,0,0,1500.0,1468.75,0.0
304,2023-08-12,15:00,Matchweek 1,Away,D,1,1,Bournemouth,11245.0,4-2-3-1,...,1500.0,4-2-3-1,0.0,0.0,0.0,0,0,1500.0,1500.0,0.0


Sắp xếp lại các cột cho hợp lý

In [22]:
# Despite its name, `removed_cols` is not dropped here: these columns are moved
# to the front of the frame, followed by every remaining column in its current order.
removed_cols = [
    "team",
    "opponent",
    "venue",
    "season",
    "date",
    "time",
    "result",
    "win_percent_before_team1",
    "lose_percent_before_team1",
    "draw_percent_before_team1",
    "win_percent_before_team2",
    "lose_percent_before_team2",
    "draw_percent_before_team2",
    "total_goals_before_team1",
    "total_lost_goals_before_team1",
    "total_goals_before_team2",
    "total_lost_goals_before_team2",
    "formation_team1",
    "formation_team2",
    "ELO_before_match_team1",
    "ELO_before_match_team2",
    "ELO_after_match_team1",
    "ELO_after_match_team2",
    "average_score_before_match_team1",
    "average_score_before_match_team2",
]

# Columns not listed above keep their relative order and go last
cols = [col for col in matches_groupby.columns if col not in removed_cols]
matches_groupby = matches_groupby[removed_cols + cols]

# From here on test_df holds the merged frame (one row per pair), ordered by team then kick-off time
test_df = matches_groupby.sort_values(by=['team', 'date_time'])
test_df.head()

Unnamed: 0,team,opponent,venue,season,date,time,result,win_percent_before_team1,lose_percent_before_team1,draw_percent_before_team1,...,ELO_after_match_team1,ELO_after_match_team2,average_score_before_match_team1,average_score_before_match_team2,round,gf,ga,attendance,referee,date_time
38,Arsenal,Nottingham Forest,Home,2023-2024,2023-08-12,12:30,W,0.0,0.0,0.0,...,1531.25,1468.75,0.0,0.0,Matchweek 1,2,1,59984.0,Michael Oliver,2023-08-12 12:30:00
39,Arsenal,Crystal Palace,Away,2023-2024,2023-08-21,20:00,W,1.0,0.0,0.0,...,1562.5,1500.0,3.0,3.0,Matchweek 2,1,0,24189.0,David Coote,2023-08-21 20:00:00
40,Arsenal,Fulham,Home,2023-2024,2023-08-26,15:00,D,1.0,0.0,0.0,...,1556.916636,1477.992132,3.0,1.5,Matchweek 3,2,2,59961.0,Paul Tierney,2023-08-26 15:00:00
41,Arsenal,Manchester United,Home,2023-2024,2023-09-03,16:30,W,0.666667,0.0,0.333333,...,1591.369581,1488.076393,2.333333,2.0,Matchweek 4,3,1,60192.0,Anthony Taylor,2023-09-03 16:30:00
42,Arsenal,Everton,Away,2023-2024,2023-09-17,16:30,W,0.75,0.0,0.25,...,1638.818334,1376.857403,2.5,0.25,Matchweek 5,1,0,39217.0,Simon Hooper,2023-09-17 16:30:00


### III. Tính toán lịch sử đối đầu giữa 2 đội trong 10 trận gần nhất

In [25]:
# Combine the train and test sets so the head-to-head history can be computed across both
full_df = pd.concat([train_df, test_df])
full_df = full_df.sort_values(by=["date_time", "referee"])

In [26]:
# Head-to-head record of `team` against `opponent` over their (at most) ten most
# recent meetings before kick-off, looked up in the combined history `full_df`.
for h2h_col in (
    "head2head_win_percent",
    "head2head_draw_percent",
    "head2head_lost_percent",
):
    test_df[h2h_col] = 0.0

for row_label, fixture in test_df.iterrows():
    side, rival = fixture["team"], fixture["opponent"]

    # Earlier meetings of the two clubs, whichever of them is listed as "team".
    before_kickoff = full_df["date_time"] < fixture["date_time"]
    listed_as_team = (full_df["team"] == side) & (full_df["opponent"] == rival)
    listed_as_opponent = (full_df["opponent"] == side) & (full_df["team"] == rival)

    # Keep the ten most recent meetings.
    meetings = full_df[before_kickoff & (listed_as_team | listed_as_opponent)].tail(10)
    matches_total = len(meetings)
    if matches_total == 0:
        # No earlier meeting: the three ratios stay at 0.0.
        continue

    # "result" is read as the outcome for the row's "team" column, so it is
    # inverted for the rows in which `side` is listed as the opponent.
    is_team = meetings["team"] == side
    is_opponent = meetings["opponent"] == side
    outcome = meetings["result"]

    wins = int(((is_team & (outcome == "W")) | (is_opponent & (outcome == "L"))).sum())
    draws = int(((is_team | is_opponent) & (outcome == "D")).sum())
    losses = int(((is_team & (outcome == "L")) | (is_opponent & (outcome == "W"))).sum())

    test_df.at[row_label, "head2head_win_percent"] = wins / matches_total
    test_df.at[row_label, "head2head_draw_percent"] = draws / matches_total
    test_df.at[row_label, "head2head_lost_percent"] = losses / matches_total

In [28]:
test_df.head()

Unnamed: 0,team,opponent,venue,season,date,time,result,win_percent_before_team1,lose_percent_before_team1,draw_percent_before_team1,...,average_score_before_match_team2,round,gf,ga,attendance,referee,date_time,head2head_win_percent,head2head_draw_percent,head2head_lost_percent
38,Arsenal,Nottingham Forest,Home,2023-2024,2023-08-12,12:30,W,0.0,0.0,0.0,...,0.0,Matchweek 1,2,1,59984.0,Michael Oliver,2023-08-12 12:30:00,0.5,0.0,0.5
39,Arsenal,Crystal Palace,Away,2023-2024,2023-08-21,20:00,W,1.0,0.0,0.0,...,3.0,Matchweek 2,1,0,24189.0,David Coote,2023-08-21 20:00:00,0.3,0.5,0.2
40,Arsenal,Fulham,Home,2023-2024,2023-08-26,15:00,D,1.0,0.0,0.0,...,1.5,Matchweek 3,2,2,59961.0,Paul Tierney,2023-08-26 15:00:00,0.8,0.2,0.0
41,Arsenal,Manchester United,Home,2023-2024,2023-09-03,16:30,W,0.666667,0.0,0.333333,...,2.0,Matchweek 4,3,1,60192.0,Anthony Taylor,2023-09-03 16:30:00,0.5,0.3,0.2
42,Arsenal,Everton,Away,2023-2024,2023-09-17,16:30,W,0.75,0.0,0.25,...,0.25,Matchweek 5,1,0,39217.0,Simon Hooper,2023-09-17 16:30:00,0.4,0.1,0.5


In [29]:
# In a per-club frame the club appears either in the "team" column or in the
# "opponent" column, so rows are re-oriented to make e.g. arsenal.csv always
# list Arsenal as "team".
def swap_columns(df, target_team):
    """Re-orient, in place, every row of ``df`` whose "team" is not ``target_team``.

    For each such row the function swaps "team" and "opponent", flips "venue"
    (Home <-> Away) and "result" (W <-> L; any other value is left untouched),
    exchanges the head-to-head win and lost ratios, and exchanges every
    ``*_team1`` / ``*_team2`` feature pair.

    Args:
        df: DataFrame containing the columns named above; modified in place.
        target_team: club name; rows whose "team" differs from it are swapped.

    Returns:
        None.
    """
    venue_flip = {"Home": "Away", "Away": "Home"}
    result_flip = {"W": "L", "L": "W"}

    # Column pairs whose values trade places when the perspective changes.
    mirrored_pairs = (
        ("head2head_win_percent", "head2head_lost_percent"),
        ("win_percent_before_team1", "win_percent_before_team2"),
        ("lose_percent_before_team1", "lose_percent_before_team2"),
        ("draw_percent_before_team1", "draw_percent_before_team2"),
        ("total_goals_before_team1", "total_goals_before_team2"),
        ("total_lost_goals_before_team1", "total_lost_goals_before_team2"),
        ("ELO_before_match_team1", "ELO_before_match_team2"),
        ("ELO_after_match_team1", "ELO_after_match_team2"),
        ("average_score_before_match_team1", "average_score_before_match_team2"),
    )

    for idx, row in df.iterrows():
        if row["team"] == target_team:
            continue

        df.at[idx, "team"], df.at[idx, "opponent"] = df.at[idx, "opponent"], df.at[idx, "team"]

        venue = df.at[idx, "venue"]
        if venue in venue_flip:
            df.at[idx, "venue"] = venue_flip[venue]

        result = df.at[idx, "result"]
        if result in result_flip:
            df.at[idx, "result"] = result_flip[result]

        for left, right in mirrored_pairs:
            df.at[idx, left], df.at[idx, right] = df.at[idx, right], df.at[idx, left]

In [30]:
# Clubs for which a separate per-team feature file is written below
target_teams = ["Arsenal", "Manchester City", "Manchester United"]

In [31]:
# Drop the raw match metadata, the scoreline (gf/ga) and the formation columns
# before the per-team files are exported
test_df = test_df.drop(
    columns=[
        "date",
        "time",
        "round",
        "attendance",
        "referee",
        "gf",
        "ga",
        "formation_team1",
        "formation_team2",
    ]
)

In [33]:
# Write one CSV per target club, with every row oriented so the club is the "team" side
for team in target_teams:
    involves_team = (test_df["team"] == team) | (test_df["opponent"] == team)
    team_df = test_df[involves_team].sort_values(by="date_time")

    # Re-orient the rows in which the club is listed as the opponent.
    swap_columns(team_df, team)

    # After the swap "team" always holds the club itself, so it is left out of the file.
    lower_name = team.replace(" ", "_").lower()
    output_df = team_df.drop(columns=["team"])
    output_df.to_csv(f"../../feature_engineering_data/test/{lower_name}.csv", index=False)

### IV. Đội đối thủ có phải big 6 hay không?

In [35]:
# "Big 6" club names, lower-cased once so the membership test below is case-insensitive.
big6_teams = ["Manchester United", "Manchester City", "Liverpool", "Chelsea", "Arsenal", "Tottenham Hotspur"]
big6_teams = [team.lower() for team in big6_teams]

def create_big6_team(file_name):
    """Add an ``is_opponent_big6`` column to a per-team CSV and rewrite the file in place.

    The column is 1 when the lower-cased "opponent" value is one of
    ``big6_teams`` and 0 otherwise. A missing opponent yields 0 instead of
    raising ``AttributeError`` as the previous per-row ``x.lower()`` call did.

    Args:
        file_name: path of a CSV file that contains an "opponent" column.

    Returns:
        None. The file at ``file_name`` is overwritten (without the index).
    """
    team_df = pd.read_csv(file_name)
    # astype(str) keeps the .str accessor usable even when the column holds NaN
    # only (read back as float); "nan" never matches a club name.
    opponent_names = team_df["opponent"].astype(str).str.lower()
    team_df["is_opponent_big6"] = opponent_names.isin(big6_teams).astype(int)
    team_df.to_csv(file_name, index=False)

# Add the big-6 flag to every exported per-team file
for team in target_teams:
    lower_name = team.replace(' ', '_').lower()
    create_big6_team(f"../../feature_engineering_data/test/{lower_name}.csv")