### Tạo ra các đặc trưng mới cho dữ liệu

In [3]:
import pandas as pd

### 1. Load dữ liệu

In [2]:
# Load dữ liệu thô đã thu thập cho từng đội bóng
matches_df = pd.read_csv("../../raw_data/clean_data.csv")

In [3]:
matches_df.shape

(5776, 12)

In [4]:
matches_df.head()

Unnamed: 0,date,time,round,venue,result,opponent,attendance,captain,formation,referee,season,team
0,2022-08-05,20:00,Matchweek 1,Away,W,Crystal Palace,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,2022-2023,Arsenal
1,2022-08-13,15:00,Matchweek 2,Home,W,Leicester City,60033.0,Martin Ødegaard,4-3-3,Darren England,2022-2023,Arsenal
2,2022-08-20,17:30,Matchweek 3,Away,W,Bournemouth,10423.0,Martin Ødegaard,4-3-3,Craig Pawson,2022-2023,Arsenal
3,2022-08-27,17:30,Matchweek 4,Home,W,Fulham,60164.0,Martin Ødegaard,4-3-3,Jarred Gillett,2022-2023,Arsenal
4,2022-08-31,19:30,Matchweek 5,Home,W,Aston Villa,60012.0,Martin Ødegaard,4-3-3,Robert Jones,2022-2023,Arsenal


In [5]:
# Tạo cột date_time để sắp xếp các trận tăng dần theo thời gian
matches_df["date_time"] = pd.to_datetime(matches_df["date"] + " " + matches_df["time"])
matches_df = matches_df.sort_values(by=["date_time", "referee"])

In [6]:
matches_df[matches_df["season"] == "2013-2014"][["date_time", "referee", "team", "opponent"]].head(20)

Unnamed: 0,date_time,referee,team,opponent
342,2013-08-17,Anthony Taylor,Arsenal,Aston Villa
608,2013-08-17,Anthony Taylor,Aston Villa,Arsenal
5548,2013-08-17,Howard Webb,West Ham United,Cardiff City
4788,2013-08-17,Kevin Friend,Southampton,West Bromwich Albion
3268,2013-08-17,Martin Atkinson,Liverpool,Stoke City
2280,2013-08-17,Michael Oliver,Everton,Norwich City
2432,2013-08-17,Neil Swarbrick,Fulham,Sunderland
4028,2013-08-17,Phil Dowd,Manchester United,Swansea City
1520,2013-08-18,Jon Moss,Chelsea,Hull City
1900,2013-08-18,Mark Clattenburg,Crystal Palace,Tottenham Hotspur


Dữ liệu hiện tại chưa có tính đối xứng, tức là 1 trận A đấu với B nhưng không có hàng dữ liệu B đấu với A trong ngày đó

In [7]:
def make_symmetric_data():
    # Nếu đang xét hàng của đội A đấu với đội B mà trong tập data không có
    # thông tin đội B đấu với đội A thì thêm một hàng B đấu với A vào tập data

    temp_df = matches_df.copy()
    print(temp_df.shape)
    new_rows = []

    for _, row in temp_df.iterrows():
        team1 = row["team"]
        team2 = row["opponent"]
        date_time = row["date_time"]
        referee = row["referee"]

        if len(temp_df[(temp_df["team"] == team2) & (temp_df["opponent"] == team1) & (temp_df["date_time"] == date_time)]) == 0:
            new_row = row.copy()
            new_row["team"] = team2
            new_row["opponent"] = team1
            new_row["referee"] = referee

            if row["result"] == "W":
                new_row["result"] = "L"
            elif row["result"] == "L":
                new_row["result"] = "W"

            # Thêm hàng mới vào cuối tập data
            new_rows.append(new_row)  # Thêm hàng mới vào danh sách

    temp_df = pd.concat([temp_df, pd.DataFrame(new_rows)], ignore_index=True)
    temp_df = temp_df.sort_values(by=["date_time", "referee"])
    return temp_df
    

In [8]:
matches_df = make_symmetric_data()

(5776, 13)


In [9]:
matches_df[matches_df["season"] == "2013-2014"][["date_time", "referee", "team", "opponent", "result"]].head(20)

Unnamed: 0,date_time,referee,team,opponent,result
0,2013-08-17,Anthony Taylor,Arsenal,Aston Villa,L
1,2013-08-17,Anthony Taylor,Aston Villa,Arsenal,W
2,2013-08-17,Howard Webb,West Ham United,Cardiff City,W
5776,2013-08-17,Howard Webb,Cardiff City,West Ham United,L
3,2013-08-17,Kevin Friend,Southampton,West Bromwich Albion,W
5777,2013-08-17,Kevin Friend,West Bromwich Albion,Southampton,L
4,2013-08-17,Martin Atkinson,Liverpool,Stoke City,W
5778,2013-08-17,Martin Atkinson,Stoke City,Liverpool,L
5,2013-08-17,Michael Oliver,Everton,Norwich City,D
5779,2013-08-17,Michael Oliver,Norwich City,Everton,D


In [10]:
# Dữ liệu bây giờ đã đối xứng, tiến hành lưu lại file csv
matches_df.to_csv("../../raw_data/symmetric_data.csv", index=False)

I. Tính toán phong độ của đội đang đi xây dựng model trong 5 trận gần nhất

In [11]:
matches_df["win_before"] = 0.0
matches_df["lose_before"] = 0.0
matches_df["draw_before"] = 0.0

for index, row in matches_df.iterrows():
    team = row['team']
    date_time = row["date_time"]
    season = row["season"]

    # Lấy các trận đấu trước đó của đội bóng
    previous_matches = matches_df[
        (matches_df["team"] == team)
        & (matches_df["date_time"] < date_time)
        & (matches_df["season"] == season)
    ]

    # Chỉ lấy 5 trận gần nhất
    previous_matches = previous_matches[-5:]

    # Đếm số trận thắng, hoà trước đó
    win_before = previous_matches[previous_matches["result"] == "W"].shape[0]
    lose_before = previous_matches[previous_matches["result"] == "L"].shape[0]
    draw_before = previous_matches[previous_matches["result"] == "D"].shape[0]

    # Tính tỉ lệ số trận thắng, hoà trước đó
    match_total = 1 if previous_matches.shape[0] == 0 else previous_matches.shape[0]  
    matches_df.at[index, "win_before"] = win_before / match_total
    matches_df.at[index, "lose_before"] = lose_before / match_total
    matches_df.at[index, "draw_before"] = draw_before / match_total

In [12]:
matches_df[["date_time", "referee", "team", "opponent", "result", "win_before", "lose_before", "draw_before"]].tail(10)

Unnamed: 0,date_time,referee,team,opponent,result,win_before,lose_before,draw_before
5766,2023-05-28 16:30:00,John Brooks,Brentford,Manchester City,W,0.8,0.2,0.0
5767,2023-05-28 16:30:00,John Brooks,Manchester City,Brentford,L,0.8,0.0,0.2
5768,2023-05-28 16:30:00,Robert Jones,Fulham,Manchester United,L,0.4,0.4,0.2
5769,2023-05-28 16:30:00,Robert Jones,Manchester United,Fulham,W,0.6,0.4,0.0
5770,2023-05-28 16:30:00,Simon Hooper,Leicester City,West Ham United,W,0.0,0.4,0.6
5771,2023-05-28 16:30:00,Simon Hooper,West Ham United,Leicester City,L,0.4,0.6,0.0
5772,2023-05-28 16:30:00,Stuart Attwell,Bournemouth,Everton,L,0.4,0.6,0.0
5773,2023-05-28 16:30:00,Stuart Attwell,Everton,Bournemouth,W,0.2,0.4,0.4
5774,2023-05-28 16:30:00,Thomas Bramall,Crystal Palace,Nottingham Forest,D,0.4,0.4,0.2
5775,2023-05-28 16:30:00,Thomas Bramall,Nottingham Forest,Crystal Palace,D,0.6,0.2,0.2


Gộp các trận đấu đối xứng

In [13]:
# Đảm bảo dữ liệu đã được sắp xếp theo thời gian, nếu 2 thời gian giống nhau thì sắp xếp theo referee
matches_df = matches_df.sort_values(by=["date_time", "referee"])

In [14]:
matches_df.head()

Unnamed: 0,date,time,round,venue,result,opponent,attendance,captain,formation,referee,season,team,date_time,win_before,lose_before,draw_before
0,2013-08-17,00:00,Matchweek 1,Home,L,Aston Villa,60003.0,,4-2-3-1,Anthony Taylor,2013-2014,Arsenal,2013-08-17,0.0,0.0,0.0
1,2013-08-17,00:00,Matchweek 1,Away,W,Arsenal,60003.0,,4-3-3,Anthony Taylor,2013-2014,Aston Villa,2013-08-17,0.0,0.0,0.0
2,2013-08-17,00:00,Matchweek 1,Home,W,Cardiff City,34977.0,,4-2-3-1,Howard Webb,2013-2014,West Ham United,2013-08-17,0.0,0.0,0.0
5776,2013-08-17,00:00,Matchweek 1,Home,L,West Ham United,34977.0,,4-2-3-1,Howard Webb,2013-2014,Cardiff City,2013-08-17,0.0,0.0,0.0
3,2013-08-17,00:00,Matchweek 1,Away,W,West Bromwich Albion,25927.0,,4-3-3,Kevin Friend,2013-2014,Southampton,2013-08-17,0.0,0.0,0.0


Chia tập data thành 2 loại: hàng lẻ và hàng chẵn, sau đó gộp các cột hàng chẵn vào hàng lẻ

In [15]:
matches_df_sorted_odd = matches_df.iloc[::2]
matches_df_sorted_even = matches_df.iloc[1::2]

In [16]:
matches_df_sorted_odd = matches_df_sorted_odd.rename(
    columns={
        "captain": "captain_team1",
        "formation": "formation_team1",
        "win_before": "win_before_team1",
        "lose_before": "lose_before_team1",
        "draw_before": "draw_before_team1",
    }
)
matches_df_sorted_odd.head()

Unnamed: 0,date,time,round,venue,result,opponent,attendance,captain_team1,formation_team1,referee,season,team,date_time,win_before_team1,lose_before_team1,draw_before_team1
0,2013-08-17,00:00,Matchweek 1,Home,L,Aston Villa,60003.0,,4-2-3-1,Anthony Taylor,2013-2014,Arsenal,2013-08-17,0.0,0.0,0.0
2,2013-08-17,00:00,Matchweek 1,Home,W,Cardiff City,34977.0,,4-2-3-1,Howard Webb,2013-2014,West Ham United,2013-08-17,0.0,0.0,0.0
3,2013-08-17,00:00,Matchweek 1,Away,W,West Bromwich Albion,25927.0,,4-3-3,Kevin Friend,2013-2014,Southampton,2013-08-17,0.0,0.0,0.0
4,2013-08-17,00:00,Matchweek 1,Home,W,Stoke City,44822.0,,4-2-3-1,Martin Atkinson,2013-2014,Liverpool,2013-08-17,0.0,0.0,0.0
5,2013-08-17,00:00,Matchweek 1,Away,D,Norwich City,26824.0,,4-4-1-1,Michael Oliver,2013-2014,Everton,2013-08-17,0.0,0.0,0.0


Lấy các cột cần thiết đối với hàng chẵn, đồng thời đổi tên chúng

In [17]:
matches_df_sorted_even = matches_df_sorted_even[
    ["captain", "formation", "win_before", "lose_before", "draw_before"]
].rename(
    columns={
        "captain": "captain_team2",
        "formation": "formation_team2",
        "win_before": "win_before_team2",
        "lose_before": "lose_before_team2",
        "draw_before": "draw_before_team2",
    }
)

matches_df_sorted_even.head()

Unnamed: 0,captain_team2,formation_team2,win_before_team2,lose_before_team2,draw_before_team2
1,,4-3-3,0.0,0.0,0.0
5776,,4-2-3-1,0.0,0.0,0.0
5777,,4-3-3,0.0,0.0,0.0
5778,,4-2-3-1,0.0,0.0,0.0
5779,,4-4-1-1,0.0,0.0,0.0


Thay đổi lại giá trị index của các hàng chẵn để có thể nối thêm vào các cột của hàng lẻ

In [18]:
matches_df_sorted_even.index = matches_df_sorted_odd.index
matches_df_sorted_even.head()

Unnamed: 0,captain_team2,formation_team2,win_before_team2,lose_before_team2,draw_before_team2
0,,4-3-3,0.0,0.0,0.0
2,,4-2-3-1,0.0,0.0,0.0
3,,4-3-3,0.0,0.0,0.0
4,,4-2-3-1,0.0,0.0,0.0
5,,4-4-1-1,0.0,0.0,0.0


In [19]:
matches_groupby = pd.concat([matches_df_sorted_odd, matches_df_sorted_even], axis=1)
matches_groupby.head()

Unnamed: 0,date,time,round,venue,result,opponent,attendance,captain_team1,formation_team1,referee,...,team,date_time,win_before_team1,lose_before_team1,draw_before_team1,captain_team2,formation_team2,win_before_team2,lose_before_team2,draw_before_team2
0,2013-08-17,00:00,Matchweek 1,Home,L,Aston Villa,60003.0,,4-2-3-1,Anthony Taylor,...,Arsenal,2013-08-17,0.0,0.0,0.0,,4-3-3,0.0,0.0,0.0
2,2013-08-17,00:00,Matchweek 1,Home,W,Cardiff City,34977.0,,4-2-3-1,Howard Webb,...,West Ham United,2013-08-17,0.0,0.0,0.0,,4-2-3-1,0.0,0.0,0.0
3,2013-08-17,00:00,Matchweek 1,Away,W,West Bromwich Albion,25927.0,,4-3-3,Kevin Friend,...,Southampton,2013-08-17,0.0,0.0,0.0,,4-3-3,0.0,0.0,0.0
4,2013-08-17,00:00,Matchweek 1,Home,W,Stoke City,44822.0,,4-2-3-1,Martin Atkinson,...,Liverpool,2013-08-17,0.0,0.0,0.0,,4-2-3-1,0.0,0.0,0.0
5,2013-08-17,00:00,Matchweek 1,Away,D,Norwich City,26824.0,,4-4-1-1,Michael Oliver,...,Everton,2013-08-17,0.0,0.0,0.0,,4-4-1-1,0.0,0.0,0.0


Sắp xếp lại các cột cho hợp lý

In [20]:
removed_cols = [
    "team",
    "opponent",
    "venue",
    "season",
    "date",
    "time",
    "result",
    "win_before_team1",
    "lose_before_team1",
    "draw_before_team1",
    "win_before_team2",
    "lose_before_team2",
    "draw_before_team2",
    "captain_team1",
    "formation_team1",
    "captain_team2",
    "formation_team2",
]

cols = [col for col in matches_groupby.columns if col not in removed_cols]
matches_groupby = matches_groupby[removed_cols + cols]

matches_groupby.sort_values(by=['team', 'date_time'] , inplace=True)
matches_groupby.head()

Unnamed: 0,team,opponent,venue,season,date,time,result,win_before_team1,lose_before_team1,draw_before_team1,...,lose_before_team2,draw_before_team2,captain_team1,formation_team1,captain_team2,formation_team2,round,attendance,referee,date_time
0,Arsenal,Aston Villa,Home,2013-2014,2013-08-17,00:00,L,0.0,0.0,0.0,...,0.0,0.0,,4-2-3-1,,4-3-3,Matchweek 1,60003.0,Anthony Taylor,2013-08-17
16,Arsenal,Fulham,Away,2013-2014,2013-08-24,00:00,W,0.0,1.0,0.0,...,0.0,0.0,,4-2-3-1,,4-4-1-1,Matchweek 2,25622.0,Howard Webb,2013-08-24
37,Arsenal,Tottenham Hotspur,Home,2013-2014,2013-09-01,00:00,W,0.5,0.5,0.0,...,0.0,0.0,,4-2-3-1,,4-2-3-1,Matchweek 3,60071.0,Michael Oliver,2013-09-01
46,Arsenal,Sunderland,Away,2013-2014,2013-09-14,00:00,W,0.666667,0.333333,0.0,...,0.666667,0.333333,,4-2-3-1,,4-2-3-1,Matchweek 4,39055.0,Martin Atkinson,2013-09-14
64,Arsenal,Stoke City,Home,2013-2014,2013-09-22,00:00,W,0.75,0.25,0.0,...,0.25,0.25,,4-2-3-1,,4-2-3-1,Matchweek 5,60002.0,Mike Dean,2013-09-22


In [21]:
matches_groupby.columns

Index(['team', 'opponent', 'venue', 'season', 'date', 'time', 'result',
       'win_before_team1', 'lose_before_team1', 'draw_before_team1',
       'win_before_team2', 'lose_before_team2', 'draw_before_team2',
       'captain_team1', 'formation_team1', 'captain_team2', 'formation_team2',
       'round', 'attendance', 'referee', 'date_time'],
      dtype='object')

In [22]:
# Xoá một số cột không còn cần thiết nữa
matches_groupby.drop(columns=["season", "date", "time", "captain_team1", "captain_team2"], inplace=True)

In [23]:
matches_groupby.columns

Index(['team', 'opponent', 'venue', 'result', 'win_before_team1',
       'lose_before_team1', 'draw_before_team1', 'win_before_team2',
       'lose_before_team2', 'draw_before_team2', 'formation_team1',
       'formation_team2', 'round', 'attendance', 'referee', 'date_time'],
      dtype='object')

II. Tính toán lịch sử sử đối đầu giữa 2 đội

In [24]:
# Định nghĩa hàm để tính số lần thắng, thua và hòa của mỗi đội so với đối thủ
def calculate_previous_outcomes(df):
    # Tạo các cột mới
    df["history_team1_win_team2"] = 0.0
    df["history_team1_draw_team2"] = 0.0
    df["history_team1_lose_team2"] = 0.0

    # Duyệt qua từng hàng trong DataFrame
    for index, row in df.iterrows():
        team = row["team"]
        opponent = row["opponent"]

        # Tìm các trận đấu trước đó của đội
        previous_matches = df[
            (df["date_time"] < row["date_time"])
            & (
                ((df["team"] == team) & (df["opponent"] == opponent))
                | ((df["opponent"] == team) & (df["team"] == opponent))
            )
        ]

        matches_total = previous_matches.shape[0]
        if matches_total > 0:
            # Đếm số lần thắng, thua và hòa của đội
            wins = len(
                previous_matches[
                    ((previous_matches["team"] == team) & (previous_matches["result"] == "W"))
                    | ((previous_matches["opponent"] == team) & (previous_matches["result"] == "L"))
                ]
            )

            draws = len(
                previous_matches[
                    ((previous_matches["team"] == team) & (previous_matches["result"] == "D"))
                    | ((previous_matches["opponent"] == team) & (previous_matches["result"] == "D"))
                ]
            )
            
            losses = len(
                previous_matches[
                    ((previous_matches["team"] == team) & (previous_matches["result"] == "L"))
                    | ((previous_matches["opponent"] == team) & (previous_matches["result"] == "W"))
                ]
            )

            df.at[index, "history_team1_win_team2"] = wins / matches_total
            df.at[index, "history_team1_draw_team2"] = draws / matches_total
            df.at[index, "history_team1_lose_team2"] = losses / matches_total

    return df

In [25]:
matches_groupby = calculate_previous_outcomes(matches_groupby)

In [26]:
matches_groupby

Unnamed: 0,team,opponent,venue,result,win_before_team1,lose_before_team1,draw_before_team1,win_before_team2,lose_before_team2,draw_before_team2,formation_team1,formation_team2,round,attendance,referee,date_time,history_team1_win_team2,history_team1_draw_team2,history_team1_lose_team2
0,Arsenal,Aston Villa,Home,L,0.000000,0.000000,0.0,0.0,0.000000,0.000000,4-2-3-1,4-3-3,Matchweek 1,60003.0,Anthony Taylor,2013-08-17 00:00:00,0.000000,0.000000,0.000000
16,Arsenal,Fulham,Away,W,0.000000,1.000000,0.0,1.0,0.000000,0.000000,4-2-3-1,4-4-1-1,Matchweek 2,25622.0,Howard Webb,2013-08-24 00:00:00,0.000000,0.000000,0.000000
37,Arsenal,Tottenham Hotspur,Home,W,0.500000,0.500000,0.0,1.0,0.000000,0.000000,4-2-3-1,4-2-3-1,Matchweek 3,60071.0,Michael Oliver,2013-09-01 00:00:00,0.000000,0.000000,0.000000
46,Arsenal,Sunderland,Away,W,0.666667,0.333333,0.0,0.0,0.666667,0.333333,4-2-3-1,4-2-3-1,Matchweek 4,39055.0,Martin Atkinson,2013-09-14 00:00:00,0.000000,0.000000,0.000000
64,Arsenal,Stoke City,Home,W,0.750000,0.250000,0.0,0.5,0.250000,0.250000,4-2-3-1,4-2-3-1,Matchweek 5,60002.0,Mike Dean,2013-09-22 00:00:00,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4580,Wolverhampton Wanderers,Norwich City,Away,D,0.600000,0.200000,0.2,0.4,0.400000,0.200000,3-4-3,3-4-3,Matchweek 13,26911.0,Simon Hooper,2021-11-27 15:00:00,1.000000,0.000000,0.000000
4596,Wolverhampton Wanderers,Burnley,Home,D,0.400000,0.200000,0.4,0.2,0.200000,0.600000,3-4-3,3-4-3,Matchweek 14,30328.0,John Brooks,2021-12-01 19:30:00,0.166667,0.333333,0.500000
4824,Wolverhampton Wanderers,Watford,Home,W,0.400000,0.600000,0.0,0.2,0.600000,0.200000,3-4-3,3-4-3,Matchweek 19,29658.0,Darren England,2022-03-10 19:30:00,0.600000,0.000000,0.400000
4924,Wolverhampton Wanderers,Burnley,Away,L,0.600000,0.400000,0.0,0.4,0.400000,0.200000,3-4-3,3-4-3,Matchweek 34,19246.0,Anthony Taylor,2022-04-24 14:00:00,0.142857,0.428571,0.428571


In [27]:
matches_groupby["team"].unique()

array(['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford',
       'Brighton and Hove Albion', 'Chelsea', 'Crystal Palace', 'Everton',
       'Fulham', 'Leeds United', 'Leicester City', 'Liverpool',
       'Manchester City', 'Manchester United', 'Newcastle United',
       'Nottingham Forest', 'Southampton', 'Tottenham Hotspur',
       'West Ham United', 'Wolverhampton Wanderers'], dtype=object)

In [28]:
# Hiện tại với file data của từng đội thì đội đó hoặc xuất hiện ở cột team hoặc xuất hiện ở cột component
# nên sẽ tiến hành hoán vị lại sao cho với file arsenal.csv thì cột team phải luôn là arsenal

def swap_columns(df, target_team):
    for idx, row in df.iterrows():
        if row["team"] != target_team:
            df.at[idx, "team"], df.at[idx, "opponent"] = df.at[idx, "opponent"], df.at[idx, "team"]
            
            if df.at[idx, "venue"] == "Home":
                df.at[idx, "venue"] = "Away"
            elif df.at[idx, "venue"] == "Away":
                df.at[idx, "venue"] = "Home"

            if df.at[idx, "result"] == "W":
                df.at[idx, "result"] = "L"
            elif df.at[idx, "result"] == "L":
                df.at[idx, "result"] = "W"

            # Đổi chỗ cột lịch sử đối đầu
            df.at[idx, "history_team1_win_team2"], df.at[idx, "history_team1_lose_team2"] = \
                df.at[idx, "history_team1_lose_team2"], df.at[idx, "history_team1_win_team2"]
            
            # Đổi chỗ các cột phong độ của 2 đội
            df.at[idx, "win_before_team1"], df.at[idx, "win_before_team2"] = \
                df.at[idx, "win_before_team2"], df.at[idx, "win_before_team1"]
            
            df.at[idx, "lose_before_team1"], df.at[idx, "lose_before_team2"] = \
                df.at[idx, "lose_before_team2"], df.at[idx, "lose_before_team1"]
            
            df.at[idx, "draw_before_team1"], df.at[idx, "draw_before_team2"] = \
                df.at[idx, "draw_before_team2"], df.at[idx, "draw_before_team1"]

In [2]:
target_teams = ["Arsenal", "Manchester City", "Manchester United"]

In [35]:
# Lưu từng đội vào file csv
for team in target_teams:
    team_df = matches_groupby[(matches_groupby["team"] == team) | (matches_groupby["opponent"] == team)]
    team_df = team_df.sort_values(by="date_time")
    lower_name = team.replace(" ", "_").lower()
    team_df.drop(columns=["history_team1_lose_team2"]).to_csv(f"../training_data/{lower_name}_origin.csv", index=False)
    swap_columns(team_df, team)
    team_df.drop(columns=["history_team1_lose_team2"]).to_csv(f"../training_data/{lower_name}_swap.csv", index=False)

III. Đội đối thủ có phải big 6 hay không?

In [19]:
big6_teams = ["Manchester United", "Manchester City", "Liverpool", "Chelsea", "Arsenal", "Tottenham Hotspur"]
big6_teams = [team.lower() for team in big6_teams]

def create_big6_team(file_name):
    team_df = pd.read_csv(file_name)
    team_df["is_opponent_big6"] = team_df["opponent"].apply(lambda x: 1 if x.lower() in big6_teams else 0)
    team_df.to_csv(file_name, index=False)

for team in target_teams:
    create_big6_team(f"../training_data/{team.replace(' ', '_').lower()}_origin.csv")
    create_big6_team(f"../training_data/{team.replace(' ', '_').lower()}_swap.csv")


### Kết luận. 
Như vậy đã bổ sung thêm các đặc trưng:
- Phong độ hiện tại (%W, %L, %D) của 2 đội trong 5 trận gần nhất của mùa giải
- Lịch sử đối đầu (%W, %D) của đội đang đi xây dựng model với đối thủ
- Đội đối thủ có phải là big 6 hay không?