### Tạo ra các đặc trưng mới cho dữ liệu

In [1]:
import pandas as pd

### 1. Load dữ liệu

In [2]:
# Load dữ liệu thô đã thu thập cho từng đội bóng
matches_df = pd.read_csv("../../raw_data/clean_data.csv")

In [3]:
matches_df.shape

(17480, 13)

In [4]:
matches_df.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation,referee,season,team
0,8/7/2022,16:30,Matchweek 1,Away,W,2,0,West Ham United,62443.0,4-3-3,Michael Oliver,2022-2023,Manchester City
1,8/13/2022,15:00,Matchweek 2,Home,W,4,0,Bournemouth,53453.0,4-2-3-1,David Coote,2022-2023,Manchester City
2,8/21/2022,16:30,Matchweek 3,Away,D,3,3,Newcastle United,52258.0,4-3-3,Jarred Gillett,2022-2023,Manchester City
3,8/27/2022,15:00,Matchweek 4,Home,W,4,2,Crystal Palace,53112.0,4-2-3-1,Darren England,2022-2023,Manchester City
4,8/31/2022,19:30,Matchweek 5,Home,W,6,0,Nottingham Forest,53409.0,4-2-3-1,Paul Tierney,2022-2023,Manchester City


In [5]:
# Tạo cột date_time để sắp xếp các trận tăng dần theo thời gian
matches_df["date_time"] = pd.to_datetime(matches_df["date"] + " " + matches_df["time"])
matches_df = matches_df.sort_values(by=["date_time", "referee"])

### Kiểm tra tính đối xứng của dữ liệu

In [6]:
matches_df[matches_df["season"] == "2013-2014"][["date_time", "referee", "team", "opponent"]].head(20)

Unnamed: 0,date_time,referee,team,opponent
6954,2013-08-17,Anthony Taylor,Arsenal,Aston Villa
7372,2013-08-17,Anthony Taylor,Aston Villa,Arsenal
7296,2013-08-17,Howard Webb,West Ham United,Cardiff City
7562,2013-08-17,Howard Webb,Cardiff City,West Ham United
7106,2013-08-17,Kevin Friend,Southampton,West Bromwich Albion
7448,2013-08-17,Kevin Friend,West Bromwich Albion,Southampton
6878,2013-08-17,Martin Atkinson,Liverpool,Stoke City
7144,2013-08-17,Martin Atkinson,Stoke City,Liverpool
6992,2013-08-17,Michael Oliver,Everton,Norwich City
7486,2013-08-17,Michael Oliver,Norwich City,Everton


In [7]:
matches_df[matches_df["season"] == "2001-2002"][
    ["date_time", "referee", "team", "opponent"]
].head(20)

Unnamed: 0,date_time,referee,team,opponent
16112,2001-08-18,Clive Wilkes,Leeds United,Southampton
16340,2001-08-18,Clive Wilkes,Southampton,Leeds United
16226,2001-08-18,Dermot Gallagher,Aston Villa,Tottenham Hotspur
16264,2001-08-18,Dermot Gallagher,Tottenham Hotspur,Aston Villa
15960,2001-08-18,Graham Barber,Arsenal,Middlesbrough
16378,2001-08-18,Graham Barber,Middlesbrough,Arsenal
16568,2001-08-18,Graham Poll,Sunderland,Ipswich Town
16606,2001-08-18,Graham Poll,Ipswich Town,Sunderland
15998,2001-08-18,Jeff Winter,Liverpool,West Ham United
16188,2001-08-18,Jeff Winter,West Ham United,Liverpool


### Để cho chắc chắn về đảm bảo tính đối xứng thì sẽ thực hiện hàm sau:

In [8]:
def make_symmetric_data():
    # Nếu đang xét hàng của đội A đấu với đội B mà trong tập data không có
    # thông tin đội B đấu với đội A thì thêm một hàng B đấu với A vào tập data

    temp_df = matches_df.copy()
    print(temp_df.shape)
    new_rows = []

    for _, row in temp_df.iterrows():
        team1 = row["team"]
        team2 = row["opponent"]
        date_time = row["date_time"]
        referee = row["referee"]

        if len(temp_df[(temp_df["team"] == team2) & (temp_df["opponent"] == team1) & (temp_df["date_time"] == date_time)]) == 0:
            new_row = row.copy()
            new_row["team"] = team2
            new_row["opponent"] = team1
            new_row["referee"] = referee

            if row["result"] == "W":
                new_row["result"] = "L"
            elif row["result"] == "L":
                new_row["result"] = "W"

            # Thêm hàng mới vào cuối tập data
            new_rows.append(new_row)  # Thêm hàng mới vào danh sách

    temp_df = pd.concat([temp_df, pd.DataFrame(new_rows)], ignore_index=True)
    temp_df = temp_df.sort_values(by=["date_time", "referee"])
    return temp_df
    

In [9]:
matches_df_temp = make_symmetric_data()

(17480, 14)


In [10]:
matches_df_temp.shape

(17480, 14)

In [11]:
# Dữ liệu đã đối xứng rồi
matches_df = matches_df_temp

### I. Tính toán phong độ và số bàn thắng,thua của 2 đội trong 5 trận gần nhất

In [12]:
matches_df["win_percent_before"] = 0.0
matches_df["lose_percent_before"] = 0.0
matches_df["draw_percent_before"] = 0.0

matches_df["total_goals_before"] = 0
matches_df["total_lost_goals_before"] = 0

for index, row in matches_df.iterrows():
    team = row['team']
    date_time = row["date_time"]
    season = row["season"]

    # Lấy các trận đấu trước đó của đội bóng
    previous_matches = matches_df[
        (matches_df["team"] == team)
        & (matches_df["date_time"] < date_time)
        & (matches_df["season"] == season)
    ]

    # Chỉ lấy 5 trận gần nhất
    previous_matches = previous_matches[-5:]

    # Đếm số trận thắng, hoà trước đó
    win_before = previous_matches[previous_matches["result"] == "W"].shape[0]
    lose_before = previous_matches[previous_matches["result"] == "L"].shape[0]
    draw_before = previous_matches[previous_matches["result"] == "D"].shape[0]

    # Tính tổng số bàn thắng, bàn thua trước đó
    total_goals_before = previous_matches["gf"].sum()
    total_lost_goals_before = previous_matches["ga"].sum()

    # Tính tỉ lệ số trận thắng, hoà trước đó
    match_total = 1 if previous_matches.shape[0] == 0 else previous_matches.shape[0]  
    matches_df.at[index, "win_percent_before"] = win_before / match_total
    matches_df.at[index, "lose_percent_before"] = lose_before / match_total
    matches_df.at[index, "draw_percent_before"] = draw_before / match_total

    # Tổng số bàn thắng, bàn thua trước đó
    matches_df.at[index, "total_goals_before"] = total_goals_before
    matches_df.at[index, "total_lost_goals_before"] = total_lost_goals_before

In [13]:
matches_df[
    [
        "date_time",
        "team",
        "opponent",
        "result",
        "win_percent_before",
        "lose_percent_before",
        "draw_percent_before",
        "total_goals_before",
        "total_lost_goals_before",
    ]
].tail(10)

Unnamed: 0,date_time,team,opponent,result,win_percent_before,lose_percent_before,draw_percent_before,total_goals_before,total_lost_goals_before
17470,2023-05-28 16:30:00,Manchester City,Brentford,L,0.8,0.0,0.2,10,2
17471,2023-05-28 16:30:00,Brentford,Manchester City,W,0.8,0.2,0.0,9,3
17472,2023-05-28 16:30:00,Manchester United,Fulham,W,0.6,0.4,0.0,7,3
17473,2023-05-28 16:30:00,Fulham,Manchester United,L,0.4,0.4,0.2,10,8
17474,2023-05-28 16:30:00,West Ham United,Leicester City,L,0.4,0.6,0.0,7,10
17475,2023-05-28 16:30:00,Leicester City,West Ham United,W,0.0,0.4,0.6,6,11
17476,2023-05-28 16:30:00,Bournemouth,Everton,L,0.4,0.6,0.0,6,7
17477,2023-05-28 16:30:00,Everton,Bournemouth,W,0.2,0.4,0.4,9,11
17478,2023-05-28 16:30:00,Crystal Palace,Nottingham Forest,D,0.4,0.4,0.2,8,8
17479,2023-05-28 16:30:00,Nottingham Forest,Crystal Palace,D,0.6,0.2,0.2,11,8


#### II. Tính toán chỉ số ELO của 2 đội

In [14]:
def get_game_different(score_different):
    if score_different == 0:
        return 0
    if score_different == 1:
        return 0.25
    elif score_different == 2:
        return 0.5
    elif score_different == 3:
        return 0.75
    else:
        return 0.75 + (score_different - 3) / 8


def get_result(result):
    if result == "W":
        return 1
    elif result == "D":
        return 0.5
    else:
        return 0


def get_expected_result(elo_diff):
    return 1 / (1 + 10 ** (elo_diff / 400))

In [15]:
k = 50
for index, row in matches_df.iterrows():
    previous_matches_of_team = matches_df[
        (matches_df["team"] == row["team"]) & (matches_df["date_time"] < row["date_time"]) & (matches_df["season"] == row["season"])
    ][-1:]

    previous_matches_of_opponent = matches_df[
        (matches_df["team"] == row["opponent"])
        & (matches_df["date_time"] < row["date_time"]) & (matches_df["season"] == row["season"])
    ][-1:]

    score_different = abs(row["gf"] - row["ga"])

    if previous_matches_of_team.shape[0] == 0 or previous_matches_of_opponent.shape[0] == 0:
        elo_different = 0
        matches_df.at[index, "ELO_before_match"] = 1500
        matches_df.at[index, "ELO_after_match"] = matches_df.at[index, "ELO_before_match"] + (
            k + k * get_game_different(score_different)
        ) * (get_result(row["result"]) - get_expected_result(elo_different))
    else:
        elo_different = (
            previous_matches_of_opponent["ELO_after_match"].values[0]
            - previous_matches_of_team["ELO_after_match"].values[0]
        )

        if row["venue"] == "Away":
            elo_different *= -1

        matches_df.at[index, "ELO_before_match"] = previous_matches_of_team["ELO_after_match"].values[0]
        matches_df.at[index, "ELO_after_match"] = matches_df.at[index, "ELO_before_match"] + (
            k + k * get_game_different(score_different)
        ) * (get_result(row["result"]) - get_expected_result(elo_different))


In [16]:
matches_df.to_csv("temp.csv", index=False)

Gộp các trận đấu đối xứng

In [17]:
# Đảm bảo dữ liệu đã được sắp xếp theo thời gian, nếu 2 thời gian giống nhau thì sắp xếp theo referee
matches_df = matches_df.sort_values(by=["date_time", "referee"])

In [18]:
matches_df.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation,...,season,team,date_time,win_percent_before,lose_percent_before,draw_percent_before,total_goals_before,total_lost_goals_before,ELO_before_match,ELO_after_match
0,8/19/2000,00:00,Matchweek 1,Away,L,1,3,Tottenham Hotspur,36148.0,4-3-3,...,2000-2001,Ipswich Town,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1462.5
1,8/19/2000,00:00,Matchweek 1,Home,W,3,1,Ipswich Town,36148.0,4-3-3,...,2000-2001,Tottenham Hotspur,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1537.5
2,8/19/2000,00:00,Matchweek 1,Away,D,2,2,Derby County,27223.0,4-3-3,...,2000-2001,Southampton,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1500.0
3,8/19/2000,00:00,Matchweek 1,Home,D,2,2,Southampton,27223.0,4-3-3,...,2000-2001,Derby County,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1500.0
4,8/19/2000,00:00,Matchweek 1,Away,W,3,1,Coventry City,20624.0,4-3-3,...,2000-2001,Middlesbrough,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1537.5


Chia tập data thành 2 loại: hàng lẻ và hàng chẵn, sau đó gộp các cột hàng chẵn vào hàng lẻ

In [19]:
matches_df_sorted_odd = matches_df.iloc[::2]
matches_df_sorted_even = matches_df.iloc[1::2]

In [20]:
matches_df_sorted_odd = matches_df_sorted_odd.rename(
    columns={
        "formation": "formation_team1",
        "win_percent_before": "win_percent_before_team1",
        "lose_percent_before": "lose_percent_before_team1",
        "draw_percent_before": "draw_percent_before_team1",
        "total_goals_before": "total_goals_before_team1",
        "total_lost_goals_before": "total_lost_goals_before_team1",
        "ELO_before_match": "ELO_before_match_team1",
        "ELO_after_match": "ELO_after_match_team1",
    }
)
matches_df_sorted_odd.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation_team1,...,season,team,date_time,win_percent_before_team1,lose_percent_before_team1,draw_percent_before_team1,total_goals_before_team1,total_lost_goals_before_team1,ELO_before_match_team1,ELO_after_match_team1
0,8/19/2000,00:00,Matchweek 1,Away,L,1,3,Tottenham Hotspur,36148.0,4-3-3,...,2000-2001,Ipswich Town,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1462.5
2,8/19/2000,00:00,Matchweek 1,Away,D,2,2,Derby County,27223.0,4-3-3,...,2000-2001,Southampton,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1500.0
4,8/19/2000,00:00,Matchweek 1,Away,W,3,1,Coventry City,20624.0,4-3-3,...,2000-2001,Middlesbrough,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1537.5
6,8/19/2000,00:00,Matchweek 1,Home,W,2,0,Everton,40010.0,4-3-3,...,2000-2001,Leeds United,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1537.5
8,8/19/2000,00:00,Matchweek 1,Home,W,4,2,West Ham United,34914.0,4-3-3,...,2000-2001,Chelsea,2000-08-19,0.0,0.0,0.0,0,0,1500.0,1537.5


Lấy các cột cần thiết đối với hàng chẵn, đồng thời đổi tên chúng

In [21]:
matches_df_sorted_even = matches_df_sorted_even[
    [
        "formation",
        "win_percent_before",
        "lose_percent_before",
        "draw_percent_before",
        "total_goals_before",
        "total_lost_goals_before",
        "ELO_before_match",
        "ELO_after_match",
    ]
].rename(
    columns={
        "formation": "formation_team2",
        "win_percent_before": "win_percent_before_team2",
        "lose_percent_before": "lose_percent_before_team2",
        "draw_percent_before": "draw_percent_before_team2",
        "total_goals_before": "total_goals_before_team2",
        "total_lost_goals_before": "total_lost_goals_before_team2",
        "ELO_before_match": "ELO_before_match_team2",
        "ELO_after_match": "ELO_after_match_team2",
    }
)

matches_df_sorted_even.head()

Unnamed: 0,formation_team2,win_percent_before_team2,lose_percent_before_team2,draw_percent_before_team2,total_goals_before_team2,total_lost_goals_before_team2,ELO_before_match_team2,ELO_after_match_team2
1,4-3-3,0.0,0.0,0.0,0,0,1500.0,1537.5
3,4-3-3,0.0,0.0,0.0,0,0,1500.0,1500.0
5,4-3-3,0.0,0.0,0.0,0,0,1500.0,1462.5
7,4-3-3,0.0,0.0,0.0,0,0,1500.0,1462.5
9,4-3-3,0.0,0.0,0.0,0,0,1500.0,1462.5


Thay đổi lại giá trị index của các hàng chẵn để có thể nối thêm vào các cột của hàng lẻ

In [22]:
matches_df_sorted_even.index = matches_df_sorted_odd.index
matches_df_sorted_even.head()

Unnamed: 0,formation_team2,win_percent_before_team2,lose_percent_before_team2,draw_percent_before_team2,total_goals_before_team2,total_lost_goals_before_team2,ELO_before_match_team2,ELO_after_match_team2
0,4-3-3,0.0,0.0,0.0,0,0,1500.0,1537.5
2,4-3-3,0.0,0.0,0.0,0,0,1500.0,1500.0
4,4-3-3,0.0,0.0,0.0,0,0,1500.0,1462.5
6,4-3-3,0.0,0.0,0.0,0,0,1500.0,1462.5
8,4-3-3,0.0,0.0,0.0,0,0,1500.0,1462.5


In [23]:
matches_groupby = pd.concat([matches_df_sorted_odd, matches_df_sorted_even], axis=1)
matches_groupby.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation_team1,...,ELO_before_match_team1,ELO_after_match_team1,formation_team2,win_percent_before_team2,lose_percent_before_team2,draw_percent_before_team2,total_goals_before_team2,total_lost_goals_before_team2,ELO_before_match_team2,ELO_after_match_team2
0,8/19/2000,00:00,Matchweek 1,Away,L,1,3,Tottenham Hotspur,36148.0,4-3-3,...,1500.0,1462.5,4-3-3,0.0,0.0,0.0,0,0,1500.0,1537.5
2,8/19/2000,00:00,Matchweek 1,Away,D,2,2,Derby County,27223.0,4-3-3,...,1500.0,1500.0,4-3-3,0.0,0.0,0.0,0,0,1500.0,1500.0
4,8/19/2000,00:00,Matchweek 1,Away,W,3,1,Coventry City,20624.0,4-3-3,...,1500.0,1537.5,4-3-3,0.0,0.0,0.0,0,0,1500.0,1462.5
6,8/19/2000,00:00,Matchweek 1,Home,W,2,0,Everton,40010.0,4-3-3,...,1500.0,1537.5,4-3-3,0.0,0.0,0.0,0,0,1500.0,1462.5
8,8/19/2000,00:00,Matchweek 1,Home,W,4,2,West Ham United,34914.0,4-3-3,...,1500.0,1537.5,4-3-3,0.0,0.0,0.0,0,0,1500.0,1462.5


Sắp xếp lại các cột cho hợp lý

In [24]:
removed_cols = [
    "team",
    "opponent",
    "venue",
    "season",
    "date",
    "time",
    "result",
    "win_percent_before_team1",
    "lose_percent_before_team1",
    "draw_percent_before_team1",
    "win_percent_before_team2",
    "lose_percent_before_team2",
    "draw_percent_before_team2",
    "total_goals_before_team1",
    "total_lost_goals_before_team1",
    "total_goals_before_team2",
    "total_lost_goals_before_team2",
    "formation_team1",
    "formation_team2",
    "ELO_before_match_team1",
    "ELO_before_match_team2",
    "ELO_after_match_team1",
    "ELO_after_match_team2",
]

cols = [col for col in matches_groupby.columns if col not in removed_cols]
matches_groupby = matches_groupby[removed_cols + cols]

matches_groupby.sort_values(by=['team', 'date_time'] , inplace=True)
matches_groupby.head()

Unnamed: 0,team,opponent,venue,season,date,time,result,win_percent_before_team1,lose_percent_before_team1,draw_percent_before_team1,...,ELO_before_match_team1,ELO_before_match_team2,ELO_after_match_team1,ELO_after_match_team2,round,gf,ga,attendance,referee,date_time
16,Arsenal,Sunderland,Away,2000-2001,8/19/2000,00:00,L,0.0,0.0,0.0,...,1500.0,1500.0,1468.75,1531.25,Matchweek 1,0,1,47121.0,Steve Dunn,2000-08-19
20,Arsenal,Liverpool,Home,2000-2001,8/21/2000,00:00,W,0.0,1.0,0.0,...,1468.75,1531.25,1512.924019,1500.424019,Matchweek 2,2,0,38014.0,Graham Poll,2000-08-21
54,Arsenal,Charlton Athletic,Home,2000-2001,8/26/2000,00:00,W,0.5,0.5,0.0,...,1512.924019,1513.545664,1550.491115,1476.112761,Matchweek 3,5,3,38025.0,Stephen Lodge,2000-08-26
70,Arsenal,Chelsea,Away,2000-2001,9/6/2000,00:00,D,0.666667,0.333333,0.0,...,1550.491115,1507.852086,1553.543929,1510.9049,Matchweek 4,2,2,34923.0,Mike Riley,2000-09-06
78,Arsenal,Bradford City,Away,2000-2001,9/9/2000,00:00,D,0.5,0.25,0.25,...,1553.543929,1456.587085,1560.344898,1463.388055,Matchweek 5,1,1,17160.0,Alan Wiley,2000-09-09


In [25]:
matches_groupby.columns

Index(['team', 'opponent', 'venue', 'season', 'date', 'time', 'result',
       'win_percent_before_team1', 'lose_percent_before_team1',
       'draw_percent_before_team1', 'win_percent_before_team2',
       'lose_percent_before_team2', 'draw_percent_before_team2',
       'total_goals_before_team1', 'total_lost_goals_before_team1',
       'total_goals_before_team2', 'total_lost_goals_before_team2',
       'formation_team1', 'formation_team2', 'ELO_before_match_team1',
       'ELO_before_match_team2', 'ELO_after_match_team1',
       'ELO_after_match_team2', 'round', 'gf', 'ga', 'attendance', 'referee',
       'date_time'],
      dtype='object')

### II. Tính toán lịch sử sử đối đầu giữa 2 đội trong 10 trận gần nhất

In [26]:
# Định nghĩa hàm để tính số lần thắng, thua và hòa của mỗi đội so với đối thủ
def calculate_previous_outcomes(df):
    # Tạo các cột mới
    df["head2head_win_percent"] = 0.0
    df["head2head_draw_percent"] = 0.0
    df["head2head_lost_percent"] = 0.0

    # Duyệt qua từng hàng trong DataFrame
    for index, row in df.iterrows():
        team = row["team"]
        opponent = row["opponent"]

        # Tìm các trận đấu trước đó của đội
        previous_matches = df[
            (df["date_time"] < row["date_time"])
            & (
                ((df["team"] == team) & (df["opponent"] == opponent))
                | ((df["opponent"] == team) & (df["team"] == opponent))
            )
        ]

        # Lấy 10 trận gần nhất
        previous_matches = previous_matches[-10:]
        matches_total = previous_matches.shape[0]
        if matches_total > 0:
            # Đếm số lần thắng, thua và hòa của đội
            wins = len(
                previous_matches[
                    ((previous_matches["team"] == team) & (previous_matches["result"] == "W"))
                    | ((previous_matches["opponent"] == team) & (previous_matches["result"] == "L"))
                ]
            )

            draws = len(
                previous_matches[
                    ((previous_matches["team"] == team) & (previous_matches["result"] == "D"))
                    | ((previous_matches["opponent"] == team) & (previous_matches["result"] == "D"))
                ]
            )
            
            losses = len(
                previous_matches[
                    ((previous_matches["team"] == team) & (previous_matches["result"] == "L"))
                    | ((previous_matches["opponent"] == team) & (previous_matches["result"] == "W"))
                ]
            )

            df.at[index, "head2head_win_percent"] = wins / matches_total
            df.at[index, "head2head_draw_percent"] = draws / matches_total
            df.at[index, "head2head_lost_percent"] = losses / matches_total

    return df

In [27]:
matches_groupby = calculate_previous_outcomes(matches_groupby)

In [28]:
matches_groupby

Unnamed: 0,team,opponent,venue,season,date,time,result,win_percent_before_team1,lose_percent_before_team1,draw_percent_before_team1,...,ELO_after_match_team2,round,gf,ga,attendance,referee,date_time,head2head_win_percent,head2head_draw_percent,head2head_lost_percent
16,Arsenal,Sunderland,Away,2000-2001,8/19/2000,00:00,L,0.000000,0.000000,0.00,...,1531.250000,Matchweek 1,0,1,47121.0,Steve Dunn,2000-08-19 00:00:00,0.000000,0.000000,0.000000
20,Arsenal,Liverpool,Home,2000-2001,8/21/2000,00:00,W,0.000000,1.000000,0.00,...,1500.424019,Matchweek 2,2,0,38014.0,Graham Poll,2000-08-21 00:00:00,0.000000,0.000000,0.000000
54,Arsenal,Charlton Athletic,Home,2000-2001,8/26/2000,00:00,W,0.500000,0.500000,0.00,...,1476.112761,Matchweek 3,5,3,38025.0,Stephen Lodge,2000-08-26 00:00:00,0.000000,0.000000,0.000000
70,Arsenal,Chelsea,Away,2000-2001,9/6/2000,00:00,D,0.666667,0.333333,0.00,...,1510.904900,Matchweek 4,2,2,34923.0,Mike Riley,2000-09-06 00:00:00,0.000000,0.000000,0.000000
78,Arsenal,Bradford City,Away,2000-2001,9/9/2000,00:00,D,0.500000,0.250000,0.25,...,1463.388055,Matchweek 5,1,1,17160.0,Alan Wiley,2000-09-09 00:00:00,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17172,Wolverhampton Wanderers,Bournemouth,Home,2022-2023,2/18/2023,15:00,L,0.600000,0.200000,0.20,...,1126.402235,Matchweek 24,0,1,31222.0,Michael Salisbury,2023-02-18 15:00:00,0.600000,0.400000,0.000000
17250,Wolverhampton Wanderers,Leeds United,Home,2022-2023,3/18/2023,15:00,L,0.200000,0.600000,0.20,...,1313.828284,Matchweek 28,2,4,31570.0,Michael Salisbury,2023-03-18 15:00:00,0.428571,0.142857,0.428571
17262,Wolverhampton Wanderers,Nottingham Forest,Away,2022-2023,4/1/2023,15:00,D,0.200000,0.600000,0.20,...,1218.965027,Matchweek 29,1,1,29368.0,Chris Kavanagh,2023-04-01 15:00:00,1.000000,0.000000,0.000000
17336,Wolverhampton Wanderers,Leicester City,Away,2022-2023,4/22/2023,15:00,L,0.400000,0.400000,0.20,...,1169.094557,Matchweek 32,1,2,32053.0,Andy Madley,2023-04-22 15:00:00,0.200000,0.400000,0.400000


In [29]:
matches_groupby["team"].unique()

array(['Arsenal', 'Aston Villa', 'Birmingham City', 'Blackburn Rovers',
       'Blackpool', 'Bolton Wanderers', 'Bournemouth', 'Brentford',
       'Brighton and Hove Albion', 'Burnley', 'Cardiff City',
       'Charlton Athletic', 'Chelsea', 'Coventry City', 'Crystal Palace',
       'Derby County', 'Everton', 'Fulham', 'Huddersfield Town',
       'Hull City', 'Ipswich Town', 'Leeds United', 'Leicester City',
       'Liverpool', 'Manchester City', 'Manchester United',
       'Middlesbrough', 'Newcastle United', 'Norwich City',
       'Nottingham Forest', 'Portsmouth', 'Queens Park Rangers',
       'Reading', 'Sheffield United', 'Southampton', 'Stoke City',
       'Sunderland', 'Swansea City', 'Tottenham Hotspur', 'Watford',
       'West Bromwich Albion', 'West Ham United', 'Wigan Athletic',
       'Wolverhampton Wanderers'], dtype=object)

In [30]:
# Hiện tại với file data của từng đội thì đội đó hoặc xuất hiện ở cột team hoặc xuất hiện ở cột component
# nên sẽ tiến hành hoán vị lại sao cho với file arsenal.csv thì cột team phải luôn là arsenal
def swap_columns(df, target_team):
    for idx, row in df.iterrows():
        if row["team"] != target_team:
            df.at[idx, "team"], df.at[idx, "opponent"] = df.at[idx, "opponent"], df.at[idx, "team"]
            
            if df.at[idx, "venue"] == "Home":
                df.at[idx, "venue"] = "Away"
            elif df.at[idx, "venue"] == "Away":
                df.at[idx, "venue"] = "Home"

            if df.at[idx, "result"] == "W":
                df.at[idx, "result"] = "L"
            elif df.at[idx, "result"] == "L":
                df.at[idx, "result"] = "W"

            # Đổi chỗ cột lịch sử đối đầu
            df.at[idx, "head2head_win_percent"], df.at[idx, "head2head_lost_percent"] = \
                df.at[idx, "head2head_lost_percent"], df.at[idx, "head2head_win_percent"]
            
            # Đổi chỗ các cột phong độ của 2 đội
            df.at[idx, "win_percent_before_team1"], df.at[idx, "win_percent_before_team2"] = \
                df.at[idx, "win_percent_before_team2"], df.at[idx, "win_percent_before_team1"]
            
            df.at[idx, "lose_percent_before_team1"], df.at[idx, "lose_percent_before_team2"] = \
                df.at[idx, "lose_percent_before_team2"], df.at[idx, "lose_percent_before_team1"]
            
            df.at[idx, "draw_percent_before_team1"], df.at[idx, "draw_percent_before_team2"] = \
                df.at[idx, "draw_percent_before_team2"], df.at[idx, "draw_percent_before_team1"]
            
            df.at[idx, "total_goals_before_team1"], df.at[idx, "total_goals_before_team2"] = \
                df.at[idx, "total_goals_before_team2"], df.at[idx, "total_goals_before_team1"]
            
            df.at[idx, "total_lost_goals_before_team1"], df.at[idx, "total_lost_goals_before_team2"] = \
                df.at[idx, "total_lost_goals_before_team2"], df.at[idx, "total_lost_goals_before_team1"]
            
            df.at[idx, "ELO_before_match_team1"], df.at[idx, "ELO_before_match_team2"] = \
                df.at[idx, "ELO_before_match_team2"], df.at[idx, "ELO_before_match_team1"]
            
            df.at[idx, "ELO_after_match_team1"], df.at[idx, "ELO_after_match_team2"] = \
                df.at[idx, "ELO_after_match_team2"], df.at[idx, "ELO_after_match_team1"]

In [31]:
target_teams = ["Arsenal", "Manchester City", "Manchester United"]

In [32]:
matches_groupby = matches_groupby.drop(
    columns=[
        "date",
        "time",
        "round",
        "attendance",
        "referee",
        "gf",
        "ga",
        "formation_team1",
        "formation_team2",
    ]
)

In [33]:
# Lưu từng đội vào file csv
for team in target_teams:
    team_df = matches_groupby[(matches_groupby["team"] == team) | (matches_groupby["opponent"] == team)]
    team_df = team_df.sort_values(by="date_time")
    lower_name = team.replace(" ", "_").lower()
    swap_columns(team_df, team)
    team_df.drop(columns=["team"]).to_csv(
        f"../../raw_data/clean_and_transformation/{lower_name}.csv", index=False
    )

III. Đội đối thủ có phải big 6 hay không?

In [34]:
big6_teams = ["Manchester United", "Manchester City", "Liverpool", "Chelsea", "Arsenal", "Tottenham Hotspur"]
big6_teams = [team.lower() for team in big6_teams]

def create_big6_team(file_name):
    team_df = pd.read_csv(file_name)
    team_df["is_opponent_big6"] = team_df["opponent"].apply(lambda x: 1 if x.lower() in big6_teams else 0)
    team_df.to_csv(file_name, index=False)

for team in target_teams:
    create_big6_team(
        f"../../raw_data/clean_and_transformation/{team.replace(' ', '_').lower()}.csv"
    )

### Kết luận. 
Như vậy đã bổ sung thêm các đặc trưng:
- Phong độ hiện tại (%W, %L, %D) của 2 đội trong 5 trận gần nhất của mùa giải
- Số bàn thắng, thua của 2 đội trong 5 trận gần nhất
- Lịch sử đối đầu (%W, %D, %L) của đội đang đi xây dựng model với đối thủ trong 10 trận gần nhất. Vẫn lưu lại cả %L vì lí do:
    + Nếu 2 đội chưa từng gặp nhau thì %W, %D, %L đều bằng 0
    + Nhưng nếu đội A gặp B và A luôn thắng thì %W = 1, %D = 0, %L = 0. Nếu không lưu %L thì lúc <br/>
    với tập dữ liệu của B đấu với A thì %W = 0, %D = 0 thì không biết được liệu rằng B luôn thua A hay là B chưa từng gặp A
- Bổ sung tham số ELO trước trận đấu của 2 đội
- Đội đối thủ có phải là big 6 hay không?