#### MU tại các mùa 2000 -> 2010 là một đội bóng mạnh và liên tục vô địch, tuy nhiên từ mùa 2010 đến nay, MU đã không còn giữ được vị thế của mình. Do đó sẽ không đưa dữ liệu của MU từ mùa 2000 -> 2010 vào tập dữ liệu.

### Tạo ra các đặc trưng mới cho dữ liệu cho dữ liệu MU

In [1]:
import pandas as pd

### 1. Load dữ liệu

In [2]:
# Load dữ liệu thô đã thu thập cho từng đội bóng
matches_df = pd.read_csv("../../raw_data/clean_data_train.csv")

In [3]:
# Chỉ lấy các mùa giải từ 2010 -> 2020
matches_df["season_temp"] = matches_df["season"].apply(lambda x: x.split("-")[0])
matches_df["season_temp"] = matches_df["season_temp"].astype(int)

matches_df = matches_df[(matches_df["season_temp"] >= 2010) & (matches_df["season_temp"] <= 2020)]
matches_df = matches_df.drop(columns=["season_temp"])

In [4]:
# 8360 = 38 * 11 * 20
matches_df.shape

(8360, 13)

In [5]:
matches_df.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation,referee,season,team
1520,9/21/2020,20:15,Matchweek 2,Away,W,3,1,Wolverhampton Wanderers,39898.5625,4-2-3-1,Andre Marriner,2020-2021,Manchester City
1521,9/27/2020,16:30,Matchweek 3,Home,L,2,5,Leicester City,40648.75,4-2-3-1,Michael Oliver,2020-2021,Manchester City
1522,10/3/2020,17:30,Matchweek 4,Away,D,1,1,Leeds United,41201.3,4-3-3,Mike Dean,2020-2021,Manchester City
1523,10/17/2020,17:30,Matchweek 5,Home,W,1,0,Arsenal,51031.368421,3-1-4-2,Chris Kavanagh,2020-2021,Manchester City
1524,10/24/2020,12:30,Matchweek 6,Away,D,1,1,West Ham United,44813.757576,4-3-3,Anthony Taylor,2020-2021,Manchester City


In [6]:
# Tạo cột date_time để sắp xếp các trận tăng dần theo thời gian
matches_df["date_time"] = pd.to_datetime(matches_df["date"] + " " + matches_df["time"])
matches_df = matches_df.sort_values(by=["date_time", "referee"])

### I. Tính toán phong độ và số bàn thắng,thua của 2 đội trong 5 trận gần nhất

In [7]:
matches_df["average_score_before_match"] = 0.0
matches_df["win_percent_before"] = 0.0
matches_df["lose_percent_before"] = 0.0
matches_df["draw_percent_before"] = 0.0

matches_df["total_goals_before"] = 0
matches_df["total_lost_goals_before"] = 0

for index, row in matches_df.iterrows():
    team = row["team"]
    date_time = row["date_time"]
    season = row["season"]

    # Lấy các trận đấu trước đó của đội bóng
    previous_matches = matches_df[
        (matches_df["team"] == team)
        & (matches_df["date_time"] < date_time)
        & (matches_df["season"] == season)
    ]

    # Chỉ lấy 5 trận gần nhất
    previous_matches = previous_matches[-5:]

    # Đếm số trận thắng, hoà trước đó
    win_before = previous_matches[previous_matches["result"] == "W"].shape[0]
    lose_before = previous_matches[previous_matches["result"] == "L"].shape[0]
    draw_before = previous_matches[previous_matches["result"] == "D"].shape[0]

    # Tính tổng số bàn thắng, bàn thua trước đó
    total_goals_before = previous_matches["gf"].sum()
    total_lost_goals_before = previous_matches["ga"].sum()

    # Tính tỉ lệ số trận thắng, hoà trước đó
    match_total = 1 if previous_matches.shape[0] == 0 else previous_matches.shape[0]
    matches_df.at[index, "win_percent_before"] = win_before / match_total
    matches_df.at[index, "lose_percent_before"] = lose_before / match_total
    matches_df.at[index, "draw_percent_before"] = draw_before / match_total

    # Tổng số bàn thắng, bàn thua trước đó
    matches_df.at[index, "total_goals_before"] = total_goals_before
    matches_df.at[index, "total_lost_goals_before"] = total_lost_goals_before

    # Tính điểm số trung bình từ 5 trận gần nhất
    matches_df.at[index, "average_score_before_match"] = (
        previous_matches["result"].map({"W": 3.0, "D": 1.0, "L": 0.0}).sum()
        / match_total
    )

In [8]:
matches_df[
    [
        "date_time",
        "team",
        "opponent",
        "result",
        "win_percent_before",
        "lose_percent_before",
        "draw_percent_before",
        "total_goals_before",
        "total_lost_goals_before",
        "average_score_before_match",
    ]
].tail(10)

Unnamed: 0,date_time,team,opponent,result,win_percent_before,lose_percent_before,draw_percent_before,total_goals_before,total_lost_goals_before,average_score_before_match
2165,2021-05-23 16:00:00,Burnley,Sheffield United,L,0.4,0.6,0.0,7,9,1.2
2279,2021-05-23 16:00:00,Sheffield United,Burnley,W,0.4,0.6,0.0,2,7,1.2
1747,2021-05-23 16:00:00,West Ham United,Southampton,W,0.4,0.4,0.2,6,5,1.4
2089,2021-05-23 16:00:00,Southampton,West Ham United,L,0.4,0.4,0.2,7,7,1.4
1557,2021-05-23 16:00:00,Manchester City,Everton,W,0.6,0.4,0.0,11,9,1.8
1899,2021-05-23 16:00:00,Everton,Manchester City,L,0.4,0.4,0.2,3,3,1.4
1595,2021-05-23 16:00:00,Manchester United,Wolverhampton Wanderers,W,0.2,0.4,0.4,7,8,1.0
2013,2021-05-23 16:00:00,Wolverhampton Wanderers,Manchester United,L,0.2,0.6,0.2,3,9,0.8
1671,2021-05-23 16:00:00,Chelsea,Aston Villa,L,0.8,0.2,0.0,7,3,2.4
1937,2021-05-23 16:00:00,Aston Villa,Chelsea,W,0.4,0.4,0.2,7,8,1.4


#### II. Tính toán chỉ số ELO của 2 đội

In [9]:
def get_game_different(score_different):
    if score_different == 0:
        return 0
    if score_different == 1:
        return 0.25
    elif score_different == 2:
        return 0.5
    elif score_different == 3:
        return 0.75
    else:
        return 0.75 + (score_different - 3) / 8


def get_result(result):
    if result == "W":
        return 1
    elif result == "D":
        return 0.5
    else:
        return 0


def get_expected_result(elo_diff):
    return 1 / (1 + 10 ** (elo_diff / 400))

In [10]:
k = 50
for index, row in matches_df.iterrows():
    previous_matches_of_team = matches_df[
        (matches_df["team"] == row["team"]) & (matches_df["date_time"] < row["date_time"]) & (matches_df["season"] == row["season"])
    ][-1:]

    previous_matches_of_opponent = matches_df[
        (matches_df["team"] == row["opponent"])
        & (matches_df["date_time"] < row["date_time"]) & (matches_df["season"] == row["season"])
    ][-1:]

    score_different = abs(row["gf"] - row["ga"])

    if previous_matches_of_team.shape[0] == 0 or previous_matches_of_opponent.shape[0] == 0:
        elo_different = 0
        matches_df.at[index, "ELO_before_match"] = 1500
        matches_df.at[index, "ELO_after_match"] = matches_df.at[index, "ELO_before_match"] + (
            k + k * get_game_different(score_different)
        ) * (get_result(row["result"]) - get_expected_result(elo_different))
    else:
        elo_different = (
            previous_matches_of_opponent["ELO_after_match"].values[0]
            - previous_matches_of_team["ELO_after_match"].values[0]
        )

        if row["venue"] == "Away":
            elo_different *= -1

        matches_df.at[index, "ELO_before_match"] = previous_matches_of_team["ELO_after_match"].values[0]
        matches_df.at[index, "ELO_after_match"] = matches_df.at[index, "ELO_before_match"] + (
            k + k * get_game_different(score_different)
        ) * (get_result(row["result"]) - get_expected_result(elo_different))


Gộp các trận đấu đối xứng

In [11]:
# Đảm bảo dữ liệu đã được sắp xếp theo thời gian, nếu 2 thời gian giống nhau thì sắp xếp theo referee
matches_df = matches_df.sort_values(by=["date_time", "referee"])

In [12]:
matches_df.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation,...,team,date_time,average_score_before_match,win_percent_before,lose_percent_before,draw_percent_before,total_goals_before,total_lost_goals_before,ELO_before_match,ELO_after_match
9196,8/14/2010,00:00,Matchweek 1,Away,D,0,0,Tottenham Hotspur,35928.0,4-3-3,...,Manchester City,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1500.0
9272,8/14/2010,00:00,Matchweek 1,Home,D,0,0,Manchester City,35928.0,4-4-2,...,Tottenham Hotspur,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1500.0
9462,8/14/2010,00:00,Matchweek 1,Home,D,2,2,Birmingham City,38390.0,4-4-2,...,Sunderland,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1500.0
9766,8/14/2010,00:00,Matchweek 1,Away,D,2,2,Sunderland,38390.0,4-4-2,...,Birmingham City,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1500.0
9576,8/14/2010,00:00,Matchweek 1,Away,L,1,2,Wolverhampton Wanderers,27850.0,4-4-2,...,Stoke City,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1468.75


Chia tập data thành 2 loại: hàng lẻ và hàng chẵn, sau đó gộp các cột hàng chẵn vào hàng lẻ

In [13]:
matches_df_sorted_odd = matches_df.iloc[::2]
matches_df_sorted_even = matches_df.iloc[1::2]

In [14]:
matches_df_sorted_odd = matches_df_sorted_odd.rename(
    columns={
        "formation": "formation_team1",
        "win_percent_before": "win_percent_before_team1",
        "lose_percent_before": "lose_percent_before_team1",
        "draw_percent_before": "draw_percent_before_team1",
        "total_goals_before": "total_goals_before_team1",
        "total_lost_goals_before": "total_lost_goals_before_team1",
        "ELO_before_match": "ELO_before_match_team1",
        "ELO_after_match": "ELO_after_match_team1",
        "average_score_before_match": "average_score_before_match_team1",
    }
)
matches_df_sorted_odd.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation_team1,...,team,date_time,average_score_before_match_team1,win_percent_before_team1,lose_percent_before_team1,draw_percent_before_team1,total_goals_before_team1,total_lost_goals_before_team1,ELO_before_match_team1,ELO_after_match_team1
9196,8/14/2010,00:00,Matchweek 1,Away,D,0,0,Tottenham Hotspur,35928.0,4-3-3,...,Manchester City,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1500.0
9462,8/14/2010,00:00,Matchweek 1,Home,D,2,2,Birmingham City,38390.0,4-4-2,...,Sunderland,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1500.0
9576,8/14/2010,00:00,Matchweek 1,Away,L,1,2,Wolverhampton Wanderers,27850.0,4-4-2,...,Stoke City,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1468.75
9158,8/14/2010,00:00,Matchweek 1,Home,W,6,0,West Bromwich Albion,41589.0,4-3-3,...,Chelsea,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1553.125
9690,8/14/2010,00:00,Matchweek 1,Home,L,0,4,Blackpool,16152.0,4-5-1,...,Wigan Athletic,2010-08-14,0.0,0.0,0.0,0.0,0,0,1500.0,1453.125


Lấy các cột cần thiết đối với hàng chẵn, đồng thời đổi tên chúng

In [15]:
matches_df_sorted_even = matches_df_sorted_even[
    [
        "formation",
        "win_percent_before",
        "lose_percent_before",
        "draw_percent_before",
        "total_goals_before",
        "total_lost_goals_before",
        "ELO_before_match",
        "ELO_after_match",
        "average_score_before_match",
    ]
].rename(
    columns={
        "formation": "formation_team2",
        "win_percent_before": "win_percent_before_team2",
        "lose_percent_before": "lose_percent_before_team2",
        "draw_percent_before": "draw_percent_before_team2",
        "total_goals_before": "total_goals_before_team2",
        "total_lost_goals_before": "total_lost_goals_before_team2",
        "ELO_before_match": "ELO_before_match_team2",
        "ELO_after_match": "ELO_after_match_team2",
        "average_score_before_match": "average_score_before_match_team2",
    }
)

matches_df_sorted_even.head()

Unnamed: 0,formation_team2,win_percent_before_team2,lose_percent_before_team2,draw_percent_before_team2,total_goals_before_team2,total_lost_goals_before_team2,ELO_before_match_team2,ELO_after_match_team2,average_score_before_match_team2
9272,4-4-2,0.0,0.0,0.0,0,0,1500.0,1500.0,0.0
9766,4-4-2,0.0,0.0,0.0,0,0,1500.0,1500.0,0.0
9728,4-4-2,0.0,0.0,0.0,0,0,1500.0,1531.25,0.0
9500,4-2-3-1,0.0,0.0,0.0,0,0,1500.0,1446.875,0.0
9804,4-3-3,0.0,0.0,0.0,0,0,1500.0,1546.875,0.0


Thay đổi lại giá trị index của các hàng chẵn để có thể nối thêm vào các cột của hàng lẻ

In [16]:
matches_df_sorted_even.index = matches_df_sorted_odd.index
matches_df_sorted_even.head()

Unnamed: 0,formation_team2,win_percent_before_team2,lose_percent_before_team2,draw_percent_before_team2,total_goals_before_team2,total_lost_goals_before_team2,ELO_before_match_team2,ELO_after_match_team2,average_score_before_match_team2
9196,4-4-2,0.0,0.0,0.0,0,0,1500.0,1500.0,0.0
9462,4-4-2,0.0,0.0,0.0,0,0,1500.0,1500.0,0.0
9576,4-4-2,0.0,0.0,0.0,0,0,1500.0,1531.25,0.0
9158,4-2-3-1,0.0,0.0,0.0,0,0,1500.0,1446.875,0.0
9690,4-3-3,0.0,0.0,0.0,0,0,1500.0,1546.875,0.0


In [17]:
matches_groupby = pd.concat([matches_df_sorted_odd, matches_df_sorted_even], axis=1)
matches_groupby.head()

Unnamed: 0,date,time,round,venue,result,gf,ga,opponent,attendance,formation_team1,...,ELO_after_match_team1,formation_team2,win_percent_before_team2,lose_percent_before_team2,draw_percent_before_team2,total_goals_before_team2,total_lost_goals_before_team2,ELO_before_match_team2,ELO_after_match_team2,average_score_before_match_team2
9196,8/14/2010,00:00,Matchweek 1,Away,D,0,0,Tottenham Hotspur,35928.0,4-3-3,...,1500.0,4-4-2,0.0,0.0,0.0,0,0,1500.0,1500.0,0.0
9462,8/14/2010,00:00,Matchweek 1,Home,D,2,2,Birmingham City,38390.0,4-4-2,...,1500.0,4-4-2,0.0,0.0,0.0,0,0,1500.0,1500.0,0.0
9576,8/14/2010,00:00,Matchweek 1,Away,L,1,2,Wolverhampton Wanderers,27850.0,4-4-2,...,1468.75,4-4-2,0.0,0.0,0.0,0,0,1500.0,1531.25,0.0
9158,8/14/2010,00:00,Matchweek 1,Home,W,6,0,West Bromwich Albion,41589.0,4-3-3,...,1553.125,4-2-3-1,0.0,0.0,0.0,0,0,1500.0,1446.875,0.0
9690,8/14/2010,00:00,Matchweek 1,Home,L,0,4,Blackpool,16152.0,4-5-1,...,1453.125,4-3-3,0.0,0.0,0.0,0,0,1500.0,1546.875,0.0


Sắp xếp lại các cột cho hợp lý

In [18]:
removed_cols = [
    "team",
    "opponent",
    "venue",
    "season",
    "date",
    "time",
    "result",
    "win_percent_before_team1",
    "lose_percent_before_team1",
    "draw_percent_before_team1",
    "win_percent_before_team2",
    "lose_percent_before_team2",
    "draw_percent_before_team2",
    "total_goals_before_team1",
    "total_lost_goals_before_team1",
    "total_goals_before_team2",
    "total_lost_goals_before_team2",
    "formation_team1",
    "formation_team2",
    "ELO_before_match_team1",
    "ELO_before_match_team2",
    "ELO_after_match_team1",
    "ELO_after_match_team2",
    "average_score_before_match_team1",
    "average_score_before_match_team2",
]

cols = [col for col in matches_groupby.columns if col not in removed_cols]
matches_groupby = matches_groupby[removed_cols + cols]

matches_groupby.sort_values(by=['team', 'date_time'] , inplace=True)
matches_groupby.head()

Unnamed: 0,team,opponent,venue,season,date,time,result,win_percent_before_team1,lose_percent_before_team1,draw_percent_before_team1,...,ELO_after_match_team1,ELO_after_match_team2,average_score_before_match_team1,average_score_before_match_team2,round,gf,ga,attendance,referee,date_time
9234,Arsenal,Liverpool,Away,2010-2011,8/15/2010,00:00,D,0.0,0.0,0.0,...,1500.0,1500.0,0.0,0.0,Matchweek 1,1,1,44722.0,Martin Atkinson,2010-08-15
9235,Arsenal,Blackpool,Home,2010-2011,8/21/2010,00:00,W,0.0,0.0,1.0,...,1560.249296,1500.874296,1.0,3.0,Matchweek 2,6,0,60032.0,Mike Jones,2010-08-21
9236,Arsenal,Blackburn Rovers,Away,2010-2011,8/28/2010,00:00,W,0.5,0.0,0.5,...,1596.619692,1476.673613,2.0,1.5,Matchweek 3,2,1,25059.0,Chris Foy,2010-08-28
9237,Arsenal,Bolton Wanderers,Home,2010-2011,9/11/2010,00:00,W,0.666667,0.0,0.333333,...,1633.500207,1490.991905,2.333333,1.666667,Matchweek 4,4,1,59876.0,Stuart Attwell,2010-09-11
9238,Arsenal,Sunderland,Away,2010-2011,9/18/2010,00:00,D,0.75,0.0,0.25,...,1641.627383,1524.428717,2.5,1.25,Matchweek 5,1,1,38950.0,Phil Dowd,2010-09-18


In [19]:
matches_groupby.columns

Index(['team', 'opponent', 'venue', 'season', 'date', 'time', 'result',
       'win_percent_before_team1', 'lose_percent_before_team1',
       'draw_percent_before_team1', 'win_percent_before_team2',
       'lose_percent_before_team2', 'draw_percent_before_team2',
       'total_goals_before_team1', 'total_lost_goals_before_team1',
       'total_goals_before_team2', 'total_lost_goals_before_team2',
       'formation_team1', 'formation_team2', 'ELO_before_match_team1',
       'ELO_before_match_team2', 'ELO_after_match_team1',
       'ELO_after_match_team2', 'average_score_before_match_team1',
       'average_score_before_match_team2', 'round', 'gf', 'ga', 'attendance',
       'referee', 'date_time'],
      dtype='object')

### II. Tính toán lịch sử sử đối đầu giữa 2 đội trong 10 trận gần nhất

In [20]:
# Định nghĩa hàm để tính số lần thắng, thua và hòa của mỗi đội so với đối thủ
def calculate_previous_outcomes(df):
    # Tạo các cột mới
    df["head2head_win_percent"] = 0.0
    df["head2head_draw_percent"] = 0.0
    df["head2head_lost_percent"] = 0.0

    # Duyệt qua từng hàng trong DataFrame
    for index, row in df.iterrows():
        team = row["team"]
        opponent = row["opponent"]

        # Tìm các trận đấu trước đó của đội
        previous_matches = df[
            (df["date_time"] < row["date_time"])
            & (
                ((df["team"] == team) & (df["opponent"] == opponent))
                | ((df["opponent"] == team) & (df["team"] == opponent))
            )
        ]

        # Lấy 10 trận gần nhất
        previous_matches = previous_matches[-10:]
        matches_total = previous_matches.shape[0]
        if matches_total > 0:
            # Đếm số lần thắng, thua và hòa của đội
            wins = len(
                previous_matches[
                    ((previous_matches["team"] == team) & (previous_matches["result"] == "W"))
                    | ((previous_matches["opponent"] == team) & (previous_matches["result"] == "L"))
                ]
            )

            draws = len(
                previous_matches[
                    ((previous_matches["team"] == team) & (previous_matches["result"] == "D"))
                    | ((previous_matches["opponent"] == team) & (previous_matches["result"] == "D"))
                ]
            )
            
            losses = len(
                previous_matches[
                    ((previous_matches["team"] == team) & (previous_matches["result"] == "L"))
                    | ((previous_matches["opponent"] == team) & (previous_matches["result"] == "W"))
                ]
            )

            df.at[index, "head2head_win_percent"] = wins / matches_total
            df.at[index, "head2head_draw_percent"] = draws / matches_total
            df.at[index, "head2head_lost_percent"] = losses / matches_total

    return df

In [21]:
matches_groupby = calculate_previous_outcomes(matches_groupby)

In [22]:
matches_groupby

Unnamed: 0,team,opponent,venue,season,date,time,result,win_percent_before_team1,lose_percent_before_team1,draw_percent_before_team1,...,average_score_before_match_team2,round,gf,ga,attendance,referee,date_time,head2head_win_percent,head2head_draw_percent,head2head_lost_percent
9234,Arsenal,Liverpool,Away,2010-2011,8/15/2010,00:00,D,0.000000,0.0,0.000000,...,0.000000,Matchweek 1,1,1,44722.000000,Martin Atkinson,2010-08-15 00:00:00,0.000000,0.000000,0.000000
9235,Arsenal,Blackpool,Home,2010-2011,8/21/2010,00:00,W,0.000000,0.0,1.000000,...,3.000000,Matchweek 2,6,0,60032.000000,Mike Jones,2010-08-21 00:00:00,0.000000,0.000000,0.000000
9236,Arsenal,Blackburn Rovers,Away,2010-2011,8/28/2010,00:00,W,0.500000,0.0,0.500000,...,1.500000,Matchweek 3,2,1,25059.000000,Chris Foy,2010-08-28 00:00:00,0.000000,0.000000,0.000000
9237,Arsenal,Bolton Wanderers,Home,2010-2011,9/11/2010,00:00,W,0.666667,0.0,0.333333,...,1.666667,Matchweek 4,4,1,59876.000000,Stuart Attwell,2010-09-11 00:00:00,0.000000,0.000000,0.000000
9238,Arsenal,Sunderland,Away,2010-2011,9/18/2010,00:00,D,0.750000,0.0,0.250000,...,1.250000,Matchweek 5,1,1,38950.000000,Phil Dowd,2010-09-18 00:00:00,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006,Wolverhampton Wanderers,Fulham,Away,2020-2021,4/9/2021,20:00,W,0.000000,0.6,0.400000,...,0.600000,Matchweek 31,1,0,25827.750000,Jonathan Moss,2021-04-09 20:00:00,0.428571,0.285714,0.285714
2007,Wolverhampton Wanderers,Sheffield United,Home,2020-2021,4/17/2021,20:15,W,0.200000,0.6,0.200000,...,0.600000,Matchweek 32,1,0,31642.000000,Robert Jones,2021-04-17 20:15:00,0.333333,0.333333,0.333333
2008,Wolverhampton Wanderers,Burnley,Home,2020-2021,4/25/2021,12:00,L,0.400000,0.4,0.200000,...,0.800000,Matchweek 33,0,4,25731.285714,Darren England,2021-04-25 12:00:00,0.200000,0.400000,0.400000
2009,Wolverhampton Wanderers,West Bromwich Albion,Away,2020-2021,5/3/2021,18:00,D,0.400000,0.6,0.000000,...,1.400000,Matchweek 34,1,1,26670.750000,Paul Tierney,2021-05-03 18:00:00,0.200000,0.200000,0.600000


In [23]:
matches_groupby.to_csv("../../feature_engineering_data/train/all_clubs_for_mu.csv", index=False)

In [24]:
matches_groupby["team"].unique()

array(['Arsenal', 'Aston Villa', 'Birmingham City', 'Blackburn Rovers',
       'Blackpool', 'Bolton Wanderers', 'Bournemouth',
       'Brighton and Hove Albion', 'Burnley', 'Cardiff City', 'Chelsea',
       'Crystal Palace', 'Everton', 'Fulham', 'Huddersfield Town',
       'Hull City', 'Leeds United', 'Leicester City', 'Liverpool',
       'Manchester City', 'Manchester United', 'Middlesbrough',
       'Newcastle United', 'Norwich City', 'Queens Park Rangers',
       'Reading', 'Sheffield United', 'Southampton', 'Stoke City',
       'Sunderland', 'Swansea City', 'Tottenham Hotspur', 'Watford',
       'West Bromwich Albion', 'West Ham United', 'Wigan Athletic',
       'Wolverhampton Wanderers'], dtype=object)

In [25]:
# Hiện tại với file data của từng đội thì đội đó hoặc xuất hiện ở cột team hoặc xuất hiện ở cột component
# nên sẽ tiến hành hoán vị lại sao cho với file arsenal.csv thì cột team phải luôn là arsenal
def swap_columns(df, target_team):
    for idx, row in df.iterrows():
        if row["team"] != target_team:
            df.at[idx, "team"], df.at[idx, "opponent"] = df.at[idx, "opponent"], df.at[idx, "team"]
            
            if df.at[idx, "venue"] == "Home":
                df.at[idx, "venue"] = "Away"
            elif df.at[idx, "venue"] == "Away":
                df.at[idx, "venue"] = "Home"

            if df.at[idx, "result"] == "W":
                df.at[idx, "result"] = "L"
            elif df.at[idx, "result"] == "L":
                df.at[idx, "result"] = "W"

            # Đổi chỗ cột lịch sử đối đầu
            df.at[idx, "head2head_win_percent"], df.at[idx, "head2head_lost_percent"] = \
                df.at[idx, "head2head_lost_percent"], df.at[idx, "head2head_win_percent"]
            
            # Đổi chỗ các cột phong độ của 2 đội
            df.at[idx, "win_percent_before_team1"], df.at[idx, "win_percent_before_team2"] = \
                df.at[idx, "win_percent_before_team2"], df.at[idx, "win_percent_before_team1"]
            
            df.at[idx, "lose_percent_before_team1"], df.at[idx, "lose_percent_before_team2"] = \
                df.at[idx, "lose_percent_before_team2"], df.at[idx, "lose_percent_before_team1"]
            
            df.at[idx, "draw_percent_before_team1"], df.at[idx, "draw_percent_before_team2"] = \
                df.at[idx, "draw_percent_before_team2"], df.at[idx, "draw_percent_before_team1"]
            
            df.at[idx, "total_goals_before_team1"], df.at[idx, "total_goals_before_team2"] = \
                df.at[idx, "total_goals_before_team2"], df.at[idx, "total_goals_before_team1"]
            
            df.at[idx, "total_lost_goals_before_team1"], df.at[idx, "total_lost_goals_before_team2"] = \
                df.at[idx, "total_lost_goals_before_team2"], df.at[idx, "total_lost_goals_before_team1"]
            
            df.at[idx, "ELO_before_match_team1"], df.at[idx, "ELO_before_match_team2"] = \
                df.at[idx, "ELO_before_match_team2"], df.at[idx, "ELO_before_match_team1"]
            
            df.at[idx, "ELO_after_match_team1"], df.at[idx, "ELO_after_match_team2"] = \
                df.at[idx, "ELO_after_match_team2"], df.at[idx, "ELO_after_match_team1"]
            
            df.at[idx, "average_score_before_match_team1"], df.at[idx, "average_score_before_match_team2"] = \
                df.at[idx, "average_score_before_match_team2"], df.at[idx, "average_score_before_match_team1"]

In [26]:
target_teams = ["Manchester United"]

In [27]:
matches_groupby = matches_groupby.drop(
    columns=[
        "date",
        "time",
        "round",
        "attendance",
        "referee",
        "gf",
        "ga",
        "formation_team1",
        "formation_team2",
    ]
)

In [28]:
# Lưu từng đội vào file csv
for team in target_teams:
    team_df = matches_groupby[(matches_groupby["team"] == team) | (matches_groupby["opponent"] == team)]
    team_df = team_df.sort_values(by="date_time")
    lower_name = team.replace(" ", "_").lower()
    swap_columns(team_df, team)
    team_df.drop(columns=["team"]).to_csv(
        f"../../feature_engineering_data/train/{lower_name}.csv", index=False
    )

III. Đội đối thủ có phải big 6 hay không?

In [29]:
big6_teams = ["Manchester United", "Manchester City", "Liverpool", "Chelsea", "Arsenal", "Tottenham Hotspur"]
big6_teams = [team.lower() for team in big6_teams]

def create_big6_team(file_name):
    team_df = pd.read_csv(file_name)
    team_df["is_opponent_big6"] = team_df["opponent"].apply(lambda x: 1 if x.lower() in big6_teams else 0)
    team_df.to_csv(file_name, index=False)

for team in target_teams:
    create_big6_team(
        f"../../feature_engineering_data/train/{team.replace(' ', '_').lower()}.csv"
    )