In [1]:
import pandas as pd
import matplotlib.pyplot as plt

pd.options.display.max_rows=100

# Content

This dataset includes 42,183 results of international football matches starting from the very first official match in 1972 up to 2019. The matches range from FIFA World Cup to FIFI Wild Cup to regular friendly matches. The matches are strictly men's full internationals and the data does not include Olympic Games or matches where at least one of the teams was the nation's B-team, U-23 or a league select team.

results.csv includes the following columns:

    date - date of the match
    home_team - the name of the home team
    away_team - the name of the away team
    home_score - full-time home team score including extra time, not including penalty-shootouts
    away_score - full-time away team score including extra time, not including penalty-shootouts
    tournament - the name of the tournament
    city - the name of the city/town/administrative unit where the match was played
    country - the name of the country where the match was played
    neutral - TRUE/FALSE column indicating whether the match was played at a neutral venue

Note on team and country names:
For home and away teams the current name of the team has been used. For example, when in 1882 a team who called themselves Ireland played against England, in this dataset, it is called Northern Ireland because the current team of Northern Ireland is the successor of the 1882 Ireland team. This is done so it is easier to track the history and statistics of teams.

For country names, the name of the country at the time of the match is used. So when Ghana played in Accra, Gold Coast in the 1950s, even though the names of the home team and the country don't match, it was a home match for Ghana. This is indicated by the neutral column, which says FALSE for those matches, meaning it was not at a neutral venue.

In [2]:
results = pd.read_csv("dataset/results.csv")

In [3]:
results["home_won"] = results.home_score > results.away_score
results["away_won"] = results.home_score < results.away_score
results["draw"] = results.home_score == results.away_score
results["at_home_team"] = results.home_team == results.country
results["at_away_team"] = results.away_team == results.country

results.sort_values("date", ascending=False)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_won,away_won,draw,at_home_team,at_away_team
42182,2021-06-08,Spain,Lithuania,4,0,Friendly,Leganés,Spain,False,True,False,False,True,False
42176,2021-06-08,Hungary,Republic of Ireland,0,0,Friendly,Budapest,Hungary,False,False,False,True,True,False
42171,2021-06-08,Benin,Zambia,2,2,Friendly,Cotonou,Benin,False,False,False,True,True,False
42173,2021-06-08,Czech Republic,Albania,3,1,Friendly,Prague,Czech Republic,False,True,False,False,True,False
42174,2021-06-08,France,Bulgaria,3,0,Friendly,Paris,France,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,True,False,False,True,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,False,False,True,True,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,True,False,False,True,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,True,False,False,True,False


In [4]:
""" let's drop results where the away team is at home """
display(results[results["at_away_team"]])

drop = results[results["at_away_team"]].index

results = results.drop(drop).reset_index(drop=True)

del results["at_away_team"]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_won,away_won,draw,at_home_team,at_away_team
789,1924-03-13,Catalonia,Spain,0,7,Friendly,Barcelona,Spain,False,False,True,False,False,True
1619,1933-10-04,Silesia,Poland,1,2,Friendly,Katowice,Poland,False,False,True,False,False,True
1651,1934-02-02,Catalonia,Spain,0,2,Friendly,Barcelona,Spain,False,False,True,False,False,True
2754,1947-10-19,Catalonia,Spain,3,1,Friendly,Barcelona,Spain,False,True,False,False,False,True
3485,1953-04-26,Silesia,Poland,2,3,Friendly,Chorzów,Poland,False,False,True,False,False,True
3524,1953-08-09,Catalonia,Spain,0,6,Friendly,Barcelona,Spain,False,False,True,False,False,True
3537,1953-09-13,Silesia,Poland,3,3,Friendly,Byton,Poland,False,False,False,True,False,True
28838,2006-11-19,Crimea,Northern Cyprus,0,5,ELF Cup,Lefkoşa,Northern Cyprus,True,False,True,False,False,True
28888,2006-12-09,Silesia,Poland,1,1,Friendly,Chorzów,Poland,False,False,False,True,False,True
39580,2018-01-21,Sudan,Morocco,0,0,African Nations Championship,Casablanca,Morocco,True,False,False,True,False,True


In [5]:
results

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_won,away_won,draw,at_home_team
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,False,False,True,True
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,True,False,False,True
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,True,False,False,True
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,False,False,True,True
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42162,2021-06-08,Morocco,Ghana,1,0,Friendly,Rabat,Morocco,False,True,False,False,True
42163,2021-06-08,Mozambique,Eswatini,1,1,Friendly,Maputo,Mozambique,False,False,False,True,True
42164,2021-06-08,Poland,Iceland,2,2,Friendly,Poznan,Poland,False,False,False,True,True
42165,2021-06-08,Senegal,Cape Verde,2,0,Friendly,Thiès,Senegal,False,True,False,False,True


In [6]:
results.groupby("home_team").agg({
    "date": "count",
    "home_score": "sum",
    "away_score": "sum",
    "home_won": "sum",
    "away_won": "sum",
    "draw": "sum",
}).sort_values("date", ascending=False).head(25)

Unnamed: 0_level_0,date,home_score,away_score,home_won,away_won,draw
home_team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brazil,573,1416,473,408,56,109
Argentina,550,1208,479,362,68,120
Mexico,516,1032,467,304,98,114
Germany,515,1269,534,321,84,110
England,507,1170,450,316,82,109
Sweden,494,1146,561,288,101,105
France,492,988,532,285,107,100
South Korea,469,937,345,278,79,112
Hungary,462,1077,557,260,99,103
Italy,451,913,352,283,49,119


In [7]:
results.to_pickle("dataset/results_cleaned.pickle")

In [8]:
pd.read_pickle("dataset/results_cleaned.pickle")

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_won,away_won,draw,at_home_team
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,False,False,True,True
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,True,False,False,True
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,True,False,False,True
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,False,False,True,True
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42162,2021-06-08,Morocco,Ghana,1,0,Friendly,Rabat,Morocco,False,True,False,False,True
42163,2021-06-08,Mozambique,Eswatini,1,1,Friendly,Maputo,Mozambique,False,False,False,True,True
42164,2021-06-08,Poland,Iceland,2,2,Friendly,Poznan,Poland,False,False,False,True,True
42165,2021-06-08,Senegal,Cape Verde,2,0,Friendly,Thiès,Senegal,False,True,False,False,True


# Small test of predivtiveness with basic model

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [10]:
from datetime import datetime, timedelta

X = results.copy()
y = X.home_score


features = ["home_team", "away_team", "tournament", "country", "neutral", "at_home_team"]
# too many cats for now
del X["city"]

for col in features:
    X[col] = X[col].astype("category").cat.codes

X["date"] = X["date"].apply(lambda x: (datetime.strptime(x, "%Y-%m-%d").date() - datetime.today().date()).days)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,country,neutral,home_won,away_won,draw,at_home_team
22651,-7769,229,253,1,0,55,198,0,True,False,False,1
24540,-7083,10,77,4,0,57,6,0,True,False,False,1
20082,-8966,274,282,0,1,55,236,0,False,True,False,1
5971,-20356,281,301,2,2,57,242,0,False,False,True,1
18010,-10111,26,98,2,1,57,19,0,True,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6265,-20012,114,45,2,2,57,97,0,False,False,True,1
11284,-14976,57,135,1,0,57,46,0,True,False,False,1
38158,-1813,87,73,1,0,28,157,1,True,False,False,0
860,-35250,83,27,4,0,57,69,0,True,False,False,1


In [11]:
model_tree = DecisionTreeRegressor(
    max_depth=5,
)

model_tree.fit(X=X_train[features], y=y_train)

DecisionTreeRegressor(max_depth=5)

In [12]:
X_test["predicted_home_score"] = model_tree.predict(X_test[features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["predicted_home_score"] = model_tree.predict(X_test[features])


In [13]:
(X_test["predicted_home_score"] - y_test).mean(), (X_test["predicted_home_score"] - y_test).std() 

(-0.020006783137526834, 1.7690746196106948)

In [14]:
X_test

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,country,neutral,home_won,away_won,draw,at_home_team,predicted_home_score
3290,-25409,122,94,8,0,57,103,0,True,False,False,1,1.969424
28562,-5397,237,91,6,0,99,204,0,True,False,False,1,1.730908
10474,-15748,35,206,3,0,57,29,0,True,False,False,1,1.813699
7,-52694,293,233,0,2,57,255,0,False,True,False,1,1.881429
41023,-647,264,116,0,1,55,228,0,False,True,False,1,1.881429
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41700,-245,155,70,2,0,100,132,0,True,False,False,1,0.768421
34322,-3223,123,91,2,0,57,104,0,True,False,False,1,1.969424
1598,-32120,148,255,1,1,57,124,0,False,False,True,1,1.366307
14174,-12827,252,251,4,0,57,177,1,True,False,False,0,1.523792
