In [1]:
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Load player history (training data)
# df_train = pd.read_csv('data/during-season/player_history.csv', parse_dates=['kickoff_time'])
df_train = pd.read_csv('data/during-season/player_history.csv')

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1734 entries, 0 to 1733
Data columns (total 37 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   element                          1734 non-null   int64 
 1   fixture                          1734 non-null   int64 
 2   opponent_team                    1734 non-null   object
 3   total_points                     1734 non-null   int64 
 4   was_home                         1734 non-null   bool  
 5   kickoff_time                     1734 non-null   object
 6   team_h_score                     1734 non-null   int64 
 7   team_a_score                     1734 non-null   int64 
 8   round                            1734 non-null   int64 
 9   minutes                          1734 non-null   int64 
 10  goals_scored                     1734 non-null   int64 
 11  assists                          1734 non-null   int64 
 12  clean_sheets                     1

In [4]:
df_train.head(5)

Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,minutes,...,recoveries,value,transfers_balance,selected,transfers_in,transfers_out,name,team_id,team,opponent_team_id
0,1,6,Kalmar FF,6,False,2022-04-03T15:30:00Z,0,1,1,90,...,5,60,0,4920,0,0,Johan Dahlin,1,Malmö FF,10
1,1,16,IF Elfsborg,3,True,2022-04-11T17:10:00Z,1,1,2,90,...,6,60,201,5787,284,83,Johan Dahlin,1,Malmö FF,5
2,1,24,AIK,7,True,2022-04-17T13:00:00Z,3,0,3,90,...,11,60,39,5997,127,88,Johan Dahlin,1,Malmö FF,12
3,1,32,IFK Värnamo,7,False,2022-04-21T17:00:00Z,0,0,4,90,...,5,60,241,6355,319,78,Johan Dahlin,1,Malmö FF,18
4,2,6,Kalmar FF,0,False,2022-04-03T15:30:00Z,0,1,1,0,...,0,45,0,1073,0,0,Ismael Diawara,1,Malmö FF,10


In [5]:
df_tmp_X = df_train[['round', 'name', 'team', 'opponent_team', 'was_home']]
train_y = df_train[['total_points']]

In [6]:
df_tmp_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1734 entries, 0 to 1733
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   round          1734 non-null   int64 
 1   name           1734 non-null   object
 2   team           1734 non-null   object
 3   opponent_team  1734 non-null   object
 4   was_home       1734 non-null   bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 56.0+ KB


In [7]:
df_tmp_X.head()

Unnamed: 0,round,name,team,opponent_team,was_home
0,1,Johan Dahlin,Malmö FF,Kalmar FF,False
1,2,Johan Dahlin,Malmö FF,IF Elfsborg,True
2,3,Johan Dahlin,Malmö FF,AIK,True
3,4,Johan Dahlin,Malmö FF,IFK Värnamo,False
4,1,Ismael Diawara,Malmö FF,Kalmar FF,False


## Preprocessing Data

In [8]:
df_tmp_X

Unnamed: 0,round,name,team,opponent_team,was_home
0,1,Johan Dahlin,Malmö FF,Kalmar FF,False
1,2,Johan Dahlin,Malmö FF,IF Elfsborg,True
2,3,Johan Dahlin,Malmö FF,AIK,True
3,4,Johan Dahlin,Malmö FF,IFK Värnamo,False
4,1,Ismael Diawara,Malmö FF,Kalmar FF,False
...,...,...,...,...,...
1729,2,Filip Trpchevski,BK Häcken,Degerfors IF,False
1730,3,Filip Trpchevski,BK Häcken,IFK Göteborg,True
1731,4,Filip Trpchevski,BK Häcken,IFK Norrköping,False
1732,3,Darrell Tibell,IFK Norrköping,Djurgården,False


In [9]:
# We need to encode the strings for the scikit model to be able to handle them
# Label encoding for ordered data eg. Low, Medium, High
# OneHot encoding for unordered data
# https://towardsdatascience.com/encoding-categorical-features-21a2651a065c

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

def preprocess(df):
    categorical_features = ["name", "team", "opponent_team"]
    one_hot = OneHotEncoder()
    transformer = ColumnTransformer([("one_hot",
                                      one_hot,
                                     categorical_features)],
                                     remainder="passthrough")

    transformed = transformer.fit_transform(df).toarray()
    transformed = pd.DataFrame(transformed)
    return transformed

transformed_X = preprocess(df_tmp_X)
transformed_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,458,459,460,461,462,463,464,465,466,467
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
1730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0
1731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
1732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [10]:
# Check number of unique values in columns, just to compare against transformed df
print(df_tmp_X.apply(lambda col: col.nunique()))

round              4
name             434
team              16
opponent_team     16
was_home           2
dtype: int64


In [11]:
# Ok, appearantly, there is an easier way to do that.. but we will skip this for now
dummies = pd.get_dummies(df_tmp_X[["name", "team", "opponent_team"]])
dummies

Unnamed: 0,name_Abbe Khalili,name_Abdelrahman Saidi,name_Abdul Malik Abubakari,name_Abdussalam Magashy,name_Adam Bergmark Wiberg,name_Adam Carlén,name_Adam Hellborg,name_Adam Ingi Benediktsson,name_Adam Kaied,name_Adam Petersson,...,opponent_team_Helsingborgs IF,opponent_team_IF Elfsborg,opponent_team_IFK Göteborg,opponent_team_IFK Norrköping,opponent_team_IFK Värnamo,opponent_team_IK Sirius,opponent_team_Kalmar FF,opponent_team_Malmö FF,opponent_team_Mjällby AIF,opponent_team_Varbergs BoIS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1730,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1731,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1732,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Convert strings into pandas category codes since the regressor model cannot handle strings
# for label, content in df_tmp.items():
    # if pd.api.types.is_string_dtype(content):
        # df_tmp[label] = content.astype("category")
#         df_tmp[label] = pd.Categorical(content).codes

In [13]:
# transformed.to_csv('data/during-season/df_transformed.csv', index=False)

In [14]:
# df_tmp = pd.read_csv('data/during-season/df_tmp.csv')

## Build model

In [15]:
train_X = transformed_X

In [16]:
%%time
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_jobs=-1, random_state=1234)
model.fit(train_X, train_y.values.ravel())

CPU times: user 31.3 s, sys: 148 ms, total: 31.4 s
Wall time: 4.88 s


RandomForestRegressor(n_jobs=-1, random_state=1234)

In [17]:
def plot_feature_importance(columns, importances):
    df = (pd.DataFrame({'features': columns, 'feature_importances': importances})
          .sort_values('feature_importances', ascending=False))
    
    fig, ax = plt.subplots()
    ax.barh(df['features'], df['feature_importances'])
    ax.set_ylabel("Features")
    ax.set_xlabel("Feature importance")
    ax.invert_yaxis()

In [18]:
# plot_feature_importance(df_tmp_X.columns, model.feature_importances_)

In [19]:
df_tmp_X.shape

(1734, 5)

In [20]:
df_tmp_X.sample(5)

Unnamed: 0,round,name,team,opponent_team,was_home
585,2,Andreas Murbeck,IK Sirius,IFK Värnamo,False
1197,2,Nahom Girmai Netabay,Kalmar FF,Varbergs BoIS,False
1282,3,Zak Elbouzedi,AIK,Malmö FF,False
704,1,Gustav Henriksson,IF Elfsborg,Mjällby AIF,True
701,2,Tim Rönning,IF Elfsborg,Malmö FF,False


In [21]:
df_tmp_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1734 entries, 0 to 1733
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   round          1734 non-null   int64 
 1   name           1734 non-null   object
 2   team           1734 non-null   object
 3   opponent_team  1734 non-null   object
 4   was_home       1734 non-null   bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 56.0+ KB


In [22]:
import numpy as np
import scipy as sp
from scipy.cluster import hierarchy as hc
import seaborn as sns

def cluster_columns(df, figsize=(10,6), font_size=12):
    corr = sp.stats.spearmanr(df).correlation
    
    fix, ax = plt.subplots(figsize=(10, 15))
    ax = sns.heatmap(corr, annot=True, linewidths=0.5, fmt='.3f', cmap='YlGnBu')
    plt.show()
    
# cluster_columns(df_tmp_X)

In [23]:
from scipy.cluster.hierarchy import dendrogram, linkage

def cluster_columns(df):
    print(df.shape)
    linked = linkage(df)
    print(linked.shape)
    
    
# cluster_columns(df_tmp_X)

In [24]:
import numpy as np
from scipy.cluster import hierarchy as hc
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram


def cluster_columns(df, figsize=(20,16), font_size=12):
    corr = np.round(sp.stats.spearmanr(df).correlation, 4)
    corr_condensed = hc.distance.squareform(1-corr)
    z = hc.linkage(corr_condensed, method='average')
    fig = plt.figure(figsize=figsize)
    hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=font_size)
    plt.show()
    
# cluster_columns(df_tmp_X)

## Predict

In [25]:
# Update me for each gw
predict_gw = 5

In [26]:
# Show train data to see correct format
df_tmp_X.head()

Unnamed: 0,round,name,team,opponent_team,was_home
0,1,Johan Dahlin,Malmö FF,Kalmar FF,False
1,2,Johan Dahlin,Malmö FF,IF Elfsborg,True
2,3,Johan Dahlin,Malmö FF,AIK,True
3,4,Johan Dahlin,Malmö FF,IFK Värnamo,False
4,1,Ismael Diawara,Malmö FF,Kalmar FF,False


### Get data to predict

In [27]:
df_all_player_fixtures = pd.read_csv('data/during-season/player_fixtures.csv')

In [28]:
df_all_player_fixtures

Unnamed: 0,id,code,team_h,team_h_score,team_a,team_a_score,event,finished,minutes,provisional_start_time,kickoff_time,event_name,is_home,element,name,team_id,opponent_team_id,team,opponent_team
0,39,2270066,1,,4,,5,False,0,False,2022-04-25T17:10:00Z,Omgång 5,True,1,Johan Dahlin,1,4,Malmö FF,IFK Göteborg
1,46,2270071,2,,1,,6,False,0,False,2022-05-02T17:10:00Z,Omgång 6,False,1,Johan Dahlin,1,2,Malmö FF,Hammarby
2,51,2270081,1,,14,,7,False,0,False,2022-05-07T15:30:00Z,Omgång 7,True,1,Johan Dahlin,1,14,Malmö FF,Mjällby AIF
3,58,2270148,12,,1,,7,False,0,False,2022-05-11T17:00:00Z,Omgång 7,False,1,Johan Dahlin,1,12,Malmö FF,AIK
4,66,2270086,7,,1,,8,False,0,False,2022-05-16T17:10:00Z,Omgång 8,False,1,Johan Dahlin,1,7,Malmö FF,Djurgården
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11305,207,2270234,3,,9,,26,False,0,False,2022-10-15T23:00:00Z,Omgång 26,False,435,Darrell Tibell,9,3,IFK Norrköping,IK Sirius
11306,213,2270239,9,,19,,27,False,0,False,2022-10-18T23:00:00Z,Omgång 27,True,435,Darrell Tibell,9,19,IFK Norrköping,Helsingborgs IF
11307,222,2270249,18,,9,,28,False,0,False,2022-10-22T23:00:00Z,Omgång 28,False,435,Darrell Tibell,9,18,IFK Norrköping,IFK Värnamo
11308,230,2270256,9,,7,,29,False,0,False,2022-10-29T23:00:00Z,Omgång 29,True,435,Darrell Tibell,9,7,IFK Norrköping,Djurgården


In [29]:
df_predict = df_all_player_fixtures[['event', 'name', 'team', 'opponent_team', 'is_home']]
df_predict = df_predict.rename(columns={'event': 'round', 'is_home': 'was_home'})
df_predict = df_predict[df_predict['round'] == predict_gw]
df_predict.head(20)

Unnamed: 0,round,name,team,opponent_team,was_home
0,5,Johan Dahlin,Malmö FF,IFK Göteborg,True
26,5,Ismael Diawara,Malmö FF,IFK Göteborg,True
52,5,Niklas Moisander,Malmö FF,IFK Göteborg,True
78,5,Jonas Knudsen,Malmö FF,IFK Göteborg,True
104,5,Lasse Nielsen,Malmö FF,IFK Göteborg,True
130,5,Eric Larsson,Malmö FF,IFK Göteborg,True
156,5,Felix Beijmo,Malmö FF,IFK Göteborg,True
182,5,Martin Olsson,Malmö FF,IFK Göteborg,True
208,5,Matej Chalus,Malmö FF,IFK Göteborg,True
234,5,Markus Björkqvist,Malmö FF,IFK Göteborg,True


In [30]:
df_predict_transformed = preprocess(df_predict)

### Make prediction

In [31]:
predicted = model.predict(df_predict_transformed)

In [32]:
df_predicted = pd.DataFrame(predicted)

In [36]:
df_predict.reset_index(drop=True, inplace=True)
df_predicted.reset_index(drop=True, inplace=True)
df_result = df_predict.join(df_predicted)
df_result.rename(columns={0: 'predicted_points'}, inplace=True)
df_result.sort_values(by='predicted_points', ascending=False, inplace=True)

# Print the top predicted points for round 4
df_result.head(20)

Unnamed: 0,round,name,team,opponent_team,was_home,predicted_points
108,5,Edvin Kurtulus,Hammarby,IK Sirius,False,8.94
126,5,Nahir Besara,Hammarby,IK Sirius,False,8.63
357,5,Noah Eile,Mjällby AIF,Varbergs BoIS,False,8.18
297,5,Oliver Berg,Kalmar FF,IFK Norrköping,True,7.66
78,5,Kalle Joelsson,Helsingborgs IF,BK Häcken,True,7.42
4,5,Lasse Nielsen,Malmö FF,IFK Göteborg,True,7.0
113,5,Mohanad Jeahze,Hammarby,IK Sirius,False,6.96
313,5,Sebastian Larsson,AIK,Djurgården,True,6.74
311,5,Alexander Milosevic,AIK,Djurgården,True,6.5
404,5,Dennis Hadzikadunic,Malmö FF,IFK Göteborg,True,6.38


In [37]:
df_result.tail()

Unnamed: 0,round,name,team,opponent_team,was_home,predicted_points
232,5,Frank Odhiambo,Djurgården,AIK,False,0.0
236,5,Melker Jonsson,Djurgården,AIK,False,0.0
237,5,Axel Wallenborg,Djurgården,AIK,False,0.0
434,5,Darrell Tibell,IFK Norrköping,Kalmar FF,False,0.0
421,5,Joe Corona,GIF Sundsvall,IF Elfsborg,False,-0.08


### Add "predicted point/cost"

In [34]:
# Get latest price for every player
player_last_gw = df_train.copy()
last_gw_indices = player_last_gw.groupby('name')['round'].agg(pd.Series.idxmax)
player_last_gw = player_last_gw.iloc[last_gw_indices]
player_last_gw.set_index('name', inplace=True)
df_result['price (M)'] = df_result['name'].map(player_last_gw['value']/10)

# Calculate points/price
df_result['predicted_point_per_price'] = df_result['predicted_points']/df_result['price (M)']
df_result.sort_values(by=['predicted_point_per_price'], ascending=False, inplace=True)
df_result.head(30)

Unnamed: 0,round,name,team,opponent_team,was_home,predicted_points,price (M),predicted_point_per_price
108,5,Edvin Kurtulus,Hammarby,IK Sirius,False,8.94,5.0,1.788
357,5,Noah Eile,Mjällby AIF,Varbergs BoIS,False,8.18,4.7,1.740426
78,5,Kalle Joelsson,Helsingborgs IF,BK Häcken,True,7.42,4.5,1.648889
54,5,Francis de Vries,IFK Värnamo,Degerfors IF,True,5.66,4.0,1.415
82,5,Viljormur Davidsen,Helsingborgs IF,BK Häcken,True,6.14,4.5,1.364444
55,5,Victor Larsson,IFK Värnamo,Degerfors IF,True,5.13,4.0,1.2825
111,5,Simon Sandberg,Hammarby,IK Sirius,False,6.29,5.0,1.258
283,5,David Kristjan Olafsson,Kalmar FF,IFK Norrköping,True,6.38,5.1,1.25098
4,5,Lasse Nielsen,Malmö FF,IFK Göteborg,True,7.0,5.6,1.25
365,5,Noah Persson,Mjällby AIF,Varbergs BoIS,False,5.38,4.5,1.195556
