In [None]:
import os
import pandas as pd
pd.options.display.max_rows = 5
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [None]:
# Input files are read from a "data" folder located one level above the
# notebook's current working directory.
parent_relative = os.path.join(os.getcwd(), os.pardir, 'data')
data_path = os.path.abspath(parent_relative)
data_path

#### Read and clean sofifa players data

In [None]:
# Load the raw sofifa player snapshots into `df`.
players_csv = os.path.join(data_path, 'sofifa_players_201314to201718_Jan2019.csv')
df = pd.read_csv(players_csv)

In [None]:
# Peek at the first two raw rows (column headers are still the ones from the CSV).
df.head(2)

In [None]:
# Rename the columns positionally (the CSV must therefore have exactly these
# 16 columns, in this order) and keep only the ones used by the analysis.
raw_column_names = [
    'id', 'date', 'name1', 'name2', 'country', 'age',
    'pos1', 'pos2', 'pos3', 'overall_rating', 'potential',
    'value', 'wage', 'special_total', 'team', 'contract',
]
kept_columns = [
    'date', 'name2', 'team', 'country', 'age',
    'pos1', 'overall_rating', 'potential', 'value', 'wage',
]
df.columns = raw_column_names
df = df[kept_columns]

In [None]:
# Check the renamed / reduced column layout on the first two rows.
df.head(2)

#### Read and process sofifa club transfer balance data

In [None]:
# Load the per-club transfer balances and discard every row that has a missing cell.
transfer_xlsx = os.path.join(data_path, 'PL_clubs_transfer_balances.xlsx')
bal = pd.read_excel(transfer_xlsx, sheet_name='transfer_budgets_processed').dropna()
bal.head(2)

There are two comparisons to be made for each team:
1. Change in objective function value over the years: the actual team's (A1) vs. the optimal team's (O1).
2. Transfer market balance over the years: the actual team's (A2) vs. the optimal team's (O2).

O1 and O2 can be calculated from the results given by the optimization model.

The "bal" table gives A2 directly. A1 needs to be calculated from the "df" table, as follows.

##### 1. Get team compositions of 13 clubs, before and after each of the 12 transfer windows

In [None]:
# All distinct team names that occur in the player data.
df['team'].unique()

In [None]:
# Club names as spelled in the transfer-balance table (to compare with df's team names).
bal['clubs'].values

In [None]:
# select player details of PL teams
# note that df team names do not have " FC" suffix, hence remove " FC" from the
# club names of the balance table before matching
pl_team_names = bal['clubs'].str.replace(" FC", "")
# .copy() makes df_pl an independent frame: later cells assign columns on df_pl,
# which would otherwise be chained assignment on a slice of df
# (SettingWithCopyWarning, and no guarantee the write is applied as intended).
df_pl = df[df['team'].isin(pl_team_names)].copy()
df_pl['team'].unique()

In [None]:
# First two rows of the PL-only subset, before wage/value/date are parsed.
df_pl.head(2)

In [None]:
# process df_pl date, wage and value
def _euro_amount_to_float(amounts):
    """Convert money strings such as '€1.5M' or '€200K' to floats in Euros.

    Parameters
    ----------
    amounts : pandas.Series
        Strings made of an optional '€', a number and an optional 'K'
        (thousand) or 'M' (million) suffix. A series that is already numeric
        is returned unchanged, so the cell can be re-run safely.

    Returns
    -------
    pandas.Series
        Float amounts; missing inputs stay NaN.

    Raises
    ------
    ValueError
        If a suffix other than 'K' or 'M' is found.
    """
    if pd.api.types.is_numeric_dtype(amounts):
        return amounts
    scale = {'K': 10**3, 'M': 10**6}
    without_symbol = amounts.str.replace('€', '')
    suffix = without_symbol.str.extract(r'[\d\.]+([KM]+)', expand=False)
    unknown = suffix.notna() & ~suffix.isin(list(scale))
    if unknown.any():
        raise ValueError('unexpected amount suffix: {}'.format(sorted(suffix[unknown].unique())))
    number = without_symbol.replace(r'[KM]+$', '', regex=True).astype(float)
    # amounts without a suffix are already in Euros -> multiplier 1
    return number * suffix.map(scale).fillna(1)


# assign() builds a new frame, so nothing is written into a slice of df
df_pl = df_pl.assign(
    date=df_pl['date'].str[-14:],  # keep only the last 14 characters of the raw date string
    value=_euro_amount_to_float(df_pl['value']),
    wage=_euro_amount_to_float(df_pl['wage']),
)
df_pl.head(2)

In [None]:
# process df_pl date-time
# Rows with any missing field are dropped (preferred here over filling them in).
df_pl = df_pl.dropna()
# Parse the date strings, order the rows chronologically and renumber them 0..n-1.
df_pl = (
    df_pl.assign(date=pd.to_datetime(df_pl['date']))
    .sort_values(by='date')
    .reset_index(drop=True)
)
# Calendar parts of each snapshot date, used later to derive seasons.
df_pl['month'] = df_pl['date'].dt.month
df_pl['year'] = df_pl['date'].dt.year
# Distinct player names in the PL subset.
df_pl_names = df_pl['name2'].unique()
df_pl.tail(2)

In [None]:
# alphabetically sorted array of the distinct clubs
clubs = np.sort(df_pl['team'].unique())
# distinct snapshot dates (time periods), in order of first appearance in df_pl
times = df_pl['date'].unique()

In [None]:
# Oldest player age in the PL subset; used to scale the age factor of the market value.
max_age = df_pl['age'].max()

In [None]:
# Largest single-player value in the PL subset (sanity check of the parsing).
df_pl['value'].max()

In [None]:
# calculate market value and salary expenses for each time of each PL club
WEEKS_PER_HALF_SEASON = 26  # wages are weekly; each snapshot is charged for half a season

# decide later whether to multiply market value and salary expense with weight
player_mkt_val = (
    df_pl['value']  # player p's market value
    # multiplication factor based on potential and rating
    * (1 + (df_pl['potential'] - df_pl['overall_rating']) / df_pl['overall_rating'])
    # multiplication factor based on age relative to the oldest player
    * (1 + (max_age - df_pl['age']) / max_age)
)

# One vectorised group-by instead of filtering df_pl once per (club, time) pair,
# writing into the filtered slice and growing the result frame row by row.
club_time_totals = (
    df_pl.assign(mkt_val=player_mkt_val)
    .groupby(['team', 'date'])[['mkt_val', 'wage']]
    .sum()
)

# Keep one row for every club x time combination, club-major like the nested
# loops did; combinations without any player get totals of 0.
all_pairs = pd.MultiIndex.from_product([clubs, times], names=['club', 'time'])
mkt_sly = club_time_totals.reindex(all_pairs, fill_value=0).reset_index()
# salary expenses (half-season wage = weekly wage * 26)
mkt_sly['sly_exp'] = WEEKS_PER_HALF_SEASON * mkt_sly['wage']
mkt_sly = mkt_sly[['club', 'time', 'mkt_val', 'sly_exp']]

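Only two time periods are shortlisted from each year, so each retained snapshot stands for one half-season. The weekly wage is therefore multiplied by 26 (the number of weeks in a half-season) to obtain the salary expense for that period.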

In [None]:
# Show the per-club, per-snapshot totals (display is truncated by pd.options.display.max_rows).
mkt_sly

In [None]:
# First 10 characters (the YYYY-MM-DD part) of every snapshot timestamp, as plain strings.
times_str = [stamp[:10] for stamp in times.astype(str)]

In [None]:
# Snapshot dates that represent the four transfer windows analysed below.
window_dates = ['2016-09-20', '2017-02-14', '2017-09-18', '2018-02-15']
time_windows = pd.to_datetime(window_dates)

In [None]:
# keep only the rows whose snapshot date is one of the required time windows
in_window = mkt_sly['time'].isin(time_windows)
mkt_sly = mkt_sly.loc[in_window]

In [None]:
# per time period, average the market value and the salary expenses over all clubs
mkt_sly_means = (
    mkt_sly.groupby(['time'])
    .agg({'mkt_val': 'mean', 'sly_exp': 'mean'})
    .reset_index()
)
mkt_sly_means

In [None]:
# load the model's expected market values (sheet 'exp_market_value') for plotting
model_results_xlsx = os.path.join(data_path, 'results_Feb2019_v6_modified.xlsx')
mkt_val_model = (
    pd.read_excel(model_results_xlsx, sheet_name='exp_market_value')
    # the first data row is skipped
    # NOTE(review): presumably the starting state before the first window - confirm
    .iloc[1:, :]
    # assign() returns a new frame, so the column is not written into an iloc slice
    # (avoids SettingWithCopyWarning); values are matched to rows by position and
    # the number of remaining rows must equal len(time_windows)
    .assign(time=time_windows)
)
mkt_val_model

In [None]:
# x-axis tick labels, one per entry of time_windows (same order).
# NOTE(review): 's' / 'w' presumably stand for the summer / winter window - confirm.
xticklabs = ['2016s', '2016w', '2017s', '2017w'] 

In [None]:
# Font size passed as `fontsize=fs` to the labels, ticks, titles and legends of the plots below.
fs = 12 # fontsize

In [None]:
# plot market values: model vs. actual, one panel per club
n_cols = 4
# ceiling division: enough rows for every club (3 rows for up to 12 clubs, as before)
n_rows = max(1, -(-len(clubs) // n_cols))
# squeeze=False keeps `ax` two-dimensional even when there is a single row
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 10 * n_rows / 3), squeeze=False)
plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.3, hspace=0.3)

# Reduce per column first, then across columns. min(np.min(DataFrame)) is avoided
# because np.min(DataFrame) already returns a scalar on pandas >= 2.0, which
# makes the outer built-in min()/max() raise TypeError.
min_mkt_val_model = mkt_val_model[clubs].min().min()
max_mkt_val_model = mkt_val_model[clubs].max().max()

# Common y-range for all panels (identical for every club, so computed once).
y_low = 0.9 * min(min_mkt_val_model, mkt_sly['mkt_val'].min(),
                  mkt_val_model['Mean'].min(), mkt_sly_means['mkt_val'].min())
y_high = 1.1 * max(max_mkt_val_model, mkt_sly['mkt_val'].max(),
                   mkt_val_model['Mean'].max(), mkt_sly_means['mkt_val'].max())

for c, club in enumerate(clubs):
    panel = ax[c // n_cols, c % n_cols]
    mkt_sly_temp = mkt_sly[mkt_sly['club'] == club]
    mkt_val_model_temp = mkt_val_model[['time', club, 'Mean']]
    # The four lines are drawn in the same order as the legend entries below.
    panel.plot(mkt_val_model_temp['time'], mkt_val_model_temp[club], marker='s')
    panel.plot(mkt_sly_temp['time'], mkt_sly_temp['mkt_val'], marker='o')
    panel.plot(mkt_val_model_temp['time'], mkt_val_model_temp['Mean'], linewidth=3, linestyle='--', marker='s')
    panel.plot(mkt_sly_means['time'], mkt_sly_means['mkt_val'], linewidth=3, linestyle='--', marker='o')
    panel.set_xlabel('transfer windows', fontsize=fs)
    panel.set_xticks(time_windows)
    panel.set_xticklabels(xticklabs, fontsize=fs)
    panel.set_ylabel('market value in Euros', fontsize=fs)
    panel.set_ylim(y_low, y_high)
    panel.set_title(club, fontsize=fs)
    if c == 0:
        panel.legend(['club (model)', 'club (actual)', 'mean (model)', 'mean (actual)'], fontsize=fs)

# Hide grid cells that have no club assigned to them.
for spare in ax.flat[len(clubs):]:
    spare.set_visible(False)
# fig.savefig('mkt_val_comparison.eps')
# fig.savefig('mkt_val_comparison.png')

In [None]:
# # plot salary expenses
# fig, ax = plt.subplots(3, 4, figsize=(16,12))
# for c, club in enumerate(clubs):
#     mkt_sly_temp = mkt_sly[mkt_sly['club']==club]
#     ax[c//4, c%4].plot(mkt_sly_temp['time'], mkt_sly_temp['sly_exp'], marker='o')
#     ax[c//4, c%4].set_xlabel('times', fontsize=fs)
#     ax[c//4, c%4].set_xticks(time_windows)
#     ax[c//4, c%4].set_xticklabels(xticklabs, fontsize=fs)
#     ax[c//4, c%4].set_ylabel('salary expenses in Euros', fontsize=fs)
#     ax[c//4, c%4].set_ylim(0.95*min(mkt_sly['sly_exp']), 1.05*max(mkt_sly['sly_exp']))
#     ax[c//4, c%4].plot(mkt_sly_means['sly_exp'], linewidth=3, linestyle='--', marker='o')
#     ax[c//4, c%4].set_title(club, fontsize=fs)
#     if c==0:
#         ax[c//4, c%4].legend(['club', 'mean'], fontsize=fs)

In [None]:
# Read the model's objective function values (sheet 'obj_val') and tag each row,
# by position, with its transfer-window date.
results_xlsx = os.path.join(data_path, 'results_Feb2019_v6_modified.xlsx')
obj_val = pd.read_excel(results_xlsx, sheet_name='obj_val')
obj_val['time'] = time_windows
obj_val

In [None]:
# # plot objective function values
# fig, ax = plt.subplots(3, 4, figsize=(12,10))
# plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.3, hspace=0.3)
# min_obj_val = min(np.min(mkt_val_model[clubs]))
# max_obj_val = max(np.max(mkt_val_model[clubs]))
# for c, club in enumerate(clubs):
#     obj_val_temp = obj_val[['time', club, 'Mean']]
#     ax[c//4, c%4].plot(obj_val_temp['time'], obj_val_temp[club], marker='s')
#     ax[c//4, c%4].set_xlabel('transfer windows', fontsize=fs)
#     ax[c//4, c%4].set_xticks(time_windows)
#     ax[c//4, c%4].set_xticklabels(xticklabs, fontsize=fs)
#     ax[c//4, c%4].plot(obj_val_temp['time'], obj_val_temp['Mean'], linewidth=3, linestyle='--', marker='s')
#     ax[c//4, c%4].set_ylabel('objective function value', fontsize=fs)
#     ax[c//4, c%4].set_ylim(0.9*min(min_obj_val, min(mkt_val_model_temp['Mean'])), 
#                            1.1*max(max_obj_val, max(obj_val_temp['Mean'])))
#     ax[c//4, c%4].set_title(club, fontsize=fs)
#     if c==0:
#         ax[c//4, c%4].legend(['club', 'mean'], fontsize=fs)
# fig.savefig('obj_val_comparison.eps')
# fig.savefig('obj_val_comparison.png')

#### Analysis relating market value to team performance

In [None]:
# Total player value per snapshot date and team.
df_pl_summary = df_pl.groupby(['date', 'team'])[['value']].sum()
df_pl_summary

In [None]:
# Turn the (date, team) index back into columns and add the calendar month / year of each snapshot.
df_pl_summary = df_pl_summary.reset_index()
snapshot_dates = df_pl_summary['date'].dt
df_pl_summary = df_pl_summary.assign(month=snapshot_dates.month, year=snapshot_dates.year)
df_pl_summary

In [None]:
# group by seasons (average values)
# Feb/May snapshots are labelled "<year-1>-<year>", Sep/Dec snapshots
# "<year>-<year+1>"; any other month falls back to np.select's default (0).
snapshot_month = df_pl_summary['month']
snapshot_year = df_pl_summary['year']
year_label = snapshot_year.astype(str)
conditions = [snapshot_month.isin([2, 5]),
              snapshot_month.isin([9, 12])]
choices = [(snapshot_year - 1).astype(str) + '-' + year_label,
           year_label + '-' + (snapshot_year + 1).astype(str)]
df_pl_summary['season'] = np.select(conditions, choices)
# mean squad value of each team within each season
df_pl_summary = (
    df_pl_summary.groupby(['season', 'team'])
    .agg({'value': 'mean'})
    .reset_index()
)
df_pl_summary

In [None]:
# retain only the seasons for which good data is available: '2013-2014' ... '2017-2018'
seasons = ['{}-{}'.format(start, start + 1) for start in range(2013, 2018)]
df_pl_summary = df_pl_summary.loc[df_pl_summary['season'].isin(seasons)]

In [None]:
# Mean of the seasonal squad values per team, over the retained seasons.
df_pl_summary_avg = (
    df_pl_summary.groupby(['team'])
    .agg({'value': 'mean'})
    .reset_index()
)
df_pl_summary_avg

In [None]:
# load PL standings (table) data, dropping rows with any missing cell
standings_xlsx = os.path.join(data_path, 'PL_standings.xlsx')
pl_table = pd.read_excel(standings_xlsx, sheet_name='summary').dropna()
# cast the per-season columns to int; every other column is left as read
season_dtypes = {col: int for col in pl_table.columns if col in seasons}
# order by club name and renumber the rows 0..n-1
pl_table = (
    pl_table.astype(season_dtypes)
    .sort_values(['clubs'])
    .reset_index(drop=True)
)
pl_table

In [None]:
# 1. comparison of average market value and performance across all 5 seasons
# Ordinary least squares of pl_table['average'] on the average squad value plus
# an intercept. Both inputs carry a 0..n-1 index, so rows are paired by position;
# this relies on both tables listing the same clubs in the same (sorted) order.
X = df_pl_summary_avg['value']  # regressor: average squad value per team
y = pl_table['average']         # response: the standings table's 'average' column
X2 = sm.add_constant(X)         # adds the intercept column
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

In [None]:
# # to get the attributes in the results object
# dir(est2)

In [None]:
# p-values of the fitted coefficients (intercept and market value) from the regression above.
est2.pvalues

In [None]:
# # 2. comparison of average market value and performance across all 5 seasons
# # create data frame for performance v/s market value comparison 
# d = {'clubs': clubs, 
#      'market value in million euros': df_pl_summary_avg['value']/1000000, 
#      'average position': pl_table['average']}
# perf_val = pd.DataFrame(data=d)
# perf_val
# # perf_val.to_excel("perf_val.xlsx")

# for s, season in enumerate(seasons):
#     X = df_pl_summary[df_pl_summary['season']==season]['value']
#     X.index = range(len(X))
# #     print(X)
#     y = pl_table[season]
# #     print(y)
#     X2 = sm.add_constant(X)
#     est = sm.OLS(y, X2)
#     est2 = est.fit()
#     print(est2.summary())