In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
# Read the player export into a DataFrame; the path is resolved relative to
# the notebook's working directory.
raw_data = pd.read_csv(filepath_or_buffer="data/FIFA22_official_data.csv")
raw_data

In [None]:
# Work on a trimmed copy of the raw frame: the columns listed here are removed,
# and raw_data itself is left untouched because drop() returns a new frame.
unused_columns = [
    "Photo",
    "Flag",
    "Club Logo",
    "Special",
    "International Reputation",
    "Body Type",
    "Real Face",
    "Joined",
    "Loaned From",
    "Contract Valid Until",
    "Release Clause",
    "Best Position",
    "Best Overall Rating",
    "Potential",
]
data = raw_data.drop(columns=unused_columns)
data

In [None]:
# Histogram of player ages with the mean age marked by a dashed vertical line.
#
# The bin edges are derived from the observed age range (one unit-wide bin
# centred on each whole year) instead of a hard-coded bin count, so the bars
# stay aligned if the underlying data changes.
# NOTE(review): assumes Age holds whole years - confirm against the CSV.
ages = data["Age"].dropna()
age_bin_edges = np.arange(ages.min(), ages.max() + 2) - 0.5
average_age = data["Age"].mean()

plt.hist(ages, bins=age_bin_edges)
# ":.2f" (no space flag) so the legend reads "(25.31)" rather than "( 25.31)".
plt.axvline(x=average_age, color="red", linestyle="--", linewidth=4, label=f"Average Age ({average_age:.2f})")
plt.xlabel("Age")
plt.ylabel("Number of players")
plt.legend()
plt.title("Distribution of player age", fontsize=18)
plt.show();

In [None]:
# Order the players from oldest to youngest and keep the first ten rows.
oldest_ten_players = data.sort_values(by="Age", ascending=False).iloc[:10]

# Plain-text listing of name and age, one player per line.
print("The 10 oldest players are:")
for name, age in zip(oldest_ten_players["Name"], oldest_ten_players["Age"]):
    print(f"{name} - {age}")

oldest_ten_players

In [None]:
# Strip every run of digits out of the player names, then trim whatever
# whitespace is left at the start of each name.
names_without_digits = data["Name"].str.replace(r'\d+', '', regex=True)
data["Name"] = names_without_digits.str.lstrip()

# Rebuild the oldest-ten listing so it shows the cleaned names.
oldest_ten_players = data.sort_values(by="Age", ascending=False).iloc[:10]
print("The 10 oldest players are:")
for name, age in zip(oldest_ten_players["Name"], oldest_ten_players["Age"]):
    print(f"{name} - {age}")

In [None]:
# Restrict the analysis to non-goalkeepers and remove the goalkeeper-only
# attribute columns.
#
# The rows are filtered with a boolean mask and the result is rebound to
# `data` instead of mutating the frame in place. errors="ignore" turns the
# column drop into a no-op when the GK columns are already gone, so re-running
# this cell no longer raises a KeyError.
# NOTE(review): only rows whose Position is exactly "GK" are removed - confirm
# that the column holds bare position codes.
goalkeeper_columns = ["GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"]
data = data.loc[data["Position"] != "GK"].drop(columns=goalkeeper_columns, errors="ignore")

In [None]:
# Mean Overall rating per age, flattened back into a two-column frame.
age_rating_avg = (
    data
    .groupby("Age")["Overall"]
    .mean()
    .reset_index()
)

# Faint scatter of every player, overlaid with the per-age average in red.
fig, ax = plt.subplots()
ax.scatter(data["Age"], data["Overall"], s=5, alpha=0.08)
ax.plot(
    age_rating_avg['Age'],
    age_rating_avg['Overall'],
    color='red',
    linewidth=2,
    label='Average Overall per Age',
)
ax.set_xlabel("Age")
ax.set_ylabel("Overall")
ax.set_title("Comparison of overall rating against player age")
ax.legend()
plt.show();

In [None]:
# Compare Finishing across the 'Weak Foot' groups: one box per group, plus a
# Kruskal-Wallis H-test across the groups whose p-value goes into the title.
#
# stats.kruskal is the rank-based Kruskal-Wallis test, not a one-way ANOVA, so
# the title and the statistic's name now say so.
# NOTE(review): the follow-up cell runs Tukey HSD, the usual post-hoc for
# one-way ANOVA; if ANOVA was the intent here, use stats.f_oneway instead.

# Rows missing either value are dropped so a NaN cannot turn the test result
# into NaN.
weak_foot_finishing = data[["Weak Foot", "Finishing"]].dropna()

sns.boxplot(x="Weak Foot", y="Finishing", data=weak_foot_finishing)
plt.xlabel("Weak Foot")
plt.ylabel("Finishing")

# One Finishing sample per distinct Weak Foot value.
groups = weak_foot_finishing["Weak Foot"].unique()
grouped_data = [
    weak_foot_finishing.loc[weak_foot_finishing["Weak Foot"] == group, "Finishing"]
    for group in groups
]
H, p = stats.kruskal(*grouped_data)

# Three significant digits keep the title readable for very small p-values.
plt.title(f"Distribution of all 'Weak Foot' groups (Kruskal-Wallis, $p = {p:.3g}$)")
plt.show();

In [None]:
# Tukey HSD pairwise comparison of Finishing between the 'Weak Foot' groups.
#
# Rows missing either value are removed first, so the scores and the group
# labels stay aligned and a NaN cannot propagate into the comparison.
finishing_by_weak_foot = data[["Finishing", "Weak Foot"]].dropna()
pairwise_comparison = pairwise_tukeyhsd(
    finishing_by_weak_foot["Finishing"],
    finishing_by_weak_foot["Weak Foot"],
)
print(pairwise_comparison)

In [None]:
# Compare Dribbling across the 'Skill Moves' groups: one box per group, plus a
# Kruskal-Wallis H-test across the groups whose p-value goes into the title.
#
# stats.kruskal is the rank-based Kruskal-Wallis test, not a one-way ANOVA, so
# the title and the statistic's name now say so.
# NOTE(review): the follow-up cell runs Tukey HSD, the usual post-hoc for
# one-way ANOVA; if ANOVA was the intent here, use stats.f_oneway instead.

# Rows missing either value are dropped so a NaN cannot turn the test result
# into NaN.
skill_moves_dribbling = data[["Skill Moves", "Dribbling"]].dropna()

sns.boxplot(x="Skill Moves", y="Dribbling", data=skill_moves_dribbling)
plt.xlabel("Skill Moves")
plt.ylabel("Dribbling")

# One Dribbling sample per distinct Skill Moves value.
groups = skill_moves_dribbling["Skill Moves"].unique()
grouped_data = [
    skill_moves_dribbling.loc[skill_moves_dribbling["Skill Moves"] == group, "Dribbling"]
    for group in groups
]
H, p = stats.kruskal(*grouped_data)

# Three significant digits keep the title readable for very small p-values.
plt.title(f"Distribution of all 'Skill Moves' groups (Kruskal-Wallis, $p = {p:.3g}$)")
plt.show();

In [None]:
# Tukey HSD pairwise comparison of Dribbling between the 'Skill Moves' groups.
#
# Rows missing either value are removed first, so the scores and the group
# labels stay aligned and a NaN cannot propagate into the comparison.
dribbling_by_skill_moves = data[["Dribbling", "Skill Moves"]].dropna()
pairwise_comparison = pairwise_tukeyhsd(
    dribbling_by_skill_moves["Dribbling"],
    dribbling_by_skill_moves["Skill Moves"],
)
print(pairwise_comparison)

In [None]:
# Pearson correlation between SprintSpeed and Age; the coefficient is shown
# in the plot title.
test = stats.pearsonr(data["SprintSpeed"], data["Age"])

# Scatter of SprintSpeed against Age with a red fitted regression line.
sns.regplot(
    data=data,
    x="Age",
    y="SprintSpeed",
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'red'},
)
plt.title(f"Correlation between Age and SprintSpeed ($R = {test.statistic:.3f}$)")
plt.show();

In [None]:
# Pearson correlation between Age and Acceleration; the coefficient is shown
# in the plot title.
test = stats.pearsonr(data["Age"], data["Acceleration"])

# Scatter of Acceleration against Age with a red fitted regression line.
sns.regplot(
    data=data,
    x="Age",
    y="Acceleration",
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'red'},
)
plt.title(f"Correlation between Age and Acceleration ($R = {test.statistic:.3f}$)")
plt.show();

In [None]:
# Scatter of Acceleration against SprintSpeed with a red fitted regression line.
sns.regplot(
    data=data,
    x="SprintSpeed",
    y="Acceleration",
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'red'},
)

# Dashed y = x reference line running from 1 to 100.
identity_line = np.linspace(1, 100, 100)
plt.plot(identity_line, identity_line, color="purple", linestyle="dashed")

# Pearson correlation between the two attributes, reported in the title.
test = stats.pearsonr(data["SprintSpeed"], data["Acceleration"])
plt.title(f"Correlation between SprintSpeed and Acceleration ($R = {test.statistic:.3f}$)")
plt.show();

In [None]:
# Build the frame that feeds the correlation matrix: drop the columns listed
# here and keep everything else. `data` is left unchanged.
excluded_columns = [
    "ID",
    "Name",
    "Nationality",
    "Overall",
    "Club",
    "Value",
    "Wage",
    "Preferred Foot",
    "Weak Foot",
    "Skill Moves",
    "Work Rate",
    "Jersey Number",
    "Height",
    "Weight",
    "Position",
    "Age",
    "Marking",
]
stats_no_label = data.drop(columns=excluded_columns)
stats_no_label

In [None]:
# Pairwise correlation between the columns of stats_no_label, using
# DataFrame.corr() with its default settings.
correlation_matrix = stats_no_label.corr()

# Large canvas so the annotated cells stay legible.
fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix Heatmap')
plt.show();