In [None]:
import pandas as pd
import zipfile
import numpy as np

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

# Data Cleaning and Transformation

In [None]:
def read_datasets(zipf):
    """Load every CSV member of a zip archive into a DataFrame.

    Parameters
    ----------
    zipf : str, path-like or binary file object
        Anything ``zipfile.ZipFile`` accepts as its first argument.

    Returns
    -------
    dict
        Maps each CSV member's name without the ``.csv`` suffix
        (``"players_15"`` for ``players_15.csv``) to the DataFrame parsed
        from that member. Members that are not CSV files are ignored.
    """
    suffix = ".csv"
    datasets = {}
    with zipfile.ZipFile(zipf, "r") as arch:
        for datafile in arch.namelist():
            # Test the suffix rather than indexing split("."): members without
            # a dot (directories, README) would raise IndexError, and dotted
            # names such as "a.b.csv" would be silently missed.
            if not datafile.endswith(suffix):
                continue
            # Hidden members (e.g. macOS "._name.csv" resource forks) are not
            # real tables; skip them instead of trying to parse them.
            if datafile.rsplit("/", 1)[-1].startswith("."):
                continue
            # Close each member stream as soon as it has been parsed.
            with arch.open(datafile) as member:
                datasets[datafile[:-len(suffix)]] = pd.read_csv(member)
    return datasets

In [None]:
## 1. Check for dtype of two columns
def height_weight_dtype(datasets):
    """Print a line for each dataset whose height and weight columns are both int64.

    ``datasets`` maps a dataset name to a DataFrame with ``height_cm`` and
    ``weight_kg`` columns. A dataset where either column has another dtype
    produces no output. Nothing is returned.
    """
    checked_columns = ("height_cm", "weight_kg")
    for name, frame in datasets.items():
        # all() over a generator stops at the first mismatch, so the second
        # column is only looked up when the first one is int64.
        both_int64 = all(frame[column].dtype == 'int64' for column in checked_columns)
        if both_int64:
            print(f"{name} => Columns : Height and Weight are of type int64")

In [None]:
## 2. Break date column into multiple columns
def split_date_cols(datasets):
    """Split the ``joined`` column of every dataset into year, month and day columns.

    Each DataFrame in ``datasets`` is modified in place: ``joined_y``,
    ``joined_m`` and ``joined_d`` receive the first, second and third
    dash-separated piece of ``joined`` (as strings). The original column and
    the three new ones are then printed. Nothing is returned.

    NOTE(review): assumes ``joined`` holds ``YYYY-MM-DD`` strings — confirm
    against the raw files.
    """
    for dataset in datasets.values():
        # The pieces arrive in year, month, day order, so the target columns
        # must be listed in that same order; listing day before month swaps
        # the two values.
        dataset[["joined_y", "joined_m", "joined_d"]] = dataset["joined"].str.split("-", expand=True)
        print(dataset.loc[:, ["joined", "joined_y", "joined_m", "joined_d"]])

In [None]:
## 3. Clean and transform specific columns
def clean_and_transform_cols(datasets):
    """Fill missing monetary values with 0 and store those columns as integers.

    For every DataFrame in ``datasets`` the ``release_clause_eur``,
    ``wage_eur`` and ``value_eur`` columns have their missing values replaced
    by 0 and are converted to ``int64``. The frames are modified in place, so
    the dict keeps pointing at the same objects. Nothing is returned.

    Raises
    ------
    KeyError
        If a dataset lacks one of the three columns.
    """
    money_columns = ("release_clause_eur", "wage_eur", "value_eur")
    for dataset in datasets.values():
        for column in money_columns:
            # Write the converted column back into the frame: rebinding the
            # loop variable to the result of astype() would throw the
            # conversion away and leave the stored frame as float.
            dataset[column] = dataset[column].fillna(0).astype("int64")

In [None]:
# Load the CSV files packed in archive.zip into a dict of DataFrames.
datasets = read_datasets("archive.zip")

In [None]:
# Report which datasets store height_cm and weight_kg as int64.
height_weight_dtype(datasets)

In [None]:
# Add joined_y / joined_m / joined_d columns derived from "joined" (modifies the frames in place).
split_date_cols(datasets)

In [None]:
# Replace missing release_clause_eur / wage_eur / value_eur values with 0 in every dataset.
clean_and_transform_cols(datasets)


# Data Visualisation

1. AGE CLASSIFICATION

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
def age_classification_pie(datasets):
    """Count players per five-year age band in the first dataset.

    Only the first DataFrame of ``datasets`` (in the mapping's iteration
    order) is used. Its ``age`` column is cut into the right-closed bands
    (15, 20], (20, 25], ..., (45, 50] and the non-null ``sofifa_id`` entries
    of each band are counted.

    Returns
    -------
    tuple
        ``(values, labels)``: a numpy array with one count per band and the
        index of matching intervals, in band order.

    Raises
    ------
    ValueError
        If ``datasets`` is empty.
    """
    if not datasets:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError("datasets is empty; there is nothing to classify")
    dataset = next(iter(datasets.values()))
    age_bands = pd.cut(dataset['age'], np.arange(15, 55, 5))
    # observed=False keeps bands with no players in the result (count 0), so
    # the number of wedges does not depend on the pandas default.
    counts = dataset.groupby(age_bands, observed=False)['sofifa_id'].count()
    return counts.values, counts.index

In [None]:
# Per-band player counts and the matching age-interval labels.
values, labels = age_classification_pie(datasets)

In [None]:
# Pie of the age-band counts; the wedges carry no text, the legend names them.
fig, ax = plt.subplots(subplot_kw={"aspect": "equal"}, figsize=(5, 15))
wedges, texts = ax.pie(
    values,
    shadow=True,
    textprops={'size': 'smaller', 'color': 'black'},
)

# Share of each band in percent, rounded to two decimals.
pct = [round(value * 100 / values.sum(), 2) for value in values]

# One legend entry per band: "<interval> - <share>%".
labels_customised = [f"{band} - {share}%" for band, share in zip(labels, pct)]

# Anchor the legend to the right of the axes so it does not cover the pie.
ax.legend(
    wedges,
    labels_customised,
    bbox_to_anchor=(1, 0, 0.5, 1),
    loc="center left",
    title="Age Classification among FIFA Players",
)


2. TOP 10 HIGHEST PAID PLAYERS IN A GIVEN YEAR

In [None]:
def top10_highest_paid(year, datasets):
    """Draw a bar chart of the ten highest wages in one year's dataset.

    Parameters
    ----------
    year : str or int
        Year such as ``"2019"``; its last two characters select the
        ``players_<yy>`` entry of ``datasets``.
    datasets : dict
        Maps ``players_<yy>`` keys to DataFrames with ``short_name`` and
        ``wage_eur`` columns.

    Raises
    ------
    KeyError
        If ``datasets`` has no entry for ``year``.
    """
    # Typed input may carry surrounding whitespace, and callers may pass an int.
    year = str(year).strip()
    dataset_key = f"players_{year[-2:]}"
    if dataset_key not in datasets:
        raise KeyError(
            f"No dataset for year {year!r} (looked for {dataset_key!r}); "
            f"available: {list(datasets)}"
        )

    top10_df = datasets[dataset_key].nlargest(10, 'wage_eur')[['short_name', 'wage_eur']]

    fig, ax = plt.subplots(figsize=(15, 5))
    ax.bar(top10_df["short_name"], top10_df["wage_eur"], color='maroon', width=0.4)
    ax.set_xlabel(f"Top 10 Players of {year}")
    ax.set_ylabel("Salary")
    ax.set_title(f"Top 10 highest paid players of {year}")
    
    
# Interactive prompt: this cell waits for a year to be typed, so it cannot run unattended.
year = input("Please provide the year between 2015 - 2021: ")   
top10_highest_paid(year, datasets)

    

3. CHANGE IN ATTRIBUTES OF A PLAYER OVER THE YEARS

In [None]:
# Names of the six columns at positions 33-38, read from the 2015 dataset.
# NOTE(review): assumes every dataset has the same columns at these positions — confirm.
legend_items = datasets["players_15"].keys()[33:39]

# attribute name -> one value per dataset, in the dict's iteration order
attributes = {}

for i,dataset in datasets.items():
    df = dataset[dataset['short_name']=='Cristiano Ronaldo'].iloc[:,33:39]
    for x,y in df.items():
        # A dataset with no matching row leaves y empty; store NaN instead of
        # failing on y.values[0], so every list keeps one entry per dataset.
        attributes.setdefault(x,[]).append(y.values[0] if len(y) else np.nan)

print(attributes)

X_axis = np.arange(len(legend_items))



# Draft plotting code, left disabled inside a string literal.
'''
for attribute,values in attributes.items():
    plt.bar(X_axis - 0.2, values, 0.4, label = attribute)
plt.xticks(X_axis, legend_items)
plt.xlabel("Attributes")
plt.ylabel("Values")
plt.title("Change in attributes")
plt.legend()
'''
