# Sono più importanti per la classifica elements o components?

**L'immagine usata nella presentazione si trova alla sezione 2**

Questo file presenta l'excursus per arrivare alla soluzione della domanda 3.

## 1. Sistemo dataset

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
data = pd.read_csv("data/final.csv")
print(data.shape) #214531, 19 rows, cols
data.head(1)

(214491, 19)


Unnamed: 0,event,program,performance_id,athlete_name,athlete_nation,rank,total_segment_score,total_element_score,total_component_score,total_deductions,aspect_id,aspect_desc,element_order,element_base_value,section,aspect_final_score,judge_name,judge_nation,judge_score
0,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,Evgenia MEDVEDEVA,RUS,1,78.92,42.0,36.92,0.0,00034b9414,Transitions,,,components,9.07,Agita ABELE,LAT,9.0


Colonne importanti:
* rank 
* total_segment_score	
* total_element_score	
* total_component_score

Righe importanti per performance_id $\implies$ elimino doppioni

In [3]:
# Elimino le righe con stessa performance_id:
data = data.drop_duplicates(subset=['performance_id']).reset_index()

# Elimino le colonne che non mi servono più
data.drop('section', axis=1, inplace=True)
data.drop('total_deductions', axis=1, inplace=True)
data.drop('aspect_id', axis=1, inplace=True)
data.drop('aspect_desc', axis=1, inplace=True)
data.drop('element_order', axis=1, inplace=True)
data.drop('element_base_value', axis=1, inplace=True)
data.drop('aspect_final_score', axis=1, inplace=True)
data.drop('judge_name', axis=1, inplace=True)
data.drop('athlete_name', axis=1, inplace=True)
data.drop('athlete_nation', axis=1, inplace=True)
data.drop('judge_nation', axis=1, inplace=True)
data.drop('judge_score', axis=1, inplace=True)

data.head(1)

Unnamed: 0,index,event,program,performance_id,rank,total_segment_score,total_element_score,total_component_score
0,0,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,1,78.92,42.0,36.92


Creo le nuove 4 categorie:

In [4]:
# quali sono le categorie di program?
data['program'].value_counts()

MEN SHORT PROGRAM               246
LADIES SHORT PROGRAM            244
MEN FREE SKATING                218
LADIES FREE SKATING             218
ICE DANCE SHORT DANCE           207
ICE DANCE FREE DANCE            185
PAIRS SHORT PROGRAM             164
PAIRS FREE SKATING              150
JUNIOR ICE DANCE SHORT DANCE     12
JUNIOR MEN FREE SKATING          12
JUNIOR MEN SHORT PROGRAM         12
JUNIOR PAIRS SHORT PROGRAM       12
JUNIOR PAIRS FREE SKATING        12
JUNIOR ICE DANCE FREE DANCE      12
JUNIOR LADIES FREE SKATING       11
JUNIOR LADIES SHORT PROGRAM      11
Name: program, dtype: int64

In [5]:
# creo una colonna caregory in cui sono presenti le 8 categorie: (Ladies, men, pairs, ice_dance) x short + free
# create a list of our conditions
conditions = [
    
    (data['program'] == "LADIES SHORT PROGRAM") |
    (data['program'] == "JUNIOR LADIES SHORT PROGRAM"),

    (data['program'] == "MEN SHORT PROGRAM") |
    (data['program'] == "JUNIOR MEN SHORT PROGRAM"),

    (data['program'] == "PAIRS SHORT PROGRAM") |
    (data['program'] == "JUNIOR PAIRS SHORT PROGRAM"),

    (data['program'] == "ICE DANCE SHORT DANCE") |
    (data['program'] == "JUNIOR ICE DANCE SHORT DANCE"),

    (data['program'] == "ICE DANCE FREE DANCE") |
    (data['program'] == "JUNIOR ICE DANCE FREE DANCE"),

    (data['program'] == "JUNIOR LADIES FREE SKATING") |
    (data['program'] == "LADIES FREE SKATING") ,

    (data['program'] == "MEN FREE SKATING") |
    (data['program'] == "JUNIOR MEN FREE SKATING"),

    (data['program'] == "PAIRS FREE SKATING") |
    (data['program'] == "JUNIOR PAIRS FREE SKATING")

    ]

# create a list of the values we want to assign for each condition
values = ['Ladies short','Men short','Pairs short','Ice dance short', 'Ice dance free', 'Ladies free','Men free','Pairs free']

# create a new column and use np.select to assign values to it using our lists as arguments
data['category'] = np.select(conditions, values)

# display updated DataFrame
data.head()

Unnamed: 0,index,event,program,performance_id,rank,total_segment_score,total_element_score,total_component_score,category
0,0,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,1,78.92,42.0,36.92,Ladies short
1,12,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,4764e4873e,24,45.97,24.77,21.2,Ladies short
2,24,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,877e5c34bb,2,74.39,39.9,34.49,Ladies short
3,36,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,53868b204a,17,51.47,29.04,22.43,Ladies short
4,48,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,da4b091d04,3,72.4,36.8,35.6,Ladies short


In [6]:
# Calcola quante categorie presenta la colonna 'rank' in 'data' per ogni 'event' e 'program':
ranking = data.groupby(['event','program'])['rank'].nunique() #Count number of distinct elements in specified axis
ranking.head(20)

event                                           program                     
Grand Prix Final 2017 Senior and Junior         ICE DANCE FREE DANCE             6
                                                ICE DANCE SHORT DANCE            6
                                                JUNIOR ICE DANCE FREE DANCE      6
                                                JUNIOR ICE DANCE SHORT DANCE     6
                                                JUNIOR LADIES FREE SKATING       6
                                                JUNIOR LADIES SHORT PROGRAM      6
                                                JUNIOR MEN FREE SKATING          6
                                                JUNIOR MEN SHORT PROGRAM         6
                                                JUNIOR PAIRS FREE SKATING        6
                                                JUNIOR PAIRS SHORT PROGRAM       6
                                                LADIES FREE SKATING              6
          

In [7]:
# Ordino data secondo il rank in modo rescente:
data = data.sort_values(by=['rank'], ascending=True)
# Create a new column with the inverse of the rank:
data['rank_inv'] = 1/data['rank']
data.head(1)

Unnamed: 0,index,event,program,performance_id,rank,total_segment_score,total_element_score,total_component_score,category,rank_inv
0,0,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,1,78.92,42.0,36.92,Ladies short,1.0


In [9]:
# Aggiungi colonna con il peso dei component_score e element_score per ogni performance_id in percentuale rispetto al totale total_segment_score:
data['weight_elem'] = data['total_element_score']/data['total_segment_score']
data['weight_comp'] = data['total_component_score']/data['total_segment_score']
data.head()

Unnamed: 0,index,event,program,performance_id,rank,total_segment_score,total_element_score,total_component_score,category,rank_inv,weight_elem,weight_comp
0,0,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,1,78.92,42.0,36.92,Ladies short,1.0,0.532184,0.467816
837,103731,Grand Prix Final 2017 Senior and Junior,LADIES FREE SKATING,5e8023f5f0,1,147.03,76.61,70.42,Ladies free,1.0,0.52105,0.47895
805,100830,ISU World Figure Skating Championships 2017,ICE DANCE SHORT DANCE,8e7983756f,1,82.43,43.74,38.69,Ice dance short,1.0,0.530632,0.469368
793,99910,ISU GP Rostelecom Cup 2017,ICE DANCE SHORT DANCE,43f5771db8,1,77.3,40.23,37.07,Ice dance short,1.0,0.52044,0.47956
782,98280,ISU GP Audi Cup of China 2016,MEN FREE SKATING,58a25fecce,1,196.31,104.31,93.0,Men free,1.0,0.531353,0.473741


Raggruppo i dati per categoria e rank prendendo la media dei pesi degli elementi e dei pesi dei componenti

In [11]:
# Raggruppo i dati per categoria e rank prendendo la media dei pesi degli elementi e dei pesi dei componenti:
medie = data.groupby(['category', 'rank'])['weight_elem', 'weight_comp'].mean().reset_index()
# Ordino data secondo il rank in modo crescente:
medie = medie.sort_values(by=['rank'], ascending=True)
# Create a new column with the inverse of the rank:
medie['rank_inv'] = 1/medie['rank']
medie.colnames = ['category', 'rank', 'weight_elem', 'weight_comp', 'rank_inv']
medie.head()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



Unnamed: 0,category,rank,weight_elem,weight_comp,rank_inv
0,Ice dance free,1,0.500918,0.499082,1.0
52,Ladies free,1,0.510249,0.491218,1.0
189,Pairs short,1,0.545739,0.454937,1.0
76,Ladies short,1,0.532529,0.467471,1.0
173,Pairs free,1,0.501095,0.501993,1.0


Dal momento che il **rank** varia da 1 a 37 e differisce a seconda dell'evento e del programma, ho deciso di creare una nuova metrica **rank level** che riassuma qualitativamente il livello di classifica: sarà un valore che varia da low a high.

In [14]:
# Calcola quante categorie presenta la colonna 'rank' in 'data' per ogni 'event' e 'program':
ranking.head(20) #Series object

event                                           program                     
Grand Prix Final 2017 Senior and Junior         ICE DANCE FREE DANCE             6
                                                ICE DANCE SHORT DANCE            6
                                                JUNIOR ICE DANCE FREE DANCE      6
                                                JUNIOR ICE DANCE SHORT DANCE     6
                                                JUNIOR LADIES FREE SKATING       6
                                                JUNIOR LADIES SHORT PROGRAM      6
                                                JUNIOR MEN FREE SKATING          6
                                                JUNIOR MEN SHORT PROGRAM         6
                                                JUNIOR PAIRS FREE SKATING        6
                                                JUNIOR PAIRS SHORT PROGRAM       6
                                                LADIES FREE SKATING              6
          

In [15]:
# get info about the ranking:
ranking.describe() # min is 5 max is 37 of number of participants in all events of all programs

count    152.000000
mean      11.355263
std        6.493408
min        5.000000
25%        7.000000
50%       10.000000
75%       12.000000
max       37.000000
Name: rank, dtype: float64

In [16]:
# Crea una nuova colonna in 'data' con il numero di partecipanti per ogni 'program' di ogni 'event' usando la serie 'ranking':
data['n_participants'] = data.apply(lambda x: ranking.loc[(x['event'], x['program'])], axis=1)
data.head(1)

# Crea una colonna con il rank percentuale:
data['rank_perc'] = data['rank']/data['n_participants']

# Crea una colonna con il rank percentuale mappato in una variabile categorica:
data['rank_perc_cat'] = data['rank_perc'].apply(lambda x: 'high' if x <= 0.20 else ('mid-high' if x <= 0.4 else ('medium' if x <= 0.60 else ('mid-low' if x <= 0.8 else 'low'))))

# Creo il dataset medie2 coi dati dei valori medi delle colonne 'weight_elem', 'weight_comp' raggruppati secondo 'category' e 'rank_perc_cat':
medie2 = data.groupby(['category', 'rank_perc_cat'])['weight_elem', 'weight_comp'].mean().reset_index()

# Aggiungo una colonna 'rank_perc' a medie2 matchando con la colonna 'rank_perc' di data:
medie2['rank_perc'] = medie2.apply(lambda x: data.loc[(data['category'] == x['category']) & (data['rank_perc_cat'] == x['rank_perc_cat']), 'rank_perc'].iloc[0], axis=1)
# Ordino medie2 secondo il rank percentuale in modo crescente:
medie2 = medie2.sort_values(by=['rank_perc'], ascending=True)

medie2.head()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,category,rank_perc_cat,weight_elem,weight_comp,rank_perc
15,Ladies short,high,0.534455,0.465871,0.029412
5,Ice dance short,high,0.51512,0.48521,0.03125
25,Men short,high,0.542448,0.459869,0.090909
20,Men free,high,0.528909,0.474816,0.1
0,Ice dance free,high,0.500477,0.499775,0.111111


In [17]:
palette = {'Ladies short':'#F833FA',
            'Ladies free': '#F833FA',
            'Men short':'#3399FA',
            'Men free':'#3399FA',
            'Pairs short':'#FA9433',
            'Pairs free':'#FA9433',
            'Ice dance short':'#35FA33',
            'Ice dance free':'#35FA33'}

In [18]:
scatter_medie2 = px.scatter(
    medie2, x="weight_elem", y="rank_perc_cat",
    symbol="category", #size="rank_inv", 
    color="category",
    # Assign color to categories using list_colors:
    color_discrete_map=palette,
    # Choose marker type according to category: 
    symbol_sequence=['circle','star','circle','star','circle','star','circle', 'star'],
    title='Rank: components vs elements score for each performance',
    log_x=False,
    labels={
        "category": "Category",  
        "rank_perc_cat": "Rank", 
        "weight_elem": "Weight of elements",
        "rank_perc": "Rank percentage"
    },
    hover_data={"rank_perc":True},
    category_orders={ # replaces default order by column name
        "category": ['Ladies short','Ladies free','Men short','Men free','Pairs short','Pairs free','Ice dance short', 'Ice dance free']
    },
    # Add picture 'img.png' as background:
    
    #template="plotly_dark",
    width=1000,
    height=600
)

scatter_medie2.update_traces(
    marker=dict(line=dict(width=0.5, color='DarkSlateGrey'), size=18),
    selector=dict(mode="markers"),  
)
scatter_medie2['layout']['yaxis']['autorange'] = "reversed"

scatter_medie2.layout.xaxis.tickformat = '0%'
# #Aggiungo y = x
scatter_medie2.add_vline(x=0.5, line_width=2, line_dash="dash", line_color="black", opacity=1, name="equilibrium")

scatter_medie2.show()

## 2. Final plot

In [21]:
bubble = px.scatter(
    medie2, x="weight_elem", y="rank_perc_cat",
    symbol="category",
    color="category",
    # Assign color to categories using list_colors:
    color_discrete_map=palette,
    # Choose marker type according to category: 
    symbol_sequence=['circle','star-diamond','circle','star-diamond','circle','star-diamond','circle', 'star-diamond'],
    # title='Rank: components vs elements score for each performance',
    log_x=False,
    labels={
        "category": "Category",  
        "rank_perc_cat": "Rank", 
        "weight_elem": "Elements' weight",
        "rank_perc": "Rank percentage"
    },
    hover_data={"rank_perc":True},
    category_orders={ # replaces default order by column name
        "category": ['Ladies short','Ladies free','Men short','Men free','Pairs short','Pairs free','Ice dance short', 'Ice dance free']
    }
)
bubble['layout']['yaxis']['autorange'] = "reversed"
bubble.layout.xaxis.tickformat = '0%'
bubble.update_traces(
    marker=dict(line=dict(width=0.5, color='DarkSlateGrey'), size=25),
    selector=dict(mode="markers"),  
)
bubble.add_vline(x=0.5, line_width=2, line_dash="dash", line_color="black", opacity=1, name="equilibrium")

#general features
bubble.update_layout(width=1315,
                     height=700,
                     plot_bgcolor='#E9F1FA',
                     paper_bgcolor='#E9F1FA',
                     
                     #margins
                     margin=dict(l=100, r=150, t=120, b=100),
                     
                     #legend
                     legend=dict(yanchor="bottom",
                                 xanchor="right",
                                 #orientation="h",
                                 # change title of legend:
                                 title_text="",
                                 y=0.575,
                                 x=1.11),
                     
                     #title
                     title={'text': "<b>Are elements more important than components in establishing the final ranking?</b>",
                             'y':0.97,
                             'x':0.5,
                             'xanchor': 'center',
                             'yanchor': 'top'},
                     title_font={'family': 'DejaVu Sans',
                                  'size': 24,
                                  'color': '#543810'},
                     
                     #general font
                     font={'family': 'DejaVu Sans',
                           'size': 14,
                           'color': '#543810'},
                    #change range x axis:
                    #xaxis_range=[0.47,0.56],
                    #change axis title font:
                    xaxis_title_font=dict(family='DejaVu Sans', size=17, color='#543810'),
                    yaxis_title="")
# sottotitolo riga 1
bubble.add_annotation(text = ("The performances in each event have been divided by rank level (low-high)."),
                              showarrow=False,
                              x = 0, 
                              y = -0.14, 
                              xref='paper', 
                              yref='paper', 
                              xanchor='center', 
                              yanchor='top', 
                              xshift=565, 
                              yshift=620, 
                              font=dict(size=18, color="#543810", family='DejaVu Sans'), 
                              align="center")
# sottotitolo riga 2
bubble.add_annotation(text = ("For each rank level and type of program, the average weight of the elements in the final score is shown on the x-axis."),
                              showarrow=False,
                              x = 0, 
                              y = -0.14, 
                              xref='paper', 
                              yref='paper', 
                              xanchor='center', 
                              yanchor='top', 
                              xshift=565, 
                              yshift=600, 
                              font=dict(size=18, color="#543810", family='DejaVu Sans'), 
                              align="center")

# aggiungi autore e origine dati
bubble.add_annotation(text = "data source: https://github.com/BuzzFeedNews/2018-02-figure-skating-analysis",
                      showarrow=False,
                      x = 0, 
                      y = -0.15, 
                      xref='paper', 
                      yref='paper', 
                      xanchor='left', 
                      yanchor='bottom', 
                      xshift=-1, 
                      yshift=-5, 
                      font=dict(size=10, color="#543810", family='DejaVu Sans'), 
                      align="left")
bubble.add_annotation(text = "plot author: Valeria Insogna",
                      showarrow=False,
                      x = 0, 
                      y = -0.18, 
                      xref='paper', 
                      yref='paper', 
                      xanchor='left', 
                      yanchor='bottom', 
                      xshift=-1, 
                      yshift=-5, 
                      font=dict(size=10, color="#543810", family='DejaVu Sans'), 
                      align="left")
#titolo asse y riga 1
bubble.add_annotation(text = "Rank",
                   showarrow=False,
                   x = 0, 
                   y = -0.15, 
                   xref='paper', 
                   yref='paper', 
                   xanchor='left', 
                   yanchor='top', 
                   xshift=-44, 
                   yshift=555, 
                   font=dict(size=16, color="#543810", family='DejaVu Sans'), 
                   align="left")