# Quali sono gli elementi più penalizzanti?

## 1. Importare il dataset

Penalties are given for each performance and are stored in the `performances.csv` file under the column `total_deductions`.

In [1]:
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import plotly.express as px
import seaborn as sns
%matplotlib inline

In [2]:
# Load final.csv into `data`, print its shape and preview the first record.
data = pd.read_csv("final.csv")
print(data.shape) # (rows, cols); the saved run printed (214491, 19)
data.head(1)

(214491, 19)


Unnamed: 0,competition,program,performance_id,athlete_name,athlete_nation,rank,total_segment_score,total_element_score,total_component_score,total_deductions,aspect_id,aspect_desc,element_order,element_base_value,section,aspect_final_score,judge_name,judge_nation,judge_score
0,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,Evgenia MEDVEDEVA,RUS,1,78.92,42.0,36.92,0.0,00034b9414,Transitions,,,components,9.07,Agita ABELE,LAT,9.0


In [3]:
data["aspect_desc"].value_counts() # There are 635 distinct aspect_desc values, components included (see Length in the output).

Performance                    15526
Composition                    15526
Transitions                    15526
Skating Skills                 15526
Interpretation of the Music    11782
                               ...  
5SLi1                              9
3F+1T*                             9
3F+3T+2Lo                          9
FoDs3                              9
3S<+2T                             9
Name: aspect_desc, Length: 635, dtype: int64

But `data` has one row per `aspect_id` and `judge_name`, not one row per `performance_id`, so we need to group by `performance_id` to get the penalty deduction of each performance.

Let's check whether the deductions in the `performances.csv` file also vary in the range $[0, 9]$.

In [4]:
# Load performances.csv and look at how total_deductions is distributed across its rows.
performances = pd.read_csv("performances.csv")
print(performances.shape) # (rows, cols); the saved run printed (1726, 11)
print(performances['total_deductions'].value_counts()) # absolute count of rows per deduction value
print(performances['total_deductions'].value_counts(normalize=True)) # the same counts as fractions of the total

(1726, 11)
0.0    1208
1.0     401
2.0      98
3.0       8
4.0       8
6.0       2
9.0       1
Name: total_deductions, dtype: int64
0.0    0.699884
1.0    0.232329
2.0    0.056779
3.0    0.004635
4.0    0.004635
6.0    0.001159
9.0    0.000579
Name: total_deductions, dtype: float64


Hence:
- 70% of `performances` have no penalty deduction
- 23% of `performances` have a penalty deduction of 1
- 6% of `performances` have a penalty deduction of 2
- about 1% of `performances` have a penalty deduction of 3 or more

## 2. Distribuzione delle performance penalizzate

Now let's group `data` for each performance since we want to know which performance is penalized more.

In [5]:
# Keep only the rows whose section is not 'components' and order them by
# total_deductions, highest first. Rows with total_deductions == 0 are NOT
# removed in this cell.
is_component = data['section'] == 'components'
df = data.loc[~is_component].sort_values('total_deductions', ascending=False)
# Frequency of every remaining aspect_desc value (the saved output reports 628 distinct values).
df['aspect_desc'].value_counts()

CCoSp4        6471
ChSq1         5364
2A            5265
StSq3         5228
3Lo           3591
              ... 
FCCoSp1          9
2A+2T+2Lo<       9
4T+SEQ+1T*       9
FoDs3            9
1CC3+kpYYN       9
Name: aspect_desc, Length: 628, dtype: int64

In [6]:
# Build, for the rows with a non-zero deduction:
#   * `df`  - those rows grouped by performance_id (a DataFrameGroupBy, NOT a DataFrame);
#   * `df1` - one row per performance with its total_deductions, highest first.

# `df` is rebound to a GroupBy at the end of this cell. Unwrapping it first
# (GroupBy.obj is the frame the GroupBy was built from) lets the cell be re-run
# without failing on the boolean filter below; on a plain DataFrame this is a no-op.
element_rows = getattr(df, 'obj', df)
element_rows = element_rows[element_rows['total_deductions'] != 0]  # drop rows with total_deductions == 0
df = element_rows.groupby('performance_id')

# One value per performance: the first non-null total_deductions of each group.
# NOTE(review): this assumes total_deductions is constant within a performance
# (the rows were sorted by it in descending order before grouping) - confirm.
df1 = (
    df['total_deductions']
    .first()
    .reset_index()
    .sort_values(by='total_deductions', ascending=False)
)
df1.shape  # the saved run shows (518, 2)

(518, 2)

In [7]:
# Bar chart with one bar per penalized performance (rows of df1).
# The counts shown in the subtitle are computed from the data instead of being
# typed in by hand, so they cannot drift when the input files change.
n_penalized = len(df1)              # performances with a non-zero deduction
n_performances = len(performances)  # rows of performances.csv
bar_df1 = px.bar(
    df1, x='performance_id', y='total_deductions',
    title=(
        'Penalty deduction for each performance <br><sup> '
        f'{n_penalized}/{n_performances} performance with penalty>0</sup>'
    ),
    labels={'performance_id': 'Performance ID', 'total_deductions': 'Deduction points'}
    )
bar_df1.show()
# In the saved run e3937497cb comes first, with a 9-point deduction.

In [8]:
#No need to check it multiple times (once is enough)

# Let's check if is the same for file performances.csv:
# Plot data
# dperf = performances[performances['total_deductions'] != 0]
# dperf = dperf.sort_values(by='total_deductions', ascending=False)
# bar_dperf = px.bar(dperf, x='performance_id', y='total_deductions', title='Penalty deduction for each performance')
# bar_dperf.show()#SAME

In [9]:
# Count how many penalized performances fall on each deduction value and show
# the split as a pie chart.
df3 = (
    df1.groupby('total_deductions')
    .count()          # counts the performance_id entries of every deduction value
    .reset_index()
)
df3.columns = ['total_deductions', 'performance_count']

# Subtitle counts are derived from the data rather than hard-coded.
n_penalized = len(df1)
n_performances = len(performances)
pie_df3 = px.pie(
    df3, values='performance_count', names='total_deductions',
    title=(
        'Penalty deduction points across performances with penalty>0 <br><sup> '
        f'{n_penalized}/{n_performances} in tot </sup>'
    ),
)
pie_df3.show()

In [10]:
# Same split as the previous pie chart, but over ALL performances: rows with
# total_deductions == 0 are kept.

# One value per performance. Sorting by total_deductions (descending, NaN last)
# before grouping means the first non-null value of a group is also its first row.
deduction_per_performance = (
    data.sort_values(by='total_deductions', ascending=False)
    .groupby('performance_id')['total_deductions']
    .first()
    .reset_index()
)

# Number of performances for every deduction value (groupby orders the keys itself,
# so no further sorting is needed before counting).
df2 = deduction_per_performance.groupby('total_deductions').count().reset_index()
df2.columns = ['total_deductions', 'performance_count']

# Total shown in the subtitle is computed from the counts being plotted.
n_performances = int(df2['performance_count'].sum())
pie_df2 = px.pie(
    df2, values='performance_count', names='total_deductions',
    title=f'Penalty deduction points across all performances <br><sup> {n_performances} in tot</sup>',
)
pie_df2.show()

## 3. Elementi nelle performance penalizzate

In [11]:
# Show the most penalized performances first.
# NOTE(review): `df` is presumably still the GroupBy built earlier, so head(1)
# returns the first row of every performance group rather than a single row - confirm.
df.head(1)

Unnamed: 0,competition,program,performance_id,athlete_name,athlete_nation,rank,total_segment_score,total_element_score,total_component_score,total_deductions,aspect_id,aspect_desc,element_order,element_base_value,section,aspect_final_score,judge_name,judge_nation,judge_score
126582,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,fc9e599aa6,3F,6.0,5.83,elements,3.73,Jodi ABBOTT,CAN,-3.0
116644,ISU GP Audi Cup of China 2017,MEN FREE SKATING,9fbf9b7df2,Alexander PETROV,RUS,12,117.44,56.04,67.40,6.0,91fdb760bc,StSq3,5.0,3.30,elements,4.09,Daniel DELFA,ESP,1.0
109315,ISU GP Rostelecom Cup 2016,LADIES FREE SKATING,c5dcc99de5,Julia LIPNITSKAIA,RUS,12,78.88,31.56,53.32,6.0,d859699146,StSq2,9.0,2.60,elements,3.10,Wendy ENZMANN,USA,1.0
53653,ISU Four Continents Championships 2017,LADIES FREE SKATING,f689dd7cff,Kaetlyn OSMOND,CAN,6,115.96,55.25,64.71,4.0,3184648959,3S<,8.0,3.41,elements,1.31,Peter RANKIN,AUS,-3.0
29788,ISU World Figure Skating Championships 2017,LADIES SHORT PROGRAM,e37a1094f2,Michaela-Lucie HANZLIKOVA,CZE,37,32.21,16.55,19.66,4.0,791053302c,CCoSp3,7.0,3.00,elements,3.00,Edith SCHILLER,AUT,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213416,ISU Grand Prix of Figure Skating Final 2016,JUNIOR ICE DANCE FREE DANCE,969057f97a,Anastasia SHPILEVAYA / Grigory SMIRNOV,RUS,6,81.35,39.24,43.11,1.0,f78a8f0649,RoLi4,5.0,4.50,elements,5.36,Nicole LEBLANC-RICHARD,CAN,1.0
182579,ISU Grand Prix of Figure Skating Final 2016,ICE DANCE SHORT DANCE,80c81a6755,Madison CHOCK / Evan BATES,USA,6,70.87,36.47,35.40,1.0,11b4d4bb27,CuLi4,1.0,4.50,elements,5.87,Kerstin KIMMINUS,GER,2.0
9200,ISU GP Audi Cup of China 2017,PAIRS FREE SKATING,98bf365e76,Ashley CAIN / Timothy LEDUC,USA,6,101.21,47.48,54.73,1.0,f866e516f2,3LzTh,3.0,5.50,elements,4.10,Sviatoslav BABENKO,RUS,-2.0
43148,ISU Four Continents Championships 2017,ICE DANCE SHORT DANCE,07da83d879,Kimberley HEW-LOW / Timothy MCKERNAN,AUS,15,33.54,16.97,17.57,1.0,fe060b5c18,PSt1,2.0,4.10,elements,3.60,Sharon ROGERS,USA,-1.0


In [12]:
# First row of the group of performance e3937497cb.
df.get_group('e3937497cb').head(1) # original note: the full group is 108 rows × 19 columns; only its first row is displayed

Unnamed: 0,competition,program,performance_id,athlete_name,athlete_nation,rank,total_segment_score,total_element_score,total_component_score,total_deductions,aspect_id,aspect_desc,element_order,element_base_value,section,aspect_final_score,judge_name,judge_nation,judge_score
126582,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,fc9e599aa6,3F,6.0,5.83,elements,3.73,Jodi ABBOTT,CAN,-3.0


### 3.1. Distribuzione base_value

Voglio indagare come sono distribuiti i base_value in modo da capire se impostare un threshold (soglia) per eliminare i base_value troppo bassi dal dataframe df.

In [13]:
df1.head(10) # first 10 rows of df1; in the saved output e3937497cb comes first with a 9-point deduction

Unnamed: 0,performance_id,total_deductions
452,e3937497cb,9.0
391,c5dcc99de5,6.0
319,9fbf9b7df2,6.0
451,e37a1094f2,4.0
503,fb902cf344,4.0
467,eb80e313e9,4.0
108,36ee135925,4.0
484,f689dd7cff,4.0
486,f6fc06bf02,4.0
224,709ab37e52,4.0


In [14]:
# performance_id of the first 10 rows of df1, as a plain Python list.
worst10 = df1['performance_id'].iloc[:10].tolist()
# (uncomment to inspect) worst10

In [15]:
# Rows of `data` with a non-zero deduction and a section other than 'components',
# ordered by total_deductions (highest first) ...
df4 = data[
    (data['total_deductions'] != 0) & (data['section'] != 'components')
].sort_values(by='total_deductions', ascending=False)
# ... then restricted to the performances listed in worst10.
in_worst10 = df4['performance_id'].isin(worst10)
df4 = df4[in_worst10]
df4.head(1)

Unnamed: 0,competition,program,performance_id,athlete_name,athlete_nation,rank,total_segment_score,total_element_score,total_component_score,total_deductions,aspect_id,aspect_desc,element_order,element_base_value,section,aspect_final_score,judge_name,judge_nation,judge_score
126786,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,fc9e599aa6,3F,6.0,5.83,elements,3.73,Steve WINKLER,USA,-3.0


In [16]:
# Histogram of element_base_value over the rows of df4 (the 10 most penalized
# performances), one colour per performance_id.
hist_df4 = px.histogram(
    data_frame=df4,
    x='element_base_value',
    color='performance_id',
    title='Distribution of base values of the elements for the worst 10 performances',
    labels=dict(element_base_value='Base value', performance_id='Performance ID'),
)
hist_df4.show()
# NB: df4 is sorted by total_deductions in descending order.

Paragoniamola con quella di tutte le performance:

In [17]:
# Element rows of ALL penalized performances: non-zero deduction, section other
# than 'components', ordered by total_deductions (highest first).
df5 = data.loc[
    (data['total_deductions'] != 0) & (data['section'] != 'components')
].sort_values('total_deductions', ascending=False)

# Histogram of element_base_value for all penalized performances (currently disabled):
# hist_df5 = px.histogram(
#     df5, x="element_base_value", #color="performance_id",
#      title='Distribution of base values of the elements for all penalized performances',
#      labels={'element_base_value': 'Base value', 'performance_id':'Performance ID'}
#      )
# hist_df5.show()

# How often each aspect_desc occurs among the penalized performances
# (the saved output lists 422 distinct values).
df5['aspect_desc'].value_counts()
# NB: df5 is sorted by total_deductions in descending order.

ChSq1         2448
CCoSp4        2106
StSq3         1791
2A            1521
3A            1377
              ... 
3Lz<+3T          9
3F+2T+2T         9
4S+2T+2Lo        9
1RH3+kpNYY       9
3F+2T+2Lo<       9
Name: aspect_desc, Length: 422, dtype: int64

Sono simili $\implies$ non elimino nessun base_value

### 3.2. Distribuzione aspect_desc

In [18]:
# Histogram of aspect_desc over the rows of df4 (the 10 most penalized
# performances), one colour per performance_id.
hist2_df4 = px.histogram(
    data_frame=df4,
    x='aspect_desc',
    color='performance_id',
    title='Distribution of aspect_desc of the elements for the worst 10 performances',
    labels=dict(aspect_desc='Element description', performance_id='Performance ID'),
)

# Left as a reminder of the category_orders argument:
#fig = px.histogram(df, x="day", category_orders=dict(day=["Thur", "Fri", "Sat", "Sun"]))
hist2_df4.show()
# NB: df4 is sorted by total_deductions in descending order.

In [19]:
# Histogram of aspect_desc over the rows of df5 (all penalized performances);
# no per-performance colouring here (color="performance_id" is deliberately not passed).
hist2_df5 = px.histogram(
    data_frame=df5,
    x='aspect_desc',
    title='Distribution of aspect_desc of the elements for all penalized performances',
    labels=dict(aspect_desc='Element description', performance_id='Performance ID'),
)
hist2_df5.show()
# NB: df5 is sorted by total_deductions in descending order.

Sembra evidente che le 10 performance più penalizzate abbiano solo pochi aspect_desc.

Avrebbe più senso **dividere il dataset per categoria** (ex program) e poi fare l'analisi.

### 3.3. Bubble chart of worst 10 performances and all

In [20]:
# Bubble chart of the rows of df4: aspect_desc on x, element_base_value on y,
# colour = performance_id, bubble size = total_deductions.
bubble_df4 = px.scatter(
    data_frame=df4,
    x='aspect_desc',
    y='element_base_value',
    color='performance_id',
    size='total_deductions',
    size_max=30,
    hover_name='aspect_desc',
    log_x=False,
    title='Worst 10 performance',
)
bubble_df4.show()
# NB: df4 is sorted by total_deductions in descending order.

Vediamo sull'intero dataset:

In [21]:
# Heavy to render and not very informative, because it summarises very little
# bubble_df5 = px.scatter(
#     df5, y="element_base_value", x="aspect_desc",
#     color="performance_id", size="total_deductions", 
#     title='All penalized performances',
#     hover_name="aspect_desc", log_x=False, size_max=60
#     )
# bubble_df5.show()
# # NB Sort df by total_deductions in descending order

## Questioni aperte

* dimensione bolla = media o contatore delle volte che l'elemento è presente in una performance penalizzata -> next section
* colore è la categoria (ex program, new category + young) -> next section
* aspect_desc lascio a sequenza -> ok
* total_deductions come sono stabilite -> vedi Teams
* visualizzazione -> next section

## 4. Summary per categorie

In [22]:
# Which program categories appear among the penalized performances?

# `df` is a DataFrameGroupBy at this point; take back the DataFrame it was built
# from. Going through GroupBy.apply with an identity function is avoided because
# its result depends on the pandas version (group keys may be added to the index,
# and operating on the grouping columns inside apply is deprecated in newer
# releases). If `df` is already a DataFrame (cell re-run) it is left untouched.
df = getattr(df, 'obj', df)
df['program'].value_counts() # 14 categories in the saved output; per the original note, JUNIOR MEN SHORT PROGRAM and JUNIOR ICE DANCE SHORT DANCE are missing

MEN FREE SKATING               13869
PAIRS FREE SKATING              7965
LADIES FREE SKATING             7911
MEN SHORT PROGRAM               4995
LADIES SHORT PROGRAM            3357
PAIRS SHORT PROGRAM             2936
ICE DANCE FREE DANCE            1908
ICE DANCE SHORT DANCE            891
JUNIOR MEN FREE SKATING          855
JUNIOR PAIRS FREE SKATING        576
JUNIOR LADIES FREE SKATING       396
JUNIOR PAIRS SHORT PROGRAM       252
JUNIOR LADIES SHORT PROGRAM      189
JUNIOR ICE DANCE FREE DANCE       63
Name: program, dtype: int64

Ricorda: ci sono 422 tipi di `aspect_desc` (ESCLUSI i components e le performance non penalizzate!)

In [23]:
# How many times does each element occur, per program, among the penalized performances?

# Each element is repeated once per judge score: keep a single row for every
# (aspect_id, performance_id) pair and drop the columns that are not needed below.
df6 = (
    df.drop_duplicates(subset=['aspect_id', 'performance_id'])
    .drop(columns=['judge_name', 'judge_nation', 'judge_score', 'section'])
)

# Number of de-duplicated rows for every (aspect_desc, program) pair,
# stored in the column 'n_aspect_program'.
perf_list = (
    df6.groupby(['aspect_desc', 'program'])
    .size()
    .reset_index(name='n_aspect_program')
)
perf_list



Unnamed: 0,aspect_desc,program,n_aspect_program
0,1A,LADIES FREE SKATING,4
1,1A,MEN FREE SKATING,17
2,1A+1A+SEQ,PAIRS FREE SKATING,1
3,1A+1Lo+2F,LADIES FREE SKATING,1
4,1A+2A+SEQ,PAIRS FREE SKATING,1
...,...,...,...
831,StSq4,MEN SHORT PROGRAM,19
832,StSq4,PAIRS SHORT PROGRAM,24
833,StSqB,PAIRS SHORT PROGRAM,1
834,StaLi4,ICE DANCE FREE DANCE,6


In [24]:
# Attach to every row of df6 the count computed in perf_list for its
# (aspect_desc, program) pair, then order the rows by that count, highest first.
# Dropping a pre-existing 'n_aspect_program' column first keeps the cell safe to
# re-run: a second merge would otherwise produce suffixed *_x / *_y columns and the
# sort below would fail with a KeyError.
df6 = (
    df6.drop(columns='n_aspect_program', errors='ignore')
    .merge(perf_list, on=['aspect_desc', 'program'], how='left')
    .sort_values(by='n_aspect_program', ascending=False)
)

In [25]:
# Bubble chart of df6: total_deductions on x, aspect_desc on y,
# colour = program, bubble size = n_aspect_program.
bubble_df6 = px.scatter(
    data_frame=df6,
    x='total_deductions',
    y='aspect_desc',
    color='program',
    size='n_aspect_program',
    size_max=40,
    hover_name='aspect_desc',
    log_x=False,
    title='All penalized performances',
).update_layout(height=1000)
bubble_df6.show()
# NB: df6 is expected to be sorted by n_aspect_program in descending order.

In [26]:
# Bubble chart of df6: element_base_value on x, aspect_desc on y,
# colour = program, bubble size = total_deductions.
# Rows are ordered by total_deductions (highest first) before plotting.
df6 = df6.sort_values(by='total_deductions', ascending=False)
bubble2_df6 = px.scatter(
    data_frame=df6,
    x='element_base_value',
    y='aspect_desc',
    color='program',
    size='total_deductions',
    size_max=60,
    hover_name='aspect_desc',
    log_x=False,
    title='All penalized performances',
).update_layout(height=1000)
bubble2_df6.show()

In [27]:
# Bubble chart of df6: total_deductions on x, element_base_value on y,
# colour = program, bubble size = n_aspect_program.
# Rows are ordered by n_aspect_program (highest first) before plotting.
df6 = df6.sort_values(by='n_aspect_program', ascending=False)
bubble2_df6 = px.scatter(
    data_frame=df6,
    x='total_deductions',
    y='element_base_value',
    color='program',
    size='n_aspect_program',
    size_max=60,
    hover_name='aspect_desc',
    log_x=False,
    title='All penalized performances',
).update_layout(height=1000)
bubble2_df6.show()

In [28]:
# Bubble chart of df6: element_base_value on x, aspect_desc on y,
# colour = program, bubble size = total_deductions.
# NOTE(review): same sort key and same chart arguments as the cell two positions
# above - looks like a duplicate; confirm whether it can be removed.
df6 = df6.sort_values(by='total_deductions', ascending=False)
bubble2_df6 = px.scatter(
    data_frame=df6,
    x='element_base_value',
    y='aspect_desc',
    color='program',
    size='total_deductions',
    size_max=60,
    hover_name='aspect_desc',
    log_x=False,
    title='All penalized performances',
).update_layout(height=1000)
bubble2_df6.show()

In [29]:
# Bubble chart of df6: element_base_value on x, aspect_desc on y,
# colour = program, bubble size = n_aspect_program.
# Rows are ordered by n_aspect_program (highest first) before plotting.
df6 = df6.sort_values(by='n_aspect_program', ascending=False)
bubble2_df6 = px.scatter(
    data_frame=df6,
    x='element_base_value',
    y='aspect_desc',
    color='program',
    size='n_aspect_program',
    size_max=60,
    hover_name='aspect_desc',
    log_x=False,
    title='All penalized performances',
).update_layout(height=1000)
bubble2_df6.show()

* Unisci le categorie (cecilia)
* Prova con tutto il dataset (performance non penalizzate) per paragonare