# Quali sono gli elementi più penalizzanti?

## 1. Importare il dataset

Penalties are given for each performance, and they come from the `performances.csv` file, in the `total_deductions` column.

In [1]:
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import plotly.express as px
import seaborn as sns
%matplotlib inline

In [2]:
# Load the full scores table; judging by the preview it appears to hold one
# row per (aspect, judge) score, with the performance totals repeated on each row.
data = pd.read_csv("final.csv")
print(data.shape)  # (rows, columns)
data.head(n=5)

(214491, 19)


Unnamed: 0,competition,program,performance_id,athlete_name,athlete_nation,rank,total_segment_score,total_element_score,total_component_score,total_deductions,aspect_id,aspect_desc,element_order,element_base_value,section,aspect_final_score,judge_name,judge_nation,judge_score
0,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,Evgenia MEDVEDEVA,RUS,1,78.92,42.0,36.92,0.0,00034b9414,Transitions,,,components,9.07,Agita ABELE,LAT,9.0
1,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,Evgenia MEDVEDEVA,RUS,1,78.92,42.0,36.92,0.0,0127af74a7,Skating Skills,,,components,9.14,Agita ABELE,LAT,9.25
2,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,Evgenia MEDVEDEVA,RUS,1,78.92,42.0,36.92,0.0,38bda018f5,3Lo,4.0,5.61,elements,7.21,Agita ABELE,LAT,3.0
3,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,Evgenia MEDVEDEVA,RUS,1,78.92,42.0,36.92,0.0,50366b5d60,3F+3T,3.0,10.56,elements,12.16,Agita ABELE,LAT,3.0
4,ISU European Figure Skating Championships 2017,LADIES SHORT PROGRAM,b639d77459,Evgenia MEDVEDEVA,RUS,1,78.92,42.0,36.92,0.0,6e36e62b05,CCoSp4,6.0,3.5,elements,4.5,Agita ABELE,LAT,2.0


However, `data` has one row per `aspect_id` and `judge_name` rather than one row per `performance_id`, so we need to group by `performance_id` to get the penalty deduction given to each performance.

Let's check whether the deductions in the `performances.csv` file also vary in the range $[0, 9]$.

In [3]:
# Load the performance-level table and look at how the deduction points
# are distributed across performances.
performances = pd.read_csv("performances.csv")
print(performances.shape)  # (rows, columns)
deductions = performances['total_deductions']
print(deductions.value_counts())                # absolute frequencies
print(deductions.value_counts(normalize=True))  # relative frequencies

(1726, 11)
0.0    1208
1.0     401
2.0      98
3.0       8
4.0       8
6.0       2
9.0       1
Name: total_deductions, dtype: int64
0.0    0.699884
1.0    0.232329
2.0    0.056779
3.0    0.004635
4.0    0.004635
6.0    0.001159
9.0    0.000579
Name: total_deductions, dtype: float64


Hence:
- 70% of `performances` have no penalty deduction
- 23% of `performances` have a penalty deduction of 1
- 6% of `performances` have a penalty deduction of 2
- about 1% of `performances` have a penalty deduction of 3 or more

## 2. Distribuzione delle performance penalizzate

Now let's group `data` by performance, since we want to know which performances are penalized the most.

In [4]:
# Build the per-performance deduction table for the penalised performances.
#
# Keep only rows belonging to performances that received a deduction, and drop
# the rows with section == 'components' so that only element rows remain.
has_deduction = data['total_deductions'] != 0
is_not_component = data['section'] != 'components'
penalised_elements = data[has_deduction & is_not_component].sort_values(
    by='total_deductions', ascending=False
)

# NOTE: `df` is a DataFrameGroupBy object, NOT a DataFrame. Later cells call
# df.head() and df.get_group(...) on it, so the name and type are kept.
df = penalised_elements.groupby('performance_id')

# One row per performance. The rows were sorted by total_deductions in
# descending order before grouping, so "the first value of each group" is
# exactly the group maximum -- no lambda or hand-built DataFrame is needed.
df1 = df['total_deductions'].max().reset_index()
df1 = df1.sort_values(by='total_deductions', ascending=False)
df1.shape  # (number of penalised performances, 2)

(518, 2)

In [5]:
# Bar chart of the deduction points of every penalised performance, worst
# first (e3937497cb comes first with 9 points of deduction).
# The counts shown in the subtitle are computed from the data so that they
# cannot drift away from what is actually plotted.
n_penalised = len(df1)
n_performances = len(performances)
bar_df1 = px.bar(
    df1, x='performance_id', y='total_deductions',
    title=(
        'Penalty deduction for each performance <br>'
        f'<sup> {n_penalised}/{n_performances} performance with penalty>0</sup>'
    ),
    labels={'performance_id': 'Performance ID', 'total_deductions': 'Deduction points'}
    )
bar_df1.show()

In [6]:
# No need to run this check more than once (once is enough).

# Cross-check: build the same bar chart from the performances.csv file.
# Plot data
# dperf = performances[performances['total_deductions'] != 0]
# dperf = dperf.sort_values(by='total_deductions', ascending=False)
# bar_dperf = px.bar(dperf, x='performance_id', y='total_deductions', title='Penalty deduction for each performance')
# bar_dperf.show()#SAME

In [7]:
# Count how many penalised performances received each deduction value.
df3 = (
    df1.groupby('total_deductions')['performance_id']
    .count()
    .rename('performance_count')
    .reset_index()
)

# Share of each deduction value among the penalised performances; the counts
# in the subtitle are derived from the data instead of being typed by hand.
pie_df3 = px.pie(
    df3, values='performance_count', names='total_deductions',
    title=(
        'Penalty deduction points across performances with penalty>0 <br>'
        f'<sup> {len(df1)}/{len(performances)} in tot </sup>'
    ),
)
pie_df3.show()

In [8]:
# Pie chart over ALL performances, i.e. WITHOUT dropping those whose
# total_deductions is 0.
#
# Deduction of each performance: taking the first value after a descending
# sort (the previous approach) is the same as taking the per-group maximum.
deductions_all = (
    data.groupby('performance_id')['total_deductions']
    .max()
    .reset_index()
)

# Number of performances for each deduction value.
df2 = (
    deductions_all.groupby('total_deductions')['performance_id']
    .count()
    .rename('performance_count')
    .reset_index()
)

# Total shown in the subtitle = number of performances actually plotted.
n_performances = df2['performance_count'].sum()
pie_df2 = px.pie(
    df2, values='performance_count', names='total_deductions',
    title=(
        'Penalty deduction points across all performances <br>'
        f'<sup> {n_performances} in tot</sup>'
    ),
)
pie_df2.show()

## 3. Elementi nelle performance penalizzate

In [9]:
# Preview the penalised rows, most penalised first. `df` is a GroupBy object,
# so head() returns the first 5 rows of EVERY performance, not 5 rows overall.
df.head(n=5)

Unnamed: 0,competition,program,performance_id,athlete_name,athlete_nation,rank,total_segment_score,total_element_score,total_component_score,total_deductions,aspect_id,aspect_desc,element_order,element_base_value,section,aspect_final_score,judge_name,judge_nation,judge_score
126786,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,fc9e599aa6,3F,6.0,5.83,elements,3.73,Steve WINKLER,USA,-3.0
125965,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,c546e0efe1,3Lz<,2.0,4.20,elements,2.10,Masako KUBOTA,JPN,-3.0
126980,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,6e397559ac,LSp3,12.0,2.40,elements,2.40,Richard KOSINA,CZE,0.0
126981,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,7d762897e0,2S,9.0,1.43,elements,1.43,Richard KOSINA,CZE,0.0
126983,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,b10cbc16ad,3F<<+REP,7.0,1.46,elements,0.56,Richard KOSINA,CZE,-3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75258,ISU GP Trophee de France 2016,PAIRS SHORT PROGRAM,ccb53dcd67,Miriam ZIEGLER / Severin KIEFER,AUT,6,52.06,27.60,25.46,1.0,d49e8dcf31,StSq3,7.0,3.30,elements,3.44,Daniel DELFA,ESP,0.0
75275,ISU GP Trophee de France 2016,PAIRS SHORT PROGRAM,007e8ef343,Marissa CASTELLI / Mervin TRAN,USA,5,59.26,31.08,29.18,1.0,76de777565,3Tw2,1.0,5.80,elements,5.60,Daniel DELFA,ESP,0.0
75276,ISU GP Trophee de France 2016,PAIRS SHORT PROGRAM,007e8ef343,Marissa CASTELLI / Mervin TRAN,USA,5,59.26,31.08,29.18,1.0,7828988c29,5RLi4,6.0,7.50,elements,8.30,Daniel DELFA,ESP,1.0
75278,ISU GP Trophee de France 2016,PAIRS SHORT PROGRAM,007e8ef343,Marissa CASTELLI / Mervin TRAN,USA,5,59.26,31.08,29.18,1.0,92913001ab,3S,2.0,4.40,elements,3.00,Daniel DELFA,ESP,-2.0


In [10]:
# All element rows of the performance with the 9-point deduction.
worst_performance_id = 'e3937497cb'
df.get_group(worst_performance_id)  # 108 rows × 19 columns

Unnamed: 0,competition,program,performance_id,athlete_name,athlete_nation,rank,total_segment_score,total_element_score,total_component_score,total_deductions,aspect_id,aspect_desc,element_order,element_base_value,section,aspect_final_score,judge_name,judge_nation,judge_score
126786,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,fc9e599aa6,3F,6.0,5.83,elements,3.73,Steve WINKLER,USA,-3.0
125965,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,c546e0efe1,3Lz<,2.0,4.20,elements,2.10,Masako KUBOTA,JPN,-3.0
126980,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,6e397559ac,LSp3,12.0,2.40,elements,2.40,Richard KOSINA,CZE,0.0
126981,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,7d762897e0,2S,9.0,1.43,elements,1.43,Richard KOSINA,CZE,0.0
126983,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,b10cbc16ad,3F<<+REP,7.0,1.46,elements,0.56,Richard KOSINA,CZE,-3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126378,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,fc9e599aa6,3F,6.0,5.83,elements,3.73,Lolita LABUNSKAIYA,RUS,-3.0
126375,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,f20a1f8a9d,1A,3.0,1.10,elements,1.10,Lolita LABUNSKAIYA,RUS,0.0
126374,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,cd76288d91,1A+2T,10.0,2.64,elements,2.33,Lolita LABUNSKAIYA,RUS,-2.0
126373,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,c546e0efe1,3Lz<,2.0,4.20,elements,2.10,Lolita LABUNSKAIYA,RUS,-3.0


### 3.1. Distribuzione base_value

Voglio indagare come sono distribuiti i base_value in modo da capire se impostare un threshold per eliminare i base_value troppo bassi dal dataframe df.

In [11]:
# Ten most penalised performances (e3937497cb is first, with 9 points of deduction).
df1.iloc[:10]

Unnamed: 0,performance_id,total_deductions
452,e3937497cb,9.0
391,c5dcc99de5,6.0
319,9fbf9b7df2,6.0
451,e37a1094f2,4.0
503,fb902cf344,4.0
467,eb80e313e9,4.0
108,36ee135925,4.0
484,f689dd7cff,4.0
486,f6fc06bf02,4.0
224,709ab37e52,4.0


In [12]:
# IDs of the ten most penalised performances, in df1 order.
worst10 = df1['performance_id'].head(10).tolist()
worst10

['e3937497cb',
 'c5dcc99de5',
 '9fbf9b7df2',
 'e37a1094f2',
 'fb902cf344',
 'eb80e313e9',
 '36ee135925',
 'f689dd7cff',
 'f6fc06bf02',
 '709ab37e52']

In [13]:
# Element rows (section != 'components') of the penalised performances,
# sorted by deduction in descending order ...
penalised_element_mask = (data['total_deductions'] != 0) & (data['section'] != 'components')
df4 = data.loc[penalised_element_mask].sort_values(by='total_deductions', ascending=False)
# ... then restricted to the performances listed in worst10.
in_worst10 = df4['performance_id'].isin(worst10)
df4 = df4.loc[in_worst10]
df4.head()

Unnamed: 0,competition,program,performance_id,athlete_name,athlete_nation,rank,total_segment_score,total_element_score,total_component_score,total_deductions,aspect_id,aspect_desc,element_order,element_base_value,section,aspect_final_score,judge_name,judge_nation,judge_score
126786,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,fc9e599aa6,3F,6.0,5.83,elements,3.73,Steve WINKLER,USA,-3.0
125965,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,c546e0efe1,3Lz<,2.0,4.2,elements,2.1,Masako KUBOTA,JPN,-3.0
126980,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,6e397559ac,LSp3,12.0,2.4,elements,2.4,Richard KOSINA,CZE,0.0
126981,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,7d762897e0,2S,9.0,1.43,elements,1.43,Richard KOSINA,CZE,0.0
126983,ISU GP Trophee de France 2016,LADIES FREE SKATING,e3937497cb,Alena LEONOVA,RUS,12,77.49,33.56,52.93,9.0,b10cbc16ad,3F<<+REP,7.0,1.46,elements,0.56,Richard KOSINA,CZE,-3.0


In [14]:
# Distribution of element_base_value for the 10 most penalised performances,
# one colour per performance_id.
hist_df4 = px.histogram(
    data_frame=df4,
    x="element_base_value",
    color="performance_id",
    labels={'element_base_value': 'Base value', 'performance_id':'Performance ID'},
    title='Distribution of base values of the elements for the worst 10 performances',
)
hist_df4.show()

In [15]:
# Show the distribution of element_base_value for the 10 most penalised performance_id
# (disabled attempt at grouping by performance_id and element_base_value):
#df4 = df4.sort_values(by='element_base_value', ascending=False)
#df4 = df4.groupby(['performance_id','element_base_value'])
#df4 = df4.reset_index()
#df4 = pd.DataFrame({'performance_id':df4.groups.keys(), 'element_base_value':df4['element_base_value'].apply(lambda x: x.tolist()[0]).tolist()})
#df4.head()
#df4.first()

Paragoniamola con quella di tutte le performance:

In [16]:
# Element rows (section != 'components') of ALL penalised performances,
# sorted by deduction in descending order; used by the plots below.
penalised_element_mask = (data['total_deductions'] != 0) & (data['section'] != 'components')
df5 = data.loc[penalised_element_mask].sort_values(by='total_deductions', ascending=False)

# Histogram of element_base_value over all penalised performances (currently disabled):
# hist_df5 = px.histogram(
#     df5, x="element_base_value", #color="performance_id",
#      title='Distribution of base values of the elements for all penalized performances',
#      labels={'element_base_value': 'Base value', 'performance_id':'Performance ID'}
#      )
# hist_df5.show()

Sono simili $\implies$ non elimino nessun base_value

### 3.2. Distribuzione aspect_desc

In [17]:
# Distribution of aspect_desc for the 10 most penalised performances,
# one colour per performance_id.
hist2_df4 = px.histogram(
    data_frame=df4,
    x="aspect_desc",
    color="performance_id",
    labels={'aspect_desc': 'Element description', 'performance_id':'Performance ID'},
    title='Distribution of aspect_desc of the elements for the worst 10 performances',
)

# Hint: the bar order can be fixed with category_orders, e.g.
# px.histogram(df, x="day", category_orders=dict(day=["Thur", "Fri", "Sat", "Sun"]))
hist2_df4.show()

In [18]:
# Distribution of aspect_desc over all penalised performances
# (no per-performance colouring here: color="performance_id" is left out).
hist2_df5 = px.histogram(
    data_frame=df5,
    x="aspect_desc",
    labels={'aspect_desc': 'Element description', 'performance_id':'Performance ID'},
    title='Distribution of aspect_desc of the elements for all penalized performances',
)
hist2_df5.show()

### 3.3. Bubble chart of worst 10 performances and all

In [19]:
# Bubble chart for the 10 most penalised performances: element description
# on x, base value on y, bubble size = total deduction of the performance.
bubble_df4 = px.scatter(
    data_frame=df4,
    x="aspect_desc",
    y="element_base_value",
    size="total_deductions",
    size_max=30,
    color="performance_id",
    hover_name="aspect_desc",
    log_x=False,
    title='Worst 10 performance',
)
bubble_df4.show()

In [20]:
# Same bubble chart over all penalised performances: element description
# on x, base value on y, bubble size = total deduction of the performance.
bubble_df5 = px.scatter(
    data_frame=df5,
    x="aspect_desc",
    y="element_base_value",
    size="total_deductions",
    size_max=60,
    color="performance_id",
    hover_name="aspect_desc",
    log_x=False,
    title='All penalized performances',
)
bubble_df5.show()

## Questioni aperte

* dimensione bolla = media o contatore delle volte che l'elemento è presente in una performance penalizzata.
* colore è la categoria (ex program, new category + young).
* aspect_desc lascio a sequenza.
* total_deductions come sono stabilite.
+ visualizzazione