In [1]:
import pandas as pd
import plotly_express as px

In [7]:
# reading athlete_events as df1
df1 = pd.read_csv("../Data/athlete_events.csv")
#df1.head()

# reading noc_regions as df2, going to merge these
df2 = pd.read_csv("../Data/noc_regions.csv")
#df2.head()
#df1.head(1)

# merging both files into one on NOC
df = pd.merge(df1, df2, on="NOC", how="left")
df.head(2)

# cross country skiing only dataframe
xcs_df = df[df["Sport"] == "Cross Country Skiing"]
xcs_df.head()


# dropping 'ID', 'Name', 'Age', 'Height', 'Weight' and NaN medals
dropped_df = xcs_df.drop(columns=['ID','Name', 'Age', 'Height','Weight']).dropna(subset='Medal')

# dropping duplicates to seperate team events
# going to use 'reusable_df' as a basline dataframe
# 
reusable_df = dropped_df.drop_duplicates()

In [None]:
# TODO: delete this cell
dropped_df.count() # count = 766

In [None]:
# TODO: delete this cell
reusable_df.count() # count = 461

### Men relay 10 km 

In [17]:
# taking 'Cross Country Skiing Men's 4 x 10 kilometres Relay' only to save in a dataframe
df_men_relay = reusable_df[reusable_df["Event"] == "Cross Country Skiing Men's 4 x 10 kilometres Relay"]

# grouping by event + region. new column '0', counts medals. 
# going to rename this one after concat with other mens team event
df_men_relay = df_men_relay.groupby(["Event", "region"])[["Medal"]].value_counts().to_frame().reset_index()

#TODO: sort medals
df_men_relay.head(2)

Unnamed: 0,Event,region,Medal,0
0,Cross Country Skiing Men's 4 x 10 kilometres R...,Czech Republic,Bronze,2
1,Cross Country Skiing Men's 4 x 10 kilometres R...,Finland,Bronze,6


In [21]:
# same as above but for xcs men's team sprint
df_men_sprint = reusable_df[reusable_df["Event"] == "Cross Country Skiing Men's Team Sprint"]
df_men_sprint = df_men_sprint.groupby(["Event", "region"])[["Medal"]].value_counts().to_frame().reset_index()
df_men_sprint.head(2)

Unnamed: 0,Event,region,Medal,0
0,Cross Country Skiing Men's Team Sprint,Finland,Gold,1
1,Cross Country Skiing Men's Team Sprint,Germany,Silver,1


### concating df_men_relay and df_men_sprint

In [24]:
frames = [df_men_sprint, df_men_relay]
concat_men_df = pd.concat(frames)
# This df consists of df_men_sprint AND df_men_relay
concat_men_df.head(9)

Unnamed: 0,Event,region,Medal,0
0,Cross Country Skiing Men's Team Sprint,Finland,Gold,1
1,Cross Country Skiing Men's Team Sprint,Germany,Silver,1
2,Cross Country Skiing Men's Team Sprint,Norway,Gold,1
3,Cross Country Skiing Men's Team Sprint,Norway,Silver,1
4,Cross Country Skiing Men's Team Sprint,Russia,Bronze,2
5,Cross Country Skiing Men's Team Sprint,Russia,Silver,1
6,Cross Country Skiing Men's Team Sprint,Sweden,Bronze,1
7,Cross Country Skiing Men's Team Sprint,Sweden,Gold,1
0,Cross Country Skiing Men's 4 x 10 kilometres R...,Czech Republic,Bronze,2


## sorting medals to get a nicer plot
* and renaming column '0' -> 'Amount'

In [25]:
concat_men_df = concat_men_df.rename({0:'Amount'}, axis=1)

# found this on stackoverflow:
# https://stackoverflow.com/questions/52784410/sort-column-in-pandas-dataframe-by-specific-order

concat_men_df.Medal = pd.Categorical(concat_men_df.Medal,categories=['Bronze', 'Silver', 'Gold'])
concat_men_df = concat_men_df.sort_values('Medal')
concat_men_df

Unnamed: 0,Event,region,Medal,Amount
18,Cross Country Skiing Men's 4 x 10 kilometres R...,Switzerland,Bronze,1
16,Cross Country Skiing Men's 4 x 10 kilometres R...,Sweden,Bronze,4
13,Cross Country Skiing Men's 4 x 10 kilometres R...,Russia,Bronze,3
4,Cross Country Skiing Men's Team Sprint,Russia,Bronze,2
11,Cross Country Skiing Men's 4 x 10 kilometres R...,Norway,Bronze,1
6,Cross Country Skiing Men's Team Sprint,Sweden,Bronze,1
0,Cross Country Skiing Men's 4 x 10 kilometres R...,Czech Republic,Bronze,2
1,Cross Country Skiing Men's 4 x 10 kilometres R...,Finland,Bronze,6
6,Cross Country Skiing Men's 4 x 10 kilometres R...,Germany,Bronze,1
4,Cross Country Skiing Men's 4 x 10 kilometres R...,France,Bronze,1


## plotting the concated df

In [29]:
fig = px.histogram(
    concat_men_df,
    x="region",
    y="Amount",
    color="Medal",
    labels={"Sport": "Sport", "0": "medals", "region": "Country"},
    barmode="group",
    title="Men's team cross country skiing medals",
    text_auto = True,
    #pattern_shape = "Medal", pattern_shape_sequence=["-", "x", "+"] # creates a pattern on each bar. kinda ugly
    color_discrete_sequence=[px.colors.qualitative.Dark2[6],px.colors.qualitative.Dark2[7],px.colors.qualitative.Dark2[5]]
)
# color_bars = {"Silver": "silver", "Bronze": "bronze", "Gold": "gold"}
# fig.update_traces(marker_color=color_bars)
fig.show()

In [30]:
#TODO: Create all 3 plots
#TODO: subplots with px 3x1. Visa enbart länder med 
#TODO: Live adjustable subplot-width?

Cross Country Skiing Women's 3 x 5 kilometres Relay

In [31]:
df_women_relay = reusable_df[reusable_df["Event"] == "Cross Country Skiing Women's 3 x 5 kilometres Relay"]
df_women_relay = df_women_relay.drop(columns=['ID','Name', 'Age', 'Height','Weight']).dropna(subset='Medal').drop_duplicates()

Unnamed: 0,Sex,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,region,notes
1210,F,Soviet Union,URS,1968 Winter,1968,Winter,Grenoble,Cross Country Skiing,Cross Country Skiing Women's 3 x 5 kilometres ...,Bronze,Russia,
3940,M,Finland,FIN,1960 Winter,1960,Winter,Squaw Valley,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,Gold,Finland,
3969,M,Italy,ITA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 10 kilometres,Silver,Italy,
3972,M,Italy,ITA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,Silver,Italy,
3973,M,Italy,ITA,1994 Winter,1994,Winter,Lillehammer,Cross Country Skiing,Cross Country Skiing Men's 10 kilometres,Bronze,Italy,
