# Database

#### **Athlete Biography**

| Column | Type (value) | Pre-process |
| :-: | :-: | :-- |
| athlete_id      | int                ||
| name            | str                ||
| sex             | str (Male/Female)  ||
| born            | str (date)         | Standarize format (datetime, default 01/01/YYYY) |
| height          | int (cm)           ||
| weight          | int (kg)           ||
| country         | str (country_name) ||
| country_noc     | str (country_noc)  ||
| description     | str (info)         ||
| spetial_notes   | str (info)         ||

#### **Athlete Event Details**

| Column | Type (value) | Pre-process |
| :-: | :-: | :-- |
| edition     | str (year + event)      ||
| edition_id  | int                     ||
| country_noc | str (of the athlete)    ||
| sport       | str (of the athlete)    ||
| event       | str (within the sport)  ||
| result_id   | int (results)           ||
| athlete     | str (name)              ||


#### **Country Profiles**

| Column | Type (value) | Pre-process |
| :-: | :-: | :-- |
| noc       | str (3-letter word)   ||
| country   | str (name)            | Remove ROC |

#### **Event Results**

| Column | Type (value) | Pre-process |
| :-: | :-: | :-- |
| result_id             | int                               ||
| event_title           | str (title, sex)                  ||
| edition               | str (year + event)                ||
| edition_id            | int                               ||
| sport                 | str (of the event)                ||
| sport_url             | web (/editions/DD/sports/SSS)     ||
| result_date           | str (date + time)                 | Standarize format (datetime) |
| result_location       | str (loc info)                    | Standarize format (city, country) |
| result_participants   | str (number of people)            ||
| result_format         | str (description on how to win)   ||

#### **Games Summary**

| Column | Type (value) | Pre-process |
| :-: | :-: | :-- |
| edition           | str (year + event)    ||
| edition_id        | int                   ||
| edition_url       | web (/editiond/DD)    ||
| year              | int                   ||
| city              | str (name)            ||
| country_flag_url  | url (https://...)     ||
| country_noc       | str (3-letter word)   ||
| start date        | str (datetime)        | Standarize format (datetime)
| end_date          | str (datetime)        | Standarize format (datetime)
| competition_date  | str (datetimes)       | Standarize format (datetime)

#### **Medal Tally History**

| Column | Type (value) | Pre-process |
| :-: | :-: | :-- |
| edition           | str (year + event)    ||
| edition_id        | int                   ||
| year              | int                   ||
| country           | str (name)            ||
| country_noc       | str (3-letter word)   ||
| gold              | int                   ||
| silver            | int                   ||
| bronze            | int                   ||
| total             | int (sum of previous) ||

# Code

In [54]:
import pandas as pd
import numpy as np
from datetime import date



False

In [None]:
# Reading in Data

class DF():
    def __init__(self):
        try:
            self.df_athlete_biography       = pd.read_csv('./data/Olympic_Athlete_Biography.csv')
            self.df_athlete_event_details   = pd.read_csv('./data/Olympic_Athlete_Event_details.csv')
            self.df_country_profiles        = pd.read_csv('./data/Olympic_Country_Profiles.csv')
            self.df_event_results           = pd.read_csv('./data/Olympic_Event_Results.csv')
            self.df_games_summary           = pd.read_csv('./data/Olympic_Games_Summary.csv')
            self.df_medal_tally             = pd.read_csv('./data/Olympic_Medal_Tally_History.csv')
        except FileNotFoundError as error:
            print('File Not Found: ' + error.filename)
        else:
            self._preprocess()
    
    def _preprocess(self):
        # Preprocess born data from string to datetime ->  YYYY-MM-DD
        self.df_athlete_biography['born'] = self.df_athlete_biography['born'].apply(self.__find_date)

    def __find_date(self, txt):
        txt = str(txt)   # In case txt = 1920 (numeric)
        numbers = '0123456789'
        months  = ['january', 'february', 'march', 'april', 'may', 'june',
                'july', 'august', 'september', 'october', 'november', 'december']
        text    = list(txt)

        day = None
        month = None
        year = None

        # Find day and year
        prev_carac = ' '
        for i, carac in enumerate(text):
            if carac in numbers and prev_carac not in numbers: # Check for starting number
                try:
                    if text[i+1] in numbers:                # 12...
                        if text[i+2] not in numbers:        # 12 ...
                            day = int(''.join(text[i:i+2]))
                        elif text[i+3] in numbers:          # 1234...
                            year = int(''.join(text[i:i+4]))     # Keeps last year: "(1920 or 1921)" -> 1921
                    else:                                   # 1 ...
                        day = int(carac)
                except:
                    continue
            prev_carac = carac
        # Simple check for month
        for word in txt.split(' '):
            if word.lower() in months:
                month = months.index(word.lower())+1
        
        # Without year, there is no way of telling
        if year is None:
            return np.nan
        
        # Update incompleted dates:
        if day is None:
            day = 1
        if month is None:
            month = 1
        
        # Return representative values
        return date(year, month, day).isoformat()

data = DF()
data.df_athlete_biography

Unnamed: 0,athlete_id,name,sex,born,height,weight,country,country_noc,description,special_notes
0,65649,Ivanka Bonova,Female,1949-04-04,166.0,55,Bulgaria,BUL,Personal Best: 400 – 53.54 (1980).,
1,112510,Nataliya Uryadova,Female,1977-03-15,184.0,70,Russian Federation,RUS,,Listed in Olympians Who Won a Medal at the Eur...
2,114973,Essa Ismail Rashed,Male,1986-12-14,165.0,55,Qatar,QAT,Personal Best: 10000 – 27:20.97 (2006).,Listed in Olympians Who Won a Medal at the Asi...
3,30359,Péter Boros,Male,1908-01-12,,,Hungary,HUN,"Between 1927 and 1938, Péter Boros competed as...",
4,50557,Rudolf Piowatý,Male,1900-04-28,,,Czechoslovakia,TCH,Rudolf Piowaty joined the Czechoslovak militar...,
...,...,...,...,...,...,...,...,...,...,...
155856,23748,Todd Makler,Male,1946-01-08,183.0,75,United States,USA,"Todd Makler grew up in suburban Philadelphia, ...","Brother of Brooke Makler Son of Paul Makler, Sr."
155857,58581,Géza Hollósi,Male,1938-05-02,175.0,79,Hungary,HUN,Géza Hollósi had the following finishes at maj...,
155858,30387,József Keresztessy,Male,1885-09-19,,,Hungary,HUN,József Keresztessy was the grandson of József ...,
155859,69900,Alexander Thieme,Male,1954-01-13,187.0,84,East Germany,GDR,East German Sprinter Alexander Thieme reached ...,Listed in Olympians Who Won a Medal at the Eur...


In [None]:
def world_map_data(**kwargs):
    # in:      None. ¿Sport and Event selection?
    # out:     list(tuple(id, number of total medals))
    # default: All history

    # default algorithm (no filters, all history)
    

def list_of_sports(country_noc):
    # in:      str(Country_noc) | any identifyer of the country, to be defined
    # out:     list(Sports)     | Sports that the country have competed in
    # default: all sports
    pass

def ditr_corr(x, sport, *years):
    # in:      x, sport, year(s)
    # out:     age, weight, height
    # default: None
    pass

def get_age(athlete_id):
    pass

In [None]:
print(df_athlete_event_details.shape)

df_filtered = df_athlete_event_details[df_athlete_event_details["medal"].notna()]

print(df_filtered[df_filtered["medal"] == "Bronze"].shape)
print(df_filtered[df_filtered["medal"] == "Silver"].shape)
print(df_filtered[df_filtered["medal"] == "Gold"].shape)

# Some rows dont match but we dont really care we are gonna rely on medals
display(df_filtered[df_filtered["pos"] == "3"][df_filtered["medal"] != "Bronze"])
display(df_filtered[df_filtered["pos"] == "2"][df_filtered["medal"] != "Silver"])
display(df_filtered[df_filtered["pos"] == "1"][df_filtered["medal"] != "Gold"])



(316834, 11)
(14939, 11)
(14676, 11)
(15072, 11)


  display(df_filtered[df_filtered["pos"] == "3"][df_filtered["medal"] != "Bronze"])


Unnamed: 0,edition,edition_id,country_noc,sport,event,result_id,athlete,athlete_id,pos,medal,isTeamSport
284348,1912 Summer Olympics,6,SWE,Athletics,"Decathlon, Men",56970,Charles Lomberg,76283,3,Silver,False
296935,1912 Summer Olympics,6,USA,Athletics,"Pentathlon, Men",57014,Jim Donahue,78331,3,Silver,False


  display(df_filtered[df_filtered["pos"] == "2"][df_filtered["medal"] != "Silver"])


Unnamed: 0,edition,edition_id,country_noc,sport,event,result_id,athlete,athlete_id,pos,medal,isTeamSport
31133,2016 Summer Olympics,59,EGY,Weightlifting,"Middleweight, Men",354317,Mohamed Ihab,133608,2,Bronze,False
62650,2014 Winter Olympics,58,AUT,Biathlon,"4 × 7.5 kilometres Relay, Men",350026,Christoph Sumann,101195,2,Bronze,True
62651,2014 Winter Olympics,58,AUT,Biathlon,"4 × 7.5 kilometres Relay, Men",350026,Daniel Mesotitsch,101212,2,Bronze,True
62652,2014 Winter Olympics,58,AUT,Biathlon,"4 × 7.5 kilometres Relay, Men",350026,Simon Eder,118582,2,Bronze,True
62653,2014 Winter Olympics,58,AUT,Biathlon,"4 × 7.5 kilometres Relay, Men",350026,Dominik Landertinger,118518,2,Bronze,True
68358,2012 Summer Olympics,54,KAZ,Weightlifting,"Light-Heavyweight, Women",331063,Anna Nurmukhambetova,126715,2,Bronze,False


  display(df_filtered[df_filtered["pos"] == "1"][df_filtered["medal"] != "Gold"])


Unnamed: 0,edition,edition_id,country_noc,sport,event,result_id,athlete,athlete_id,pos,medal,isTeamSport
74641,2000 Summer Olympics,25,GRE,Athletics,"100 metres, Women",65683,Aikaterini Thanou,70693,1,Silver,False
144927,2014 Winter Olympics,58,GER,Biathlon,"4 × 7.5 kilometres Relay, Men",350026,Erik Lesser,127808,1,Silver,True
144928,2014 Winter Olympics,58,GER,Biathlon,"4 × 7.5 kilometres Relay, Men",350026,Daniel Böhm,127807,1,Silver,True
144929,2014 Winter Olympics,58,GER,Biathlon,"4 × 7.5 kilometres Relay, Men",350026,Arnd Peiffer,118509,1,Silver,True
144930,2014 Winter Olympics,58,GER,Biathlon,"4 × 7.5 kilometres Relay, Men",350026,Simon Schempp,118583,1,Silver,True
192783,2016 Summer Olympics,59,CHN,Weightlifting,"Middleweight, Men",354317,Lu Xiaojun,121984,1,Silver,False
