# Question

In [1]:
from sqlalchemy import create_engine
import pandas as pd

# Database connection URL
database_url = 'mysql+pymysql://360556_root:hackathonipssi*@mysql-hackathonipssi.alwaysdata.net/hackathonipssi_mia4'

# Create a database engine
engine = create_engine(database_url)

# Connect to the database and read tables
try:
    connection = engine.connect()
    df_athletes = pd.read_sql_table('Athlete', con=connection)
    df_hosts = pd.read_sql_table('Host', con=connection)
    df_medals = pd.read_sql_table('Medal', con=connection)
    df_results = pd.read_sql_table('Result', con=connection)
    print("Athlete Data:\n", df_athletes.head())
    print("Host Data:\n", df_hosts.head())
    print("Medal Data:\n", df_medals.head())
    print("Result Data:\n", df_results.head())
finally:
    connection.close()  # Always close the connection



Athlete Data:
                                          athlete_url     athlete_full_name  \
0  https://olympics.com/en/athletes/a-aziz-hassan...  A-Aziz Hassan JALOOF   
1    https://olympics.com/en/athletes/a-baser-wasiqi        A Baser WASIQI   
2          https://olympics.com/en/athletes/a-darnis             A. DARNIS   
3  https://olympics.com/en/athletes/a-germaine-go...   A. Germaine GOLDING   
4          https://olympics.com/en/athletes/a-j-hurt              A J HURT   

   games_participations      first_game  athlete_year_birth athlete_medals  \
0                   2.0  Barcelona 1992              1973.0           None   
1                   1.0    Atlanta 1996              1975.0           None   
2                   1.0      Paris 1900                 0.0           None   
3                   1.0      Paris 1924              1887.0           None   
4                   1.0    Beijing 2022              2000.0           None   

                                               

### La France a organisé 6 JO : 3 d’hiver et 3 d’été (en comptant celui de 2024) ?

In [2]:
# Vérifiez les valeurs uniques dans la colonne 'game_season'
unique_game_seasons = df_hosts['game_season'].unique()
print(unique_game_seasons)


['Winter' 'Summer']


In [3]:
# Filtrer les données pour afficher uniquement les lignes où 'game_location' est égale à 'France'
df_hosts_france = df_hosts[df_hosts['game_location'] == 'France']

# Afficher les résultats filtrés
print(df_hosts_france)

# Compter le nombre de lignes pour chaque valeur unique dans 'game_season'
season_counts = df_hosts_france['game_season'].value_counts()

# Afficher les résultats des comptages
print(season_counts)

           game_slug       game_end_date     game_start_date game_location  \
0   albertville-1992 1992-02-23 19:00:00 1992-02-08 07:00:00        France   
11     chamonix-1924 1924-02-05 20:00:00 1924-01-25 08:00:00        France   
14     grenoble-1968 1968-02-18 19:00:00 1968-02-06 07:00:00        France   
33        paris-1900 1900-10-28 19:50:39 1900-05-14 08:50:39        France   
34        paris-1924 1924-07-27 19:00:00 1924-05-04 07:00:00        France   

           game_name game_season  game_year  
0   Albertville 1992      Winter       1992  
11     Chamonix 1924      Winter       1924  
14     Grenoble 1968      Winter       1968  
33        Paris 1900      Summer       1900  
34        Paris 1924      Summer       1924  
game_season
Winter    3
Summer    2
Name: count, dtype: int64


### La France est le 2è pays qui a organisé le plus de JO après les USA (8 JO) ?

In [4]:
# Créer un DataFrame pour compter le nombre de Jeux Olympiques organisés par chaque pays
olympic_hosts_count = df_hosts['game_location'].value_counts().reset_index()
olympic_hosts_count.columns = ['Country', 'Number of Olympic Games Hosted']
olympic_hosts_count = olympic_hosts_count.sort_values(by='Number of Olympic Games Hosted', ascending=False)
# Afficher le DataFrame créé
print(olympic_hosts_count)


# Accéder à la deuxième ligne du DataFrame trié
second_most_hosting_country = olympic_hosts_count.iloc[1]

# Afficher la deuxième ligne
print('le 2è pays qui a organisé le plus de JO')
second_most_hosting_country

                        Country  Number of Olympic Games Hosted
0                 United States                               8
1                        France                               5
2                         Japan                               4
3                        Canada                               3
4                         Italy                               3
5                 Great Britain                               3
10                       Norway                               2
12            Republic of Korea                               2
11                      Germany                               2
9                       Austria                               2
8                   Switzerland                               2
7                        Greece                               2
6                         China                               2
19  Federal Republic of Germany                               1
24                        Spain         

Country                           France
Number of Olympic Games Hosted         5
Name: 1, dtype: object

### Les JO d’hiver sont nés à Chamonix en 1924 ?

In [5]:
# Filtrer pour obtenir uniquement les Jeux Olympiques d'hiver
winter_games = df_hosts[df_hosts['game_season'] == 'Winter']

# Trouver la ligne avec la date de début la plus ancienne des Jeux Olympiques d'hiver
oldest_winter_game = winter_games.loc[winter_games['game_start_date'].idxmin()]

# Afficher la date la plus ancienne et le pays correspondant
print(f"The earliest Winter Olympic Games started on: {oldest_winter_game['game_start_date']} in {oldest_winter_game['game_name']}")


The earliest Winter Olympic Games started on: 1924-01-25 08:00:00 in Chamonix 1924


### JO de Paris, en 1900 : les femmes peuvent participer aux JO ?

In [6]:
# Custom operation because this isn't a direct equality or join key
merged_df = pd.merge(df_hosts, df_medals, left_on='game_slug', right_on='slug_game')

# Step 3: Filter for rows where event_gender is either 'Mixed' or 'Women'
filtered_df = merged_df[(merged_df['event_gender'] == 'Mixed') | (merged_df['event_gender'] == 'Women')]
filtered_df = filtered_df.sort_values(by='game_start_date', ascending=True)
# Display the results

filtered_df = filtered_df.iloc[0]
print(filtered_df[['game_start_date', 'game_name', 'game_location', 'slug_game', 'event_gender']])


game_start_date    1900-05-14 08:50:39
game_name                   Paris 1900
game_location                   France
slug_game                   paris-1900
event_gender                     Women
Name: 11830, dtype: object


### Seuls 4 athlètes ont remporté des médailles à la fois aux JO d’hiver et d’été. Une seule d’entre eux, Christa Ludinger-Rothenburger, a remporté des médailles au cours de la même année ?

In [7]:
# Custom operation because this isn't a direct equality or join key
merged_df = pd.merge(df_hosts, df_medals, left_on='game_slug', right_on='slug_game')

# Filter out rows where medal_type is 'NULL'
valid_medals_df = merged_df[merged_df['medal_type'] != 'NULL']

# Step 3: Group by athlete names and Olympic types (assuming Olympic type is in 'game_season')
# Ensure that only valid medals are considered
athlete_medals = valid_medals_df.groupby(['athlete_url', 'game_season']).size().unstack(fill_value=0)

# Filter athletes who have at least one medal in both Winter and Summer Olympics
dual_medalists = athlete_medals[(athlete_medals['Winter'] > 0) & (athlete_medals['Summer'] > 0)]

# Display these athletes
print(dual_medalists)

game_season                                        Summer  Winter
athlete_url                                                      
https://olympics.com/en/athletes/christa-luding         1       4
https://olympics.com/en/athletes/clara-hughes           2       3
https://olympics.com/en/athletes/gillis-grafstrom       1       3
https://olympics.com/en/athletes/lauryn-williams        1       1
https://olympics.com/en/athletes/walter-jakobsson       1       1


In [124]:
df_hosts.dtypes

game_slug                  object
game_end_date      datetime64[ns]
game_start_date    datetime64[ns]
game_location              object
game_name                  object
game_season                object
game_year                   int64
dtype: object

In [8]:
# Step 2: Merge df_hosts with df_medals where slug_game contains simplified game_name
merged_df = pd.merge(df_hosts, df_medals, left_on='game_slug', right_on='slug_game')

# Ensure game_start_date is a datetime to extract the year
merged_df['game_start_date'] = pd.to_datetime(merged_df['game_start_date'])
merged_df['year'] = merged_df['game_start_date'].dt.year

# Filter out rows where medal_type is 'NULL'
valid_medals_df = merged_df[merged_df['medal_type'] != 'NULL']

# Step 3: Group by athlete names, Olympic types, and year
# This will allow checking for medals won by type and year
athlete_medals_by_year = valid_medals_df.groupby(['athlete_full_name', 'game_season', 'year']).size().unstack(level=1, fill_value=0)

# Check for Christa Luding-Rothenburger specifically
christa_medals = athlete_medals_by_year.loc['Christa LUDING-ROTHENBURGER']
christa_medals
# Determine if there are years where she won medals in both Winter and Summer Olympics
dual_year_medals = christa_medals[(christa_medals['Winter'] > 0) & (christa_medals['Summer'] > 0)]

# Display the results
print(christa_medals)


game_season  Summer  Winter
year                       
1984              0       1
1988              1       2
1992              0       1
