In [1]:
import pandas as pd
from scipy.stats import ttest_ind, pearsonr

# Load and clean the dataset
df = pd.read_csv("/content/olympics.csv")

In [2]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,? Summer,01 !,02 !,03 !,Total,? Winter,01 !,02 !,03 !,Total,? Games,01 !,02 !,03 !,Combined total
1,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
2,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
3,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
4,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12


In [3]:
df.columns = df.iloc[0]
df = df.drop(index=0).reset_index(drop=True)

In [4]:
df.columns = ['Country', 'Summer_Games', 'Summer_Gold', 'Summer_Silver', 'Summer_Bronze', 'Summer_Total',
              'Winter_Games', 'Winter_Gold', 'Winter_Silver', 'Winter_Bronze', 'Winter_Total',
              'Combined_Games', 'Combined_Gold', 'Combined_Silver', 'Combined_Bronze', 'Combined_Total']

In [5]:
df.head()

Unnamed: 0,Country,Summer_Games,Summer_Gold,Summer_Silver,Summer_Bronze,Summer_Total,Winter_Games,Winter_Gold,Winter_Silver,Winter_Bronze,Winter_Total,Combined_Games,Combined_Gold,Combined_Silver,Combined_Bronze,Combined_Total
0,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
1,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
2,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
3,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
4,Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12


In [6]:
# Convert numeric columns
numeric_cols = df.columns[1:]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')


In [7]:
desc_stats = df[numeric_cols].agg(['mean', 'median', 'min', 'max', 'std']).transpose()
print("=== Descriptive Statistics ===")
print(desc_stats)


=== Descriptive Statistics ===
                       mean  median  min      max          std
Summer_Games      13.476190    13.0  1.0     27.0     7.072359
Summer_Gold       65.428571     3.0  0.0   4809.0   405.549990
Summer_Silver     64.965986     4.0  0.0   4775.0   399.309960
Summer_Bronze     69.795918     6.0  0.0   5130.0   427.187344
Summer_Total     200.190476    12.0  0.0  14714.0  1231.306297
Winter_Games       6.700680     5.0  0.0     22.0     7.433186
Winter_Gold       13.047619     0.0  0.0    959.0    80.799204
Winter_Silver     13.034014     0.0  0.0    958.0    80.634421
Winter_Bronze     12.897959     0.0  0.0    948.0    79.588388
Winter_Total      38.979592     0.0  0.0   2865.0   240.917324
Combined_Games    20.176871    15.0  1.0     49.0    13.257048
Combined_Gold     78.476190     3.0  0.0   5768.0   485.013378
Combined_Silver   78.000000     4.0  0.0   5733.0   478.860334
Combined_Bronze   82.693878     7.0  0.0   6078.0   505.855110
Combined_Total   239.170

In [8]:
# 1. T-Test: Summer vs Winter Total medals
summer = df['Summer_Total'].dropna()
winter = df['Winter_Total'].dropna()
t_stat, p_val = ttest_ind(summer, winter, equal_var=False)
print("\n=== T-Test: Summer vs Winter Total Medals ===")
print(f"T-Statistic: {t_stat:.4f}, P-Value: {p_val:.4f}")


=== T-Test: Summer vs Winter Total Medals ===
T-Statistic: 1.5579, P-Value: 0.1213


In [9]:
# 2. Correlation: Combined_Games vs Combined_Total
corr, corr_p = pearsonr(df['Combined_Games'], df['Combined_Total'])
print("\n=== Correlation: Combined_Games vs Combined_Total ===")
print(f"Pearson Correlation: {corr:.4f}, P-Value: {corr_p:.4f}")


=== Correlation: Combined_Games vs Combined_Total ===
Pearson Correlation: 0.2743, P-Value: 0.0008
