1. Importing Libraries

In [1]:
#Importing numpy (np), matplotlib.pyplot (plt), pandas (pd) and seaborn (sns)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns

2. Loading .csv files

In [2]:
# Read both input CSVs into DataFrames: the athlete rows first, then the table
# that is later joined to them on 'NOC'.
athletes, regions = (
    pd.read_csv(csv_name)
    for csv_name in ('athlete_events.csv', 'noc_regions.csv')
)

In [None]:
# Preview: column names plus the first two rows of the athletes table.
athletes.iloc[:2]

In [None]:
# Preview: column names plus the first two rows of the regions table.
regions.iloc[:2]

3. Joining 2 tables (athletes and regions)

In [None]:
# Left-join the regions table onto the athlete rows using the shared 'NOC'
# column. how='left' keeps every athlete row even when its NOC code has no
# match in `regions` (the region columns are then NaN for that row).
athletes_df = athletes.merge(regions, how='left', on='NOC')

# A notebook cell only renders its final expression, so a bare `.shape` in the
# middle of the cell is silently discarded: print it explicitly.
print(athletes_df.shape)

# Show a short preview instead of dumping the whole merged frame.
athletes_df.head()

In [None]:
# Capitalise the lowercase 'region' and 'notes' column names so they read
# 'Region' and 'Notes'; later cells refer to the 'Region' spelling.
column_renames = {'region': 'Region', 'notes': 'Notes'}
athletes_df.rename(columns=column_renames, inplace=True)
athletes_df.head()

In [None]:
# .info() prints a structural summary of the frame: the index range, and for
# each column its name, non-null count and dtype, followed by memory usage.
athletes_df.info()

In [None]:
# .describe() returns summary statistics (count, mean, std, min, quartiles,
# max); with default arguments on a mixed-type frame only numeric columns
# are summarised.
athletes_df.describe()

4. Checking for NULL values

In [None]:
# Flag, column by column, whether at least one value is missing.
nan_values = athletes_df.isnull()      # boolean frame: True where a cell is missing
nan_columns = nan_values.any(axis=0)   # collapse each column to a single True/False
nan_columns

In [None]:
# Exact number of missing values in each column.
athletes_df.isna().sum(axis=0)

5. Team INDIA's data

In [None]:
# First rows whose 'Team' value is exactly "India".
is_india = athletes_df['Team'] == "India"
athletes_df[is_india].head()

In [None]:
# First rows whose 'Team' value is exactly "Japan".
is_japan = athletes_df['Team'] == "Japan"
athletes_df[is_japan].head()

6. Top 10 countries according to no. of players

In [None]:
# Ten 'Team' values with the most rows in the dataset.
# value_counts() counts rows per team (not distinct athlete IDs) and already
# returns the result in descending order, so no extra sort_values() is needed
# before taking the first ten.
top_10_countries = athletes_df['Team'].value_counts().head(10)
top_10_countries

In [None]:
# Bar chart of the ten teams with the most rows.
plt.figure(figsize=(12,6))
plt.title('Overall participants of TOP-10 countries')
# The team names live in the Series index and the counts in its values.
# Passing the counts for both x and y (as before) labels every bar with a
# number and never shows which team it belongs to.
sns.barplot(x=top_10_countries.index, y=top_10_countries.values, palette='Set2')

In [None]:
# Histogram of the 'Age' column for every row of athletes_df (not only the
# top-10 teams), using 2-year-wide bins whose edges run from 10 to 78.
age_bin_edges = np.arange(10, 80, 2)

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Age distribition Chart")
ax.set_xlabel("Age")
ax.set_ylabel("No.of Participants")
ax.hist(athletes_df['Age'], bins=age_bin_edges, color='orange', edgecolor='black')

7. Winter Olympics Data

In [None]:
# Distinct 'Sport' values that occur in rows whose 'Season' is 'Winter'.
is_winter = athletes_df['Season'] == 'Winter'
winter_olympics = athletes_df.loc[is_winter, 'Sport'].unique()
winter_olympics

8. Summer Olympics Data

In [None]:
# Distinct 'Sport' values that occur in rows whose 'Season' is 'Summer'.
is_summer = athletes_df['Season'] == 'Summer'
summer_olympics = athletes_df.loc[is_summer, 'Sport'].unique()
summer_olympics

9. Male and Female Count

In [None]:
# Row count for each value of the 'Sex' column, largest first.
# This counts dataset rows, not distinct athlete IDs.
sex_column = athletes_df.Sex
gender_counts = sex_column.value_counts()
gender_counts

In [None]:
# Pie chart of the 'Sex' row counts, one wedge per value, annotated with its
# percentage to one decimal place.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Gender Distribution')
ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=150,
    shadow=True,
)

10. Percentage of Total Participants v/s Medalists 

In [None]:
# Percentage of athletes who won at least one medal.
# Both sides of the ratio count distinct athlete IDs. Counting non-null 'Medal'
# rows in the numerator (as before) counts an athlete once per medal, while the
# denominator counts each athlete once, which inflates the percentage.
has_medal = athletes_df["Medal"].notna()
medal_winners = athletes_df.loc[has_medal, "ID"].nunique()
total_participants = athletes_df["ID"].nunique()
x_y = (medal_winners / total_participants) * 100
x_y

11. Total female participants

In [None]:
# Female rows per year for the Summer season. After the groupby/count the
# 'Sex' column holds the number of rows for that year.
summer_women_mask = (athletes_df['Sex'] == "F") & (athletes_df['Season'] == 'Summer')
fe_participants = (
    athletes_df.loc[summer_women_mask, ['Sex', 'Year']]
    .groupby('Year')
    .count()
    .reset_index()
)
fe_participants.head(5)

In [None]:
# Female rows per year for the Winter season. After the groupby/count the
# 'Sex' column holds the number of rows for that year.
# Note: this rebinds `fe_participants`, replacing the Summer result.
winter_women_mask = (athletes_df['Sex'] == "F") & (athletes_df['Season'] == 'Winter')
fe_participants = (
    athletes_df.loc[winter_women_mask, ['Sex', 'Year']]
    .groupby('Year')
    .count()
    .reset_index()
)
fe_participants.tail(5)

In [None]:
# All rows for female athletes in the Summer season.
womenOylmpics = athletes_df.query("Sex == 'F' and Season == 'Summer'")

# Count plot: one bar per year, bar height = number of such rows in that year.
sns.set(style='darkgrid')
plt.figure(figsize=(12, 6))
women_ax = sns.countplot(data=womenOylmpics, x='Year', palette='Spectral')
women_ax.set_title('Women Participation')

In [None]:
# Line plot of how the number of female Summer rows changes from year to year.

# `part` is indexed by (Year, Sex); taking the 'F' slice of the second level
# leaves one count per year.
part = womenOylmpics.groupby('Year')['Sex'].value_counts()
female_per_year = part.xs('F', level=1)

plt.figure(figsize=(12, 6))
trend_ax = female_per_year.plot()
trend_ax.set_title("Plot of female athletes over the time")

12. All GOLD MEDAL winners

In [None]:
# Keep only the rows whose 'Medal' value is 'Gold'.
is_gold = athletes_df['Medal'].eq('Gold')
gold_Medal = athletes_df.loc[is_gold]
gold_Medal.head()

In [None]:
# Drop gold-medal rows with no usable age: np.isfinite is False for NaN
# (and for +/-inf), so only rows with a finite 'Age' survive.
age_is_finite = np.isfinite(gold_Medal['Age'])
goldMedals = gold_Medal.loc[age_is_finite]
goldMedals

In [None]:
# Number of gold-medal rows where the athlete's age is strictly above 60.
older_than_60 = goldMedals['Age'] > 60
gold_60 = goldMedals.loc[older_than_60, 'ID'].count()
gold_60

In [None]:
# 'Sport' (not the 'Event' column) of each gold-medal row where the athlete's
# age is strictly above 60.
older_than_60 = goldMedals['Age'] > 60
events_60 = goldMedals.loc[older_than_60, 'Sport']
events_60

In [None]:
# Count plot of the sports in which gold was won by athletes older than 60.
plt.figure(figsize=(10,5))
# Pass the Series by keyword: from seaborn 0.12 the first positional parameter
# of countplot is `data`, so a positional Series is no longer read as the
# variable to count.
sns.countplot(x=events_60)
plt.title("GoldMedals for age over 60")
# tight_layout() only adjusts what is already on the figure, so it has to run
# after the plot and title exist, not before.
plt.tight_layout()

12. Country wise medal tally

In [None]:
# Gold-medal rows per 'Region', keeping the six largest.
# The frame has two columns: 'index' (the region name) and 'Medal' (the count).
#
# rename_axis('index') pins the label column name. Without it reset_index()
# yields 'index' on pandas < 2.0 but 'Region' on pandas >= 2.0 (where
# value_counts() carries the source column name on its index), which breaks
# any later code that looks up the 'index' column.
gold_by_country = (
    goldMedals['Region']
    .value_counts()
    .rename_axis('index')
    .reset_index(name="Medal")
    .head(6)
)
gold_by_country

In [None]:
# Bar plot of the six regions with the most gold-medal rows.

# Look the label column up by position: depending on the pandas version that
# built gold_by_country it may be called 'index' or 'Region', so a hard-coded
# x='index' is fragile.
region_column = gold_by_country.columns[0]

graph = sns.catplot(x=region_column, y='Medal', data=gold_by_country, height=5, kind='bar', palette='rocket')
graph.despine(left=True)
# Labels corrected: six bars are drawn (not five), and the y-axis shows gold
# medal counts (not a number of countries).
graph.set_xlabels("Top 6 Countries")
graph.set_ylabels("No. of Gold Medals")
plt.title("Gold Per Country")


13. About Rio olympics

In [None]:
# Most recent value in the 'Year' column, i.e. the latest Games in the data.
year_column = athletes_df.Year
max_year = year_column.max()
max_year

In [None]:
# Gold-medal rows per 'Team' for Year == 2016, top 15 teams.
# Only 'Gold' rows are counted here, not silver or bronze; the result is a
# Series of counts indexed by team name.
rio_gold_mask = (athletes_df['Year'] == 2016) & (athletes_df['Medal'] == 'Gold')
team_names = athletes_df.loc[rio_gold_mask, 'Team'].value_counts().head(15)
team_names


In [None]:
# Horizontal bar chart of the 2016 per-team counts computed above:
# counts run along x, team names (the Series index) along y.
fig, ax = plt.subplots(figsize=(18, 8))
sns.barplot(x=team_names, y=team_names.index, ax=ax)
ax.set_ylabel(None)  # the team names label themselves; drop the axis caption
ax.set_title("Rio-2016 Medal Tally of Top-15 Countries")

In [None]:
# Scatter plot of height against weight, coloured by 'Sex'.
# Rows missing either measurement are dropped first; despite the variable
# name, this filter does not look at the 'Medal' column.
not_null_medals = athletes_df.dropna(subset=['Height', 'Weight'])

fig, scatter_ax = plt.subplots(figsize=(18, 6))
axis = sns.scatterplot(data=not_null_medals, x='Height', y='Weight', hue='Sex', ax=scatter_ax)
axis.set_title('Height vs Weight of Participants.')