In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from dotenv import load_dotenv

# Load settings from a .env file, if one is present, before reading any paths.
load_dotenv()

# Prefer a CSV_PATH environment variable (e.g. supplied through .env). When it
# is not set, fall back to the original literal so existing behaviour is kept.
df = pd.read_csv(os.environ.get('CSV_PATH', r'CSV_PATH'))

In [None]:
# Rich preview of the first five rows of the loaded frame.
display(df.head(n=5))

In [None]:
# Number of missing values in each column.
print(df.isna().sum())

In [None]:
# Row count for each distinct value of the Category column.
category_counts = df['Category'].value_counts()
print(category_counts)

In [None]:
# Mean prize amount per category.
# At this point 'Prize Amount' may still hold raw text (a later cell strips
# non-numeric characters from it), and a mean over text is not meaningful.
# Coerce a local numeric copy so this cell works whether or not the column has
# already been cleaned; `df` itself is left untouched.
prize_amount_numeric = pd.to_numeric(
    df['Prize Amount'].astype(str).str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',  # unparseable / empty values become NaN and are skipped by mean()
)
print(prize_amount_numeric.groupby(df['Category']).mean())

In [None]:
def _money_to_int(column):
    """Return `column` with every non-digit, non-dot character removed, cast to int.

    The pattern is a raw string: in a normal string literal ``\d`` is an
    invalid escape sequence, which newer Python versions warn about.
    The cast goes through float first so values such as "123.0" are accepted.
    """
    return column.replace(r'[^\d.]', '', regex=True).astype(float).astype(int)


# Apply the same parsing to both monetary columns.
for money_column in ('Prize Amount', 'Prize Amount Adjusted'):
    df[money_column] = _money_to_int(df[money_column])

In [None]:
# Show the dtype of every column.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# First rows of the two prize-amount columns.
amount_columns = ['Prize Amount', 'Prize Amount Adjusted']
print(df[amount_columns].head())

In [None]:
# For every object-dtype column: a header line, its value counts, then blank lines.
object_columns = df.select_dtypes(include=['object']).columns
for column in object_columns:
    # sep="\n" plus the default line ending reproduces three separate print calls.
    print(f"--- {column} ---", df[column].value_counts(), "\n", sep="\n")

In [None]:
# Rows whose Gender value is the literal 'Unknown'.
is_unknown_gender = df['Gender'].eq('Unknown')
unknown_names = df.loc[is_unknown_gender]
display(unknown_names)

In [None]:
# Keep only rows with a known gender. Take an explicit copy so that later
# column assignments on `df` write to an independent frame rather than to a
# slice of the original (which triggers SettingWithCopyWarning).
df = df[df['Gender'] != 'Unknown'].copy()

In [None]:
# Number of rows per award year, ordered by year.
year_counts = df['Award Year'].value_counts()
awards_per_year = year_counts.sort_index()
print(awards_per_year)

# Sum of 'Prize Amount' within each award year.
prize_amount_by_year = df.groupby('Award Year')['Prize Amount']
total_prize_amount_per_year = prize_amount_by_year.sum()

In [None]:
# Row count for each Gender value.
gender_counts = df['Gender'].value_counts()
print(gender_counts)

In [None]:
# Count laureate names for each (Category, Gender) pair, as a long-format frame
# with the count stored in a 'Count' column.
names_by_category_gender = df.groupby(['Category', 'Gender'])['Laureate Name']
gender_discrepancy = names_by_category_gender.count().reset_index(name='Count')

print(gender_discrepancy)

In [None]:
# Reshape to one row per category and one column per gender; combinations that
# never occur are filled with 0.
counts_wide = gender_discrepancy.pivot(index='Category', columns='Gender', values='Count')
pivot_table = counts_wide.fillna(0)

print(pivot_table)

In [None]:
# Stacked bar chart: one bar per category, segmented by gender.
ax = pivot_table.plot.bar(stacked=True, figsize=(10, 6))

ax.set_title('Gender Discrepancy Across Nobel Prize Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Awards')
ax.legend(title='Gender')

plt.show()

In [None]:
# Subset of rows whose Gender is 'female'.
female_winners = df[df['Gender'] == 'female']

# Laureate-name count per award year within that subset.
female_names_by_year = female_winners.groupby('Award Year')['Laureate Name']
female_winners_per_year = female_names_by_year.count()

# Line chart of the yearly counts.
fig, ax = plt.subplots(figsize=(12, 6))
female_winners_per_year.plot.line(ax=ax)
ax.set_title('Trend of Female Nobel Prize Winners Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Female Winners')
ax.grid(True)
plt.show()

In [None]:
# Subset of rows whose Gender is 'male'.
male_winners = df[df['Gender'] == 'male']

# Laureate-name count per award year within that subset.
male_winners_per_year = male_winners.groupby('Award Year')['Laureate Name'].count()

plt.figure(figsize=(12, 6))
male_winners_per_year.plot(kind='line')
plt.title('Trend of Male Nobel Prize Winners Over Time')
plt.xlabel('Year')
# This chart shows male laureates; the label previously said "Female".
plt.ylabel('Number of Male Winners')
plt.grid(True)
plt.show()

In [None]:
# Laureate-name count per category for the female subset.
female_names_by_category = female_winners.groupby('Category')['Laureate Name']
female_winners_by_category = female_names_by_category.count()

In [None]:
# Laureate-name count per category for the male subset.
male_names_by_category = male_winners.groupby('Category')['Laureate Name']
male_winners_by_category = male_names_by_category.count()

In [None]:
# Categories ordered from most to fewest female laureates.
sorted_female_winners_by_category = female_winners_by_category.sort_values(ascending=False)

# Bar chart of the ordered counts with slanted category labels.
fig, ax = plt.subplots(figsize=(10, 6))
sorted_female_winners_by_category.plot.bar(ax=ax)
ax.set_title('Number of Female Nobel Prize Winners by Category, Ordered by Winners')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Female Winners')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Categories ordered from most to fewest male laureates.
sorted_male_winners_by_category = male_winners_by_category.sort_values(ascending=False)

# Bar chart of the ordered counts with slanted category labels.
fig, ax = plt.subplots(figsize=(10, 6))
sorted_male_winners_by_category.plot.bar(ax=ax)
ax.set_title('Number of Male Nobel Prize Winners by Category, Ordered by Winners')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Male Winners')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Select the plotly renderer used by fig.show() from here on.
pio.renderers.default = 'iframe_connected'

# Two-column frame: each birth country and how many rows list it.
winners_per_country = (
    df['Birth Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Total Winners')
)

# World map coloured by the per-country count; countries are matched by name.
choropleth_options = dict(
    locations="Country",
    locationmode="country names",
    color="Total Winners",
    hover_name="Country",
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Global Nobel Prize Winners by Country",
)
fig = px.choropleth(winners_per_country, **choropleth_options)
fig.show()

In [None]:
# Make sure the year column is integer-typed before comparing against 1990.
df['Award Year'] = df['Award Year'].astype(int)

# Split the rows into the two periods.
award_year = df['Award Year']
before_1990 = df[award_year < 1990]
after_1990 = df[award_year >= 1990]

# Per-gender counts in each period.
gender_counts_before_1990 = before_1990['Gender'].value_counts()
gender_counts_after_1990 = after_1990['Gender'].value_counts()

# The same counts expressed as a percentage of the period's total.
gender_pct_before_1990 = gender_counts_before_1990.div(gender_counts_before_1990.sum()).mul(100)
gender_pct_after_1990 = gender_counts_after_1990.div(gender_counts_after_1990.sum()).mul(100)

# One row per gender; a gender missing from a period gets 0 instead of NaN.
gender_discrepancy_df = pd.DataFrame({
    'Before 1990 Count': gender_counts_before_1990,
    '1990 and After Count': gender_counts_after_1990,
    'Before 1990 (%)': gender_pct_before_1990,
    '1990 and After (%)': gender_pct_after_1990,
})
gender_discrepancy_df = gender_discrepancy_df.fillna(0)

# Column sums used for the summary row.
count_columns = ['Before 1990 Count', '1990 and After Count']
pct_columns = ['Before 1990 (%)', '1990 and After (%)']
total_counts = gender_discrepancy_df[count_columns].sum()
total_pct = gender_discrepancy_df[pct_columns].sum()

# Append a 'Total' row; the percentage cells are set to 100 directly.
total_row = {
    'Before 1990 Count': total_counts['Before 1990 Count'],
    '1990 and After Count': total_counts['1990 and After Count'],
    'Before 1990 (%)': 100,
    '1990 and After (%)': 100,
}
gender_discrepancy_df.loc['Total'] = pd.Series(total_row)

display(gender_discrepancy_df)

In [None]:
# Restrict the analysis to three categories and work on an independent copy.
categories_focus = ['Physics', 'Chemistry', 'Economics']
df_filtered = df[df['Category'].isin(categories_focus)].copy()
df_filtered['Award Year'] = df_filtered['Award Year'].astype(int)

before_1990_filtered = df_filtered[df_filtered['Award Year'] < 1990]
after_1990_filtered = df_filtered[df_filtered['Award Year'] >= 1990]

# One shared gender order for both periods. value_counts() orders by frequency
# and omits genders with no rows, so the two raw series can differ in order or
# length; plotting them positionally would mislabel bars or fail on a length
# mismatch. Reindexing aligns both to the same labels, with 0 where absent.
genders = df_filtered['Gender'].dropna().unique()

gender_counts_before_1990_filtered = (
    before_1990_filtered['Gender'].value_counts().reindex(genders, fill_value=0)
)
gender_counts_after_1990_filtered = (
    after_1990_filtered['Gender'].value_counts().reindex(genders, fill_value=0)
)

# Share of each gender within its period; an empty period yields 0 rather than NaN.
gender_pct_before_1990_filtered = (
    gender_counts_before_1990_filtered / gender_counts_before_1990_filtered.sum() * 100
).fillna(0)
gender_pct_after_1990_filtered = (
    gender_counts_after_1990_filtered / gender_counts_after_1990_filtered.sum() * 100
).fillna(0)

x = np.arange(len(genders))  # one bar group per gender
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))

# Two bars per gender, placed side by side around each tick.
rects1 = ax.bar(x - width/2, gender_pct_before_1990_filtered, width, label='Before 1990', color='lightblue')
rects2 = ax.bar(x + width/2, gender_pct_after_1990_filtered, width, label='1990 and After', color='lightyellow')

ax.set_xlabel('Gender')
ax.set_ylabel('Percentage')
ax.set_title('Nobel Prize Laureates by Gender and Period (%) for Physics, Chemistry, Economics')
ax.set_xticks(x)
ax.set_xticklabels(genders)  # same order the bars were drawn in
ax.legend()

def autolabel(rects, counts):
    """Annotate each bar in `rects` with its height as a percentage and the matching count.

    `rects` and `counts` are paired positionally; the text is drawn on the
    current global `ax`, offset from the top centre of the bar.
    """
    for rect, count in zip(rects, counts):
        height = rect.get_height()
        ax.annotate(f'{height:.1f}% ({count})',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(20, -10),
                    textcoords="offset points",
                    ha='right', va='bottom')

autolabel(rects1, gender_counts_before_1990_filtered)
autolabel(rects2, gender_counts_after_1990_filtered)

plt.show()

In [None]:
# Everything except the last row of the summary table (the appended 'Total' row).
by_gender = gender_discrepancy_df.iloc[:-1]

x = np.arange(len(by_gender.index))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))

# Two bars per gender, placed side by side around each tick.
rects1 = ax.bar(x - width/2, by_gender['Before 1990 (%)'], width, label='Before 1990', color='lightblue')
rects2 = ax.bar(x + width/2, by_gender['1990 and After (%)'], width, label='1990 and After', color='lightyellow')

ax.set_xlabel('Gender')
ax.set_ylabel('Percentage')
ax.set_title('Nobel Prize Laureates by Gender and Period (%)')
ax.set_xticks(x)
ax.set_xticklabels(by_gender.index)
ax.legend()

def autolabel(rects, counts):
    """Write '<height>% (<count>)' next to the top of every bar in `rects`.

    `rects` and `counts` are walked in parallel; annotations go on the
    current global `ax`.
    """
    for rect, count in zip(rects, counts):
        height = rect.get_height()
        center_x = rect.get_x() + rect.get_width() / 2
        ax.annotate(
            f'{height:.1f}% ({count})',
            xy=(center_x, height),
            xytext=(20, -10),
            textcoords="offset points",
            ha='right',
            va='bottom',
        )

autolabel(rects1, by_gender['Before 1990 Count'])
autolabel(rects2, by_gender['1990 and After Count'])

plt.show()

In [None]:
# Female rows, reduced to the birth-country and category columns.
female_laureates = df.loc[df['Gender'].eq('female')]
female_laureates_country_category = female_laureates.loc[:, ['Birth Country', 'Category']]

print(female_laureates_country_category)

In [None]:
# Helper column of ones so rows can be counted through a groupby.
df['Counts'] = 1
female_df = df.loc[df['Gender'] == 'female']

# Female laureates per (birth country, category) pair.
aggregated_data = female_df.groupby(['Birth Country', 'Category'], as_index=False)['Counts'].count()

# Female laureates per birth country across all categories.
total_counts_per_country = aggregated_data.groupby('Birth Country', as_index=False)['Counts'].sum()

# The ten countries with the largest totals, and their per-category rows.
top_10_countries = total_counts_per_country.nlargest(10, 'Counts')['Birth Country']
in_top_10 = aggregated_data['Birth Country'].isin(top_10_countries)
top_countries_data = aggregated_data[in_top_10]

# Fixed colour for each prize category.
color_map = {
    'Physics': 'blue',
    'Chemistry': 'green',
    'Medicine': 'red',
    'Literature': 'purple',
    'Peace': 'gold',
    'Economics': 'orange',
}

# Horizontal bars per country, coloured by category, labelled with the count.
axis_labels = {
    'Counts': 'Total Number of Female Laureates',
    'Birth Country': 'Birth Country',
    'Category': 'Prize Category',
}
fig = px.bar(
    top_countries_data,
    y='Birth Country',
    x='Counts',
    color='Category',
    color_discrete_map=color_map,
    orientation='h',
    title='Top 10 Countries with Highest Total Female Nobel Laureates by Category',
    labels=axis_labels,
    text='Counts',
)

# Axis titles, with countries ordered by their total bar length.
layout_options = dict(
    xaxis_title='Total Number of Female Laureates',
    yaxis_title='Birth Country',
    yaxis={'categoryorder': 'total ascending'},
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Determine the top 10 countries based on total counts, and keep only their
# rows. This is done BEFORE the category filter below, so the filter runs on
# the frame computed in this cell rather than on whatever an earlier cell left
# behind (previously the recomputation came after its only use).
top_10_countries = total_counts_per_country.nlargest(10, 'Counts')['Birth Country']
top_countries_data = aggregated_data[aggregated_data['Birth Country'].isin(top_10_countries)]

# Narrow the top-10 rows down to the three categories of interest.
# NOTE(review): the ranking uses totals across ALL categories, so a country can
# be in the top 10 yet have no rows here, and fewer than 10 bars may appear.
# Confirm whether the ranking should be computed within these categories instead.
selected_categories = ['Physics', 'Chemistry', 'Economics']
selected_categories_data = top_countries_data[top_countries_data['Category'].isin(selected_categories)]

# Horizontal bars per country, coloured by category, labelled with the count.
fig = px.bar(
    selected_categories_data,
    y='Birth Country',
    x='Counts',
    color='Category',
    color_discrete_map=color_map,  # category colours defined in an earlier cell
    orientation='h',
    title='Top 10 Countries with Highest Total Female Nobel Laureates in Physics, Chemistry, and Economics',
    labels={'Counts': 'Total Number of Female Laureates', 'Birth Country': 'Birth Country', 'Category': 'Prize Category'},
    text='Counts'
)

# Axis titles, with countries ordered by their total bar length.
fig.update_layout(
    xaxis_title='Total Number of Female Laureates',
    yaxis_title='Birth Country',
    yaxis={'categoryorder': 'total ascending'}
)

# Display the plot
fig.show()