In [23]:
import pandas as pd
import numpy as np
import plotly as py
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode; the py.offline.iplot(...) calls
# further down are the ones that render figures in this notebook.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from a CDN
# instead of embedding it in the notebook, so viewing the figures presumably
# needs internet access -- confirm this is intended.
py.offline.init_notebook_mode(connected=True)

In [24]:
# Load the four raw inputs from the local Data/ directory.
# The classification workbook and the GDP file both take their header from
# row index 4 (header=4), so the rows above it are skipped.
economy_df = pd.read_excel(
    'Data/country_classifications.xls',
    sheet_name=0,
    header=4,
)
ed_df = pd.read_csv('Data/EDULIT_DS.csv')
gapall = pd.read_csv('Data/GenderGapIndex.csv')
# Keep the four identifying columns plus positions 56-60; those five value
# columns are relabelled as the 2012-2016 GDP-per-capita columns further down.
gdp_all = pd.read_csv(
    'Data/GDP_Per_Capita.csv',
    header=4,
    usecols=[0, 1, 2, 3] + list(range(56, 61)),
)

In [25]:
# Reduce the country-classification table to the join key ('Code') and the two
# descriptive columns that are carried into the later merges.
classification_columns = ['Code', 'Region', 'Income group']
economy_df = economy_df.loc[:, classification_columns]

In [26]:
# Give the nine GDP columns explicit names (positional assignment, so the order
# must match the usecols selection made when the file was read), then attach
# Region / Income group to each country via its 'Code'.
gdp_column_names = [
    'Country Name',
    'Code',
    'GDP Indicator Name',
    'GDP Indicator Code',
    '2012_GDPPerCapita',
    '2013_GDPPerCapita',
    '2014_GDPPerCapita',
    '2015_GDPPerCapita',
    '2016_GDPPerCapita',
]
gdp_all.columns = gdp_column_names
# Inner join: countries missing from the classification table are dropped.
gdp_all = pd.merge(gdp_all, economy_df, on='Code', how='inner')


In [27]:
# Build a wide table of the female share of STEM graduates: one row per country
# code, one column per year.
# The whole transformation is a single non-mutating chain, so nothing is renamed
# in place on a slice of ed_df (which is what triggers pandas'
# SettingWithCopyWarning) and the cell gives the same result every time it runs.
STEM_INDICATOR = 'Percentage of graduates from Science, Technology , Engineering and Mathematics programmes in tertiary education who are female (%)'

stem_df = (
    ed_df
    # Keep only the rows for the STEM indicator and the three columns we need.
    .loc[ed_df['Indicator'] == STEM_INDICATOR, ['LOCATION', 'Time', 'Value']]
    .rename(columns={'LOCATION': 'Code', 'Time': 'STEM Year', 'Value': 'STEM_fem_perc'})
    # Long -> wide. pivot raises if a (Code, STEM Year) pair occurs more than once.
    .pivot(index='Code', columns='STEM Year', values='STEM_fem_perc')
)


In [28]:
def _stem_year_label(column):
    """Return '<year>_stem' for a year-like column label, else the label unchanged.

    Labels that cannot be read as a number (e.g. 'Code' or an already converted
    '2016_stem') are returned as-is, which makes the relabelling safe to repeat.
    """
    try:
        return '{}_stem'.format(int(float(column)))
    except (TypeError, ValueError, OverflowError):
        return column


# Label each pivoted year column from its own year value instead of assigning a
# hard-coded list by position: a positional list silently mislabels columns if
# the data covers different years, and on a second run of this cell it would
# rename 'Code' itself.
stem_df = (
    stem_df
    .rename(columns=_stem_year_label)
    # Clear the 'STEM Year' name left on the column index by the pivot, matching
    # what a plain list assignment to .columns would have produced.
    .rename_axis(columns=None)
    # 2017 is not used downstream; tolerate it already being absent.
    .drop(columns=['2017_stem'], errors='ignore')
)
# Turn the 'Code' index back into a regular column so it can serve as a merge
# key. Guarded so a re-run does not add a spurious 'index' column.
if stem_df.index.name == 'Code':
    stem_df = stem_df.reset_index()


In [29]:
# Extract the overall Global Gender Gap *index* rows and give the year columns
# explicit '<year>_gendergap' names so they stay distinguishable after merging
# with the other per-year tables.
# rename() is used without inplace=True: renaming in place on a frame sliced
# from gapall is what triggers pandas' SettingWithCopyWarning.
gendergap_columns = {'Country ISO3': 'Code'}
gendergap_columns.update(
    (str(year), '{}_gendergap'.format(year)) for year in range(2006, 2017)  # 2006..2016
)

is_overall_index = (
    (gapall['Indicator'] == 'Overall Global Gender Gap Index')
    & (gapall['Subindicator Type'] == 'Index')
)
overall_gendergap = gapall.loc[is_overall_index, :].rename(columns=gendergap_columns)


In [30]:
# Combine the gender-gap index with the STEM shares; the inner join keeps only
# country codes present in both tables.
gender_stem = pd.merge(overall_gendergap, stem_df, on='Code', how='inner')


In [31]:
# Add GDP per capita and the region / income classification. Column names that
# exist on both sides get pandas' default '_x' (left) / '_y' (right) suffixes.
# NOTE: this cell overwrites its own input, so run it exactly once per kernel
# session (re-running merges gdp_all in a second time).
gender_stem = pd.merge(gender_stem, gdp_all, on='Code', how='inner')


In [32]:
# Columns needed for the 2016 gender-gap vs STEM scatter plot.
# 'Country Name_x' is the left-hand country-name column produced by the merge.
columns_2016 = ['Country Name_x', '2016_gendergap', '2016_stem', 'Region']
df_2016 = gender_stem.loc[:, columns_2016]


In [33]:
# Discard every country that is missing any of the selected 2016 values.
df_2016 = df_2016.dropna(axis=0, how='any')


In [34]:
# Bubble scatter: 2016 gender gap index (x) vs. female share of STEM graduates
# (y), with one trace per region so every region gets its own legend entry.

# Regions in plotting order; the order fixes the legend order and which default
# colour each region receives.
regions = [
    'Europe & Central Asia',
    'North America',
    'Latin America & Caribbean',
    'Sub-Saharan Africa',
    'East Asia & Pacific',
    'Middle East & North Africa',
    'South Asia',
]

# Build the traces in a loop rather than seven copy-pasted blocks, filtering the
# frame once per region instead of once per plotted column.
data = []
for region in regions:
    region_rows = df_2016[df_2016['Region'] == region]
    data.append(
        go.Scatter(
            x=region_rows['2016_gendergap'],
            y=region_rows['2016_stem'],
            mode='markers',
            name=region,
            text=region_rows['Country Name_x'],  # hover label
            marker=dict(size=40, opacity=0.5, line=dict(width=2)),
        )
    )

# Keep the individual trace names available in case other cells refer to them.
trace0, trace1, trace2, trace3, trace4, trace5, trace6 = data

# Styling shared by both axes.
axis_style = dict(
    gridcolor='rgb(255, 255, 255)',
    zerolinewidth=1,
    ticklen=5,
    gridwidth=2,
)

layout = go.Layout(
    title='Percent of Female STEM Graduates from Tertiary Education vs Gender Gap Index',
    # Fixed axis ranges: points falling outside them are not visible.
    xaxis=dict(title='Gender Gap Index', range=[0.5, 0.9], **axis_style),
    yaxis=dict(
        title='Female Stem Graduates from Tertiary Education (%)',
        range=[10, 70],
        **axis_style
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [37]:
# Frame for the GDP vs STEM plot: 2016 GDP per capita and STEM share per country.
# GDP is rescaled by a fixed divisor before plotting.
GDP_SCALE_DIVISOR = 2000

df_2016_gdp = (
    gender_stem[['Country Name_x', '2016_GDPPerCapita', '2016_stem', 'Region']]
    .dropna()  # keep only countries with every value present
    # assign() returns a new frame, so no column is written onto a frame derived
    # from a slice of gender_stem (avoids pandas' SettingWithCopyWarning).
    .assign(**{
        '2016_GDPPerCapita': lambda frame: frame['2016_GDPPerCapita'] / GDP_SCALE_DIVISOR,
    })
)
df_2016_gdp

Unnamed: 0,Country Name_x,2016_GDPPerCapita,2016_stem,Region
1,Albania,2.065936,48.72306,Europe & Central Asia
2,United Arab Emirates,19.258900,43.46405,Middle East & North Africa
3,Argentina,6.327177,46.50121,Latin America & Caribbean
4,Armenia,1.802872,36.84380,Europe & Central Asia
6,Austria,22.365505,25.90260,Europe & Central Asia
7,Azerbaijan,1.940369,34.83515,Europe & Central Asia
8,Burundi,0.142864,19.24742,Sub-Saharan Africa
9,Belgium,20.630489,27.48167,Europe & Central Asia
12,Bangladesh,0.679390,19.77568,South Asia
13,Bulgaria,3.734724,38.26006,Europe & Central Asia


In [42]:
# Bubble scatter: scaled 2016 GDP per capita (x) vs. female share of STEM
# graduates (y), with one trace per region so every region gets its own legend
# entry.

# Regions in plotting order; the order fixes the legend order and which default
# colour each region receives.
regions = [
    'Europe & Central Asia',
    'North America',
    'Latin America & Caribbean',
    'Sub-Saharan Africa',
    'East Asia & Pacific',
    'Middle East & North Africa',
    'South Asia',
]

# Build the traces in a loop rather than seven copy-pasted blocks, filtering the
# frame once per region instead of once per plotted column.
data = []
for region in regions:
    region_rows = df_2016_gdp[df_2016_gdp['Region'] == region]
    data.append(
        go.Scatter(
            x=region_rows['2016_GDPPerCapita'],
            y=region_rows['2016_stem'],
            mode='markers',
            name=region,
            text=region_rows['Country Name_x'],  # hover label
            marker=dict(size=40, opacity=0.5, line=dict(width=2)),
        )
    )

# Keep the individual trace names available in case other cells refer to them.
trace0, trace1, trace2, trace3, trace4, trace5, trace6 = data

# Styling shared by both axes.
axis_style = dict(
    gridcolor='rgb(255, 255, 255)',
    zerolinewidth=1,
    ticklen=5,
    gridwidth=2,
)

layout = go.Layout(
    title='Percent of Female STEM Graduates from Tertiary Education vs GDP',
    # NOTE(review): the plotted x values are GDP per capita divided by 2000 (done
    # where df_2016_gdp is built), so '(2000 Dollars)' presumably means "units of
    # 2,000 dollars" -- it could be misread as constant-year-2000 dollars; confirm
    # and reword if so.
    xaxis=dict(title='GDP Per Capita (2000 Dollars)', range=[0, 100], **axis_style),
    # Fixed axis ranges: points falling outside them are not visible.
    yaxis=dict(
        title='Female Stem Graduates from Tertiary Education (%)',
        range=[10, 70],
        **axis_style
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)