<a href="https://colab.research.google.com/github/thornwishstalon/psychic-spoon/blob/main/dopp32_ex3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DOPP 32



In [144]:
import pandas as pd
import numpy as np
import matplotlib as plt
import os
import re
import seaborn as sns


In [None]:
#to map iso2 country codes to iso3 - not sure if we need this helper function at all- remains to be ssen
!pip install pycountry
import pycountry as pc

def changeIso2ToIso3(code):
    country = None
    try:
        country = pc.countries.get(alpha_2=code)
        if country is None:
            return None
        return country.alpha_3
    except LookupError:
        return None

# use like this: df['country code'] = df['country_code'].apply(lambda code: changeIso2ToIso3(code))     

In [None]:
try:
    from google.colab import drive
    drive.mount("/content/drive/", force_remount=True)
    google_drive_prefix = "/content/drive/My Drive/dopp32_ex3_data"
    data_prefix = "{}/".format(google_drive_prefix)
except ModuleNotFoundError: 
    data_prefix = "data/"

print(data_prefix)

Mounted at /content/drive/
/content/drive/My Drive/dopp32_ex3_data/


In [168]:
#HELPER   ###################################
def create_year_dict(range, type, postfix=None):
    year_dict = {}
    for i in range:
      key = str(i)
      if postfix is not None:
        key = str(i) + postfix.format(i)

      year_dict[key] =type

    return year_dict

def load_country_list():
    country_file_name = "WDICountry_whitelist.csv"
    df = pd.read_csv(data_prefix + country_file_name, index_col=0)
    
    return df.index.to_numpy()


def load_WDI():
    '''
    loads WDI data csv file
    :return: pd.DataFrame
    '''
    WDI_data_file_name = "WDIData.csv"
    df = pd.read_csv(data_prefix + WDI_data_file_name, dtype=create_year_dict(range(1960, 2020), np.float))
    df.drop(df.columns[df.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)

    df = df[df['Country Code'].isin(load_country_list())]

    return df

def load_WDI_Partials():
  sub_folder = 'data_csv/'
  directory = data_prefix+ sub_folder
  dfs = []
  for filename in os.listdir(directory):
    
    if filename.endswith(".csv"):
      print(filename)
      print(data_prefix + sub_folder + filename)
      #dtype=create_year_dict(range(1960, 2020), np.float," [YR{}]")
      use_cols, name_cols, d_types = get_name_replacements(['Country Name', 'Country Code', 'Series Name','Series Code'])   
      print(d_types)
      print(name_cols)   
      #dtype=d_types
      df = pd.read_csv(data_prefix + sub_folder + filename, na_values=['..'])
      df.columns = name_cols      
      dfs.append(df)

  return pd.concat(dfs)


def select_indicators(df, indicator_list):
  return df[df['Indicator Code'].isin(indicator_list)]

def get_name_replacements(columns=None):
  columns_replacements = columns.copy()
  d_types = {}
  if columns is None:
    columns = []
    columns_replacements = []
  for i in range(1980, 2021):
    columns.append("{} [YR{}]".format(i,i))
    columns_replacements.append(str(i))
    d_types["{} [YR{}]".format(i,i)]= 'float64'
  return columns , columns_replacements, d_types

def get_year_columns():
    columns = []
    for i in range(1980, 2021):      
      columns.append(str(i))

    return columns

def strip(year):
  p = re.compile('\[.+\]')
  year= p.sub('', year)
  return str.trim(year)

def turn_wide_into_long(wdi_data, id_vars, index ):
  wdi_data = pd.melt(wdi_data, id_vars=id_vars,
                   value_vars=get_year_columns(), var_name="Year", 
          value_name="Value")
  wdi_data[["Value", "Year"]] = wdi_data[["Value", "Year"]].apply(pd.to_numeric)

  wdi_data.set_index(index,inplace=True)
  

  return wdi_data  




#HELPER   ###################################
#END      ###################################

load all the data... in /data subfolder


In [173]:
# for partials
#wdi_data =  load_WDI_Partials()
wdi_data =  load_WDI()

#for partials use different keys
#wdi_data = turn_wide_into_long(wdi_data,['Country Code','Series Code'],['Country Code','Series Code','Year'])

wdi_data = turn_wide_into_long(wdi_data,['Country Code','Indicator Code'],['Country Code','Indicator Code','Year'])

print(wdi_data.head(25))




                                                   Value
Country Code Indicator Code           Year              
AFG          EG.CFT.ACCS.ZS           1980           NaN
             EG.ELC.ACCS.ZS           1980           NaN
             EG.ELC.ACCS.RU.ZS        1980           NaN
             EG.ELC.ACCS.UR.ZS        1980           NaN
             FX.OWN.TOTL.ZS           1980           NaN
             FX.OWN.TOTL.FE.ZS        1980           NaN
             FX.OWN.TOTL.MA.ZS        1980           NaN
             FX.OWN.TOTL.OL.ZS        1980           NaN
             FX.OWN.TOTL.40.ZS        1980           NaN
             FX.OWN.TOTL.PL.ZS        1980           NaN
             FX.OWN.TOTL.60.ZS        1980           NaN
             FX.OWN.TOTL.SO.ZS        1980           NaN
             FX.OWN.TOTL.YG.ZS        1980           NaN
             per_si_allsi.adq_pop_tot 1980           NaN
             per_allsp.adq_pop_tot    1980           NaN
             per_sa_allsa.adq_p

booya

In [175]:
# get Scientific and technical journal articles data
tmp = wdi_data.loc[wdi_data['Indicator Code'] == 'IP.JRN.ARTC.SC']
sns.lineplot(x="Year", y="Value",
             hue="Country Code",
             data=tmp)

KeyError: ignored