In [1]:
import pandas as pd
import os 

current_dir = os.getcwd() 
relative_path=os.path.abspath(os.path.join(current_dir, os.pardir))

Obtain the data

In [2]:

country_data = pd.DataFrame(pd.read_csv(
    relative_path+"\\Web_Scraping\\list_of_country.csv",usecols=[1],names=['country']))


In [3]:
salary_data = pd.DataFrame(pd.read_csv(
    relative_path+"\\Web_Scraping\\salary_data.csv",usecols=[1],names=['salary']))

In [4]:
country_data=country_data.iloc[1: , :]
salary_data=salary_data.iloc[1: , :]

In [5]:
df_salary=pd.concat([country_data,salary_data],axis=1)

In [6]:
df_salary

Unnamed: 0,country,salary
1,Argentina,"1,503,900 $"
2,Australia,"111,100 $"
3,Austria,"53,200 €"
4,Belarus,"952,071,300 Br"
5,Belgium,"59,500 €"
...,...,...
114,Tunisia,"97,600 د.ت"
115,Turkey,"127,400 ₤"
116,Ukraine,"717,100 ₴"
117,United Kingdom,"47,200 £"


Data seems to doubled. The extra part is dropped.

In [7]:
df_salary.drop(df_salary.tail(59).index,
               inplace=True)

In [8]:
df_salary

Unnamed: 0,country,salary
1,Argentina,"1,503,900 $"
2,Australia,"111,100 $"
3,Austria,"53,200 €"
4,Belarus,"952,071,300 Br"
5,Belgium,"59,500 €"
6,Bosnia and Herzegovina,"50,900 КМ"
7,Brazil,"227,000 R$"
8,Bulgaria,"57,100 лв"
9,Canada,"86,300 $"
10,Chile,"27,384,400 $"


Observe the currency. It is found out that most of the Europe country use euro dollar as indicator instead of their own money. Europe country like Denmark, Switzerland, Sweden, etc use their own money sign. Sign of € used to filter out the Europe country that assigned by Euro.

In [9]:
salary_euro = df_salary[df_salary['salary'].str.contains('€')]

In [10]:
other_salary=df_salary[~df_salary['salary'].str.contains('€')]

If want to use country code instead, code below can be used

In [11]:
# import pycountry
# country_df=pd.DataFrame(list(pycountry.countries))
# country_df.rename(columns = {0:'info'}, inplace = True)
# for i in range(len(other_salary)):
#     for j in range(len(country_df)):
#         if str(other_salary.loc[i,'country']) in str(country_df.loc[j,'info']):
#             other_salary.loc[i,'c_info']=country_df.loc[j,'info']
# other_salary


Get currency code that correspond to country name

In [12]:
from countryinfo import CountryInfo

In [13]:
# for i in range(len(other_salary)):
#     other_salary.loc[i, 'currency_code'] = str(CountryInfo(
#         other_salary.loc[i, 'country']).currencies().pop)

In [14]:
other_salary=other_salary.reset_index(drop='index')

In [15]:
for i in range(len(other_salary)):
    other_salary.loc[i, 'currency_code'] = str(CountryInfo(
        other_salary.loc[i, 'country']).currencies())

In [16]:
other_salary

Unnamed: 0,country,salary,currency_code
0,Argentina,"1,503,900 $",['ARS']
1,Australia,"111,100 $",['AUD']
2,Belarus,"952,071,300 Br",['BYR']
3,Bosnia and Herzegovina,"50,900 КМ",['BAM']
4,Brazil,"227,000 R$",['BRL']
5,Bulgaria,"57,100 лв",['BGN']
6,Canada,"86,300 $",['CAD']
7,Chile,"27,384,400 $","['CLF', 'CLP']"
8,China,"235,000 ¥",['CNY']
9,Colombia,"41,831,600 $",['COP']


Investigate which currency code to be used for country with multiple currency code. There are three countries with multiple currency code which are Childe, Switzerland and United States. Based on currency sign, CLP will used for Childe, CHF for Switzerland, USD for United States.

In [17]:
other_salary = other_salary.replace({'currency_code': "'"}, 
    {'currency_code': ''}, regex=True)
other_salary = other_salary.replace({'currency_code': "\["}, 
    {'currency_code': ''}, regex=True)
other_salary = other_salary.replace({'currency_code': "\]"}, 
    {'currency_code': ''}, regex=True)

In [18]:
other_salary.loc[7,'currency_code']='CLP'
other_salary.loc[35,'currency_code']='CHF'
other_salary.loc[41,'currency_code']='USD'

In [19]:
other_salary['salary'] = other_salary['salary'].str.split(' ').str[0]

In [20]:
other_salary = other_salary.replace({'salary': ","}, 
    {'salary': ''}, regex=True)

In [21]:
other_salary.astype({'salary': 'int64'}).dtypes

country          object
salary            int64
currency_code    object
dtype: object

In [22]:
other_salary

Unnamed: 0,country,salary,currency_code
0,Argentina,1503900,ARS
1,Australia,111100,AUD
2,Belarus,952071300,BYR
3,Bosnia and Herzegovina,50900,BAM
4,Brazil,227000,BRL
5,Bulgaria,57100,BGN
6,Canada,86300,CAD
7,Chile,27384400,CLP
8,China,235000,CNY
9,Colombia,41831600,COP


In [23]:
other_salary.to_csv(relative_path+"\\Web_Scraping\\currency_code.csv")

In [24]:
from forex_python.converter import CurrencyRates

In [25]:
cr = CurrencyRates()

In [26]:
for j in range(len(other_salary)):
    try:
        other_salary.loc[j,'salary_in_usd'] = cr.convert(
        other_salary.loc[j, 'currency_code'], 'USD', float(other_salary.loc[j, 'salary'])),
    except:
        pass 

Only certain conversion rate available

In [27]:
other_salary

Unnamed: 0,country,salary,currency_code,salary_in_usd
0,Argentina,1503900,ARS,
1,Australia,111100,AUD,
2,Belarus,952071300,BYR,
3,Bosnia and Herzegovina,50900,BAM,
4,Brazil,227000,BRL,
5,Bulgaria,57100,BGN,
6,Canada,86300,CAD,
7,Chile,27384400,CLP,
8,China,235000,CNY,
9,Colombia,41831600,COP,


Now is the turn for countries assigned EUR

In [28]:
salary_euro=salary_euro.reset_index(drop='index')

In [29]:
salary_euro['salary'] = salary_euro['salary'].str.split(' ').str[0]

In [30]:
salary_euro = salary_euro.replace({'salary': ","}, 
    {'salary': ''}, regex=True)

In [31]:
for j in range(len(salary_euro)):
    try:
        salary_euro.loc[j,'salary_in_usd'] = cr.convert(
        'EUR', 'USD', float(salary_euro.loc[j, 'salary'])),
    except:
        pass 

In [32]:
salary_euro

Unnamed: 0,country,salary
0,Austria,53200
1,Belgium,59500
2,Estonia,41300
3,Finland,60900
4,France,54400
5,Germany,63400
6,Greece,55200
7,Ireland,61500
8,Italy,63600
9,Latvia,39100


In [33]:
converted_salary=pd.concat([other_salary,salary_euro])

In [34]:
converted_salary.to_csv(relative_path+"\\get_conversion_rate\\converted_salary.csv")


In [35]:
converted_salary

Unnamed: 0,country,salary,currency_code,salary_in_usd
0,Argentina,1503900,ARS,
1,Australia,111100,AUD,
2,Belarus,952071300,BYR,
3,Bosnia and Herzegovina,50900,BAM,
4,Brazil,227000,BRL,
5,Bulgaria,57100,BGN,
6,Canada,86300,CAD,
7,Chile,27384400,CLP,
8,China,235000,CNY,
9,Colombia,41831600,COP,
