In [181]:
import pandas as pd
import numpy as np
import json

## Converting dataframes into JSON files for loading into MongoDB
---
<b>Tables Checklist: </b>
- retail price
- growers received
- total production
- domestic consumption
- import
- re-export
- export
---
<b>Other things that need to be checked</b>
1. Country names in each table are the same as those listed in the map JSON file
2. Group the countries in growers received (average across all coffee types)


## Read the worldmap JSON file and create a list of all the country names

In [182]:
# Load the world-map JSON file. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding (which differs
# between operating systems and can mangle non-ASCII country names).
with open("../Dataset/worldmap.json", "r", encoding="utf-8") as f:
    worldmap = json.load(f)

# Every entry under objects -> countries1 -> geometries describes one country.
country_stored_list = worldmap["objects"]["countries1"]["geometries"]

# Country names exactly as the map file spells them.
country_list = [country["properties"]["name"] for country in country_stored_list]

# This list is the reference used to check whether the country names in each
# table match the names in the map JSON file.
country_list

['Afghanistan',
 'Angola',
 'Albania',
 'United Arab Emirates',
 'Argentina',
 'Armenia',
 'Antarctica',
 'French Southern and Antarctic Lands',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Burundi',
 'Belgium',
 'Benin',
 'Burkina Faso',
 'Bangladesh',
 'Bulgaria',
 'The Bahamas',
 'Bosnia and Herzegovina',
 'Belarus',
 'Belize',
 'Bermuda',
 'Bolivia',
 'Brazil',
 'Brunei',
 'Bhutan',
 'Botswana',
 'Central African Republic',
 'Canada',
 'Switzerland',
 'Chile',
 'China',
 'Ivory Coast',
 'Cameroon',
 'Democratic Republic of the Congo',
 'Republic of the Congo',
 'Colombia',
 'Costa Rica',
 'Cuba',
 'Northern Cyprus',
 'Cyprus',
 'Czech Republic',
 'Germany',
 'Djibouti',
 'Denmark',
 'Dominican Republic',
 'Algeria',
 'Ecuador',
 'Egypt',
 'Eritrea',
 'Spain',
 'Estonia',
 'Ethiopia',
 'Finland',
 'Fiji',
 'Falkland Islands',
 'France',
 'Gabon',
 'United Kingdom',
 'Georgia',
 'Ghana',
 'Guinea',
 'Gambia',
 'Guinea Bissau',
 'Equatorial Guinea',
 'Greece',
 'Greenland',
 'Guatemala'

### Retail Price

In [183]:
# Load the retail price table (one row per country, one column per year)
# and preview its first rows.
retail_price_csv = "../Dataset/international_coffee_organization/Price_data/Coffee_retail_price.csv"
retail_price_df = pd.read_csv(retail_price_csv)
retail_price_df.head()

Unnamed: 0,Country (US$/lb),1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Austria,4.9,4.57,4.99,4.97,4.58,5.18,5.42,4.37,3.97,...,6.69,8.32,8.38,8.62,8.63,7.44,5.64,6.22,6.63,6.16607
1,Belgium,3.27,2.92,3.05,2.78,3.42,4.88,4.04,3.98,4.19,...,5.18,6.48,6.2,6.24,,,,,,
2,Bulgaria,,,,,,,,,,...,3.62,4.16,3.44,3.59,3.75,3.45,3.79,4.64,4.58,4.208544
3,Cyprus,2.83,2.8,2.87,2.6,3.18,4.26,4.13,4.04,4.61,...,5.21,6.06,6.36,6.47,6.48,5.3,5.3,5.5,5.79,5.393971
4,Czech Republic,,,,,,,,4.31,5.1,...,5.01,6.58,7.13,7.0,6.61,5.75,6.14,6.58,7.27,6.697826


In [184]:
# Sort the table's country names into two lists: names that also appear in
# the map JSON file (match) and names that do not (not_match).
match, not_match = [], []

for name in retail_price_df["Country (US$/lb)"]:
    bucket = match if name in country_list else not_match
    bucket.append(name)
print(f"match: {match}\n not match: {not_match}")

match: ['Austria', 'Belgium', 'Bulgaria', 'Cyprus', 'Czech Republic', 'Denmark', 'Finland', 'France', 'Germany', 'Hungary', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Japan', 'Norway', 'Russia', 'Switzerland', 'United Kingdom', 'United States of America']
 not match: []


In [185]:
# Inspect the dtype and non-null count of every column.
retail_price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 31 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country (US$/lb)  28 non-null     object 
 1   1990              17 non-null     float64
 2   1991              17 non-null     float64
 3   1992              17 non-null     float64
 4   1993              18 non-null     float64
 5   1994              19 non-null     float64
 6   1995              21 non-null     float64
 7   1996              21 non-null     float64
 8   1997              24 non-null     float64
 9   1998              24 non-null     float64
 10  1999              26 non-null     float64
 11  2000              26 non-null     float64
 12  2001              26 non-null     float64
 13  2002              27 non-null     float64
 14  2003              27 non-null     float64
 15  2004              27 non-null     float64
 16  2005              27 non-null     float64
 17 

### Growers received

In [186]:
# Load the prices-paid-to-growers table and inspect its column dtypes.
# The file name below contains a space ("Prices_to _Growers.csv"); it is kept
# exactly as written so the path still resolves.
grower_received_csv = "../Dataset/international_coffee_organization/Price_data/Prices_to _Growers.csv"
grower_received_df = pd.read_csv(grower_received_csv)
grower_received_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country (US cents/lb)  65 non-null     object 
 1   coffee types           65 non-null     object 
 2   1990                   51 non-null     float64
 3   1991                   49 non-null     float64
 4   1992                   52 non-null     float64
 5   1993                   51 non-null     float64
 6   1994                   51 non-null     float64
 7   1995                   52 non-null     float64
 8   1996                   49 non-null     float64
 9   1997                   49 non-null     float64
 10  1998                   49 non-null     float64
 11  1999                   45 non-null     float64
 12  2000                   46 non-null     float64
 13  2001                   44 non-null     float64
 14  2002                   45 non-null     float64
 15  2003    

In [187]:
# Sort the table's country names into two lists: names that also appear in
# the map JSON file (match) and names that do not (not_match).
match, not_match = [], []

for name in grower_received_df["Country (US cents/lb)"]:
    bucket = match if name in country_list else not_match
    bucket.append(name)
print(f"match: {match}\n not match: {not_match}")

match: ['Colombia', 'Kenya', 'United Republic of Tanzania', 'Bolivia', 'Burundi', 'Cameroon', 'Costa Rica', 'Cuba', 'Democratic Republic of the Congo', 'Dominican Republic', 'Ecuador', 'El Salvador', 'Guatemala', 'Haiti', 'Honduras', 'India', 'Jamaica', 'Madagascar', 'Malawi', 'Mexico', 'Nicaragua', 'Panama', 'Papua New Guinea', 'Peru', 'Rwanda', 'Sri Lanka', 'Uganda', 'Venezuela', 'Zambia', 'Zimbabwe', 'Angola', 'Brazil', 'Ethiopia', 'Indonesia', 'Philippines', 'Thailand', 'Vietnam', 'Angola', 'Benin', 'Brazil', 'Burundi', 'Cameroon', 'Central African Republic', 'Republic of the Congo', 'Ivory Coast', 'Democratic Republic of the Congo', 'Ecuador', 'Gabon', 'Ghana', 'Guinea', 'India', 'Indonesia', 'Liberia', 'Madagascar', 'Nigeria', 'Papua New Guinea', 'Philippines', 'Sierra Leone', 'Sri Lanka', 'United Republic of Tanzania', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Uganda', 'Vietnam']
 not match: []


In [188]:
# Group by country and average the yearly prices over the different coffee
# types, giving one row per country.
# numeric_only=True keeps the text column "coffee types" out of the mean:
# pandas >= 2.0 raises a TypeError for it instead of silently dropping it.
grower_received_grp = (
    grower_received_df
    .groupby("Country (US cents/lb)")
    .mean(numeric_only=True)
    .reset_index()
)
grower_received_grp.head()


Unnamed: 0,Country (US cents/lb),1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Angola,85.6569,91.1153,44.9183,11.8708,,,,,,...,44.3889,48.2936,52.2644,34.50765,50.3799,41.99435,89.3944,145.80605,110.47815,
1,Benin,53.7609,22.8961,24.0067,,,,,,,...,,,,,,,,,,
2,Bolivia,51.7802,52.423,43.5276,27.3283,79.915,76.9867,57.1799,100.0689,131.0938,...,,,225.6445,201.9723,239.2465,234.7937,271.7944,471.6169,455.146,858.2941
3,Brazil,44.5867,35.7439,36.3968,44.21055,99.9974,106.51135,83.06235,111.63265,91.4982,...,104.10645,164.7769,128.34625,93.71205,107.9303,89.00675,100.30415,104.34355,79.10075,68.9681
4,Burundi,58.0195,55.0206,53.1979,50.0363,51.0661,49.5105,46.2112,46.4859,43.7877,...,85.6928,,,,,,,,,


### Total Production

In [189]:
# Load the total production table and give its columns clean names.
production_df = pd.read_csv("../Dataset/international_coffee_organization/Supply/Total_production.csv")

# Year labels "1990" .. "2019" are generated rather than typed by hand: the
# hand-written list contained " 2003" and " 2005" with a leading space, which
# would otherwise end up as keys in the exported JSON.
year_columns = [str(year) for year in range(1990, 2020)]
production_df.columns = ["Country (In thousand 60kg bags)", "Species growing", *year_columns]
production_df.head()

Unnamed: 0,Country (In thousand 60kg bags),Species growing,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Angola,R A,50.345,79.331,77.52,32.608,76.802,62.109,70.925,64.33,...,34.97,28.715,32.79,34.935,39.405,40.515,44.83,35.006,41.9027,51.8419
1,Bolivia,A,122.777,103.536,120.235,50.823,116.944,142.485,124.579,140.719,...,117.2249,131.8354,105.2812,119.9122,99.8766,84.2191,77.9835,83.8112,82.5687,81.2654
2,Brazil,A R,27285.6286,27293.4934,34603.3542,28166.9786,28192.047,18060.2022,29196.743,26148.004,...,55428.4102,48591.8289,55418.0012,54688.9664,53304.7669,52870.5876,56788.1784,52739.8179,65130.7997,58210.7127
3,Burundi,A R,487.393,667.199,620.238,393.354,664.143,433.98,400.969,249.785,...,352.9776,204.1328,405.9615,163.2177,247.55,269.4576,196.4128,202.1079,204.3391,272.4688
4,Ecuador,A R,1503.815,2123.824,1185.48,2069.007,2375.766,1888.233,1992.914,1190.663,...,853.9798,825.4144,828.1024,665.545,644.0112,644.4926,644.8845,623.5744,496.0001,558.9352


In [190]:
# Sort the table's country names into two lists: names that also appear in
# the map JSON file (match) and names that do not (not_match).
match, not_match = [], []

for name in production_df["Country (In thousand 60kg bags)"]:
    bucket = match if name in country_list else not_match
    bucket.append(name)
print(f"match: {match}\n not match: {not_match}")

match: ['Angola', 'Bolivia', 'Brazil', 'Burundi', 'Ecuador', 'Indonesia', 'Madagascar', 'Malawi', 'Papua New Guinea', 'Paraguay', 'Peru', 'Rwanda', 'East Timor', 'Zimbabwe', 'Republic of the Congo', 'Cuba', 'Dominican Republic', 'Haiti', 'Philippines', 'United Republic of Tanzania', 'Zambia', 'Cameroon', 'Central African Republic', 'Colombia', 'Costa Rica', 'Ivory Coast', 'Democratic Republic of the Congo', 'El Salvador', 'Equatorial Guinea', 'Ethiopia', 'Gabon', 'Ghana', 'Guatemala', 'Guinea', 'Guyana', 'Honduras', 'India', 'Jamaica', 'Kenya', 'Laos', 'Liberia', 'Mexico', 'Nepal', 'Nicaragua', 'Nigeria', 'Panama', 'Sierra Leone', 'Sri Lanka', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Uganda', 'Venezuela', 'Vietnam', 'Yemen']
 not match: []


In [191]:
# Inspect the dtype and non-null count of every column.
production_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 32 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country (In thousand 60kg bags)  55 non-null     object 
 1   Species growing                  55 non-null     object 
 2   1990                             55 non-null     float64
 3   1991                             55 non-null     float64
 4   1992                             55 non-null     float64
 5   1993                             55 non-null     float64
 6   1994                             55 non-null     float64
 7   1995                             55 non-null     float64
 8   1996                             55 non-null     float64
 9   1997                             55 non-null     float64
 10  1998                             55 non-null     float64
 11  1999                             55 non-null     float64
 12  2000                    

In [192]:
# Expand the species codes: a standalone "A" becomes "Arabica" and a
# standalone "R" becomes "Robusta".
# The word-boundary patterns only match a code that stands on its own, so
# re-running this cell leaves already expanded values untouched (a plain
# "A" -> "Arabica" substitution would turn "Arabica" into "Arabicarabica").
production_df["Species growing"] = (
    production_df["Species growing"]
    .str.replace(r"\bA\b", "Arabica", regex=True)
    .str.replace(r"\bR\b", "Robusta", regex=True)
)
production_df.head()

Unnamed: 0,Country (In thousand 60kg bags),Species growing,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Angola,Robusta Arabica,50.345,79.331,77.52,32.608,76.802,62.109,70.925,64.33,...,34.97,28.715,32.79,34.935,39.405,40.515,44.83,35.006,41.9027,51.8419
1,Bolivia,Arabica,122.777,103.536,120.235,50.823,116.944,142.485,124.579,140.719,...,117.2249,131.8354,105.2812,119.9122,99.8766,84.2191,77.9835,83.8112,82.5687,81.2654
2,Brazil,Arabica Robusta,27285.6286,27293.4934,34603.3542,28166.9786,28192.047,18060.2022,29196.743,26148.004,...,55428.4102,48591.8289,55418.0012,54688.9664,53304.7669,52870.5876,56788.1784,52739.8179,65130.7997,58210.7127
3,Burundi,Arabica Robusta,487.393,667.199,620.238,393.354,664.143,433.98,400.969,249.785,...,352.9776,204.1328,405.9615,163.2177,247.55,269.4576,196.4128,202.1079,204.3391,272.4688
4,Ecuador,Arabica Robusta,1503.815,2123.824,1185.48,2069.007,2375.766,1888.233,1992.914,1190.663,...,853.9798,825.4144,828.1024,665.545,644.0112,644.4926,644.8845,623.5744,496.0001,558.9352


### Domestic Consumption

In [193]:
# Load the domestic consumption table and give its columns clean names.
domestic_consumption_df = pd.read_csv("../Dataset/international_coffee_organization/Supply/Domestic_consumption.csv")

# Year labels "1990" .. "2019" are generated rather than typed by hand: the
# hand-written list contained " 2003" and " 2005" with a leading space.
year_columns = [str(year) for year in range(1990, 2020)]
domestic_consumption_df.columns = ["Country (In thousand 60kg bags)", "Species growing", *year_columns]
domestic_consumption_df.head()

Unnamed: 0,Country (In thousand 60kg bags),Species growing,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Angola,R A,20.0,30.0,35.0,20.0,25.0,10.0,20.0,40.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
1,Bolivia,A,25.0,27.0,27.5,28.5,29.5,30.5,31.5,32.5,...,47.5,49.0,50.5,52.0,53.5,55.0,57.0,58.5,60.0,61.0
2,Brazil,A R,8200.0,8500.0,8900.0,9100.0,9300.0,10100.0,11000.0,11500.0,...,19132.0,19720.0,20330.0,20085.0,20333.0,20508.0,21225.0,21997.0,22200.0,22000.0
3,Burundi,A R,2.0,1.6,1.7,1.91,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,Ecuador,A R,350.0,350.0,350.0,350.0,350.0,350.0,300.0,300.0,...,150.0,150.0,150.0,155.0,155.0,155.0,155.0,155.0,155.0,149.0


In [194]:
# Sort the table's country names into two lists: names that also appear in
# the map JSON file (match) and names that do not (not_match).
match, not_match = [], []

for name in domestic_consumption_df["Country (In thousand 60kg bags)"]:
    bucket = match if name in country_list else not_match
    bucket.append(name)
print(f"match: {match}\n not match: {not_match}")

match: ['Angola', 'Bolivia', 'Brazil', 'Burundi', 'Ecuador', 'Indonesia', 'Madagascar', 'Malawi', 'Papua New Guinea', 'Paraguay', 'Peru', 'Rwanda', 'East Timor', 'Zimbabwe', 'Republic of the Congo', 'Cuba', 'Dominican Republic', 'Haiti', 'Philippines', 'United Republic of Tanzania', 'Zambia', 'Cameroon', 'Central African Republic', 'Colombia', 'Costa Rica', 'Ivory Coast', 'Democratic Republic of the Congo', 'El Salvador', 'Equatorial Guinea', 'Ethiopia', 'Gabon', 'Ghana', 'Guatemala', 'Guinea', 'Guyana', 'Honduras', 'India', 'Jamaica', 'Kenya', 'Laos', 'Liberia', 'Mexico', 'Nepal', 'Nicaragua', 'Nigeria', 'Panama', 'Sierra Leone', 'Sri Lanka', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Uganda', 'Venezuela', 'Vietnam', 'Yemen']
 not match: []


In [195]:
# Inspect the dtype and non-null count of every column.
domestic_consumption_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 32 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country (In thousand 60kg bags)  55 non-null     object 
 1   Species growing                  55 non-null     object 
 2   1990                             55 non-null     float64
 3   1991                             55 non-null     float64
 4   1992                             55 non-null     float64
 5   1993                             55 non-null     float64
 6   1994                             55 non-null     float64
 7   1995                             55 non-null     float64
 8   1996                             55 non-null     float64
 9   1997                             55 non-null     float64
 10  1998                             55 non-null     float64
 11  1999                             55 non-null     float64
 12  2000                    

In [196]:
# Expand the species codes: a standalone "A" becomes "Arabica" and a
# standalone "R" becomes "Robusta".
# The word-boundary patterns only match a code that stands on its own, so
# re-running this cell leaves already expanded values untouched (a plain
# "A" -> "Arabica" substitution would turn "Arabica" into "Arabicarabica").
domestic_consumption_df["Species growing"] = (
    domestic_consumption_df["Species growing"]
    .str.replace(r"\bA\b", "Arabica", regex=True)
    .str.replace(r"\bR\b", "Robusta", regex=True)
)
domestic_consumption_df.head()

Unnamed: 0,Country (In thousand 60kg bags),Species growing,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Angola,Robusta Arabica,20.0,30.0,35.0,20.0,25.0,10.0,20.0,40.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
1,Bolivia,Arabica,25.0,27.0,27.5,28.5,29.5,30.5,31.5,32.5,...,47.5,49.0,50.5,52.0,53.5,55.0,57.0,58.5,60.0,61.0
2,Brazil,Arabica Robusta,8200.0,8500.0,8900.0,9100.0,9300.0,10100.0,11000.0,11500.0,...,19132.0,19720.0,20330.0,20085.0,20333.0,20508.0,21225.0,21997.0,22200.0,22000.0
3,Burundi,Arabica Robusta,2.0,1.6,1.7,1.91,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,Ecuador,Arabica Robusta,350.0,350.0,350.0,350.0,350.0,350.0,300.0,300.0,...,150.0,150.0,150.0,155.0,155.0,155.0,155.0,155.0,155.0,149.0


#### Import 

In [197]:
# Load the imports table and preview its first rows.
import_csv = "../Dataset/international_coffee_organization/Trade/all_import.csv"
import_df = pd.read_csv(import_csv)
import_df.head()

Unnamed: 0,Country (In thousand 60-kg bags),1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Algeria,1058.887,1782.264,862.266,1470.025,1697.675,897.735,1229.068,1461.023,475.395,...,2021.446,1942.379,2116.647,2124.726,2153.974,2159.12,2323.13,2190.164,1819.133,2259.792
1,Benin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.78,...,8.087,12.718,7.674,6.309,10.64,15.769,11.973,15.57,22.396,12.339
2,Botswana,12.974,11.536,18.099,22.07,20.425,9.249,26.427,13.261,29.537,...,22.017,23.347,30.7,22.127,20.76,32.991,23.295,21.018,5.776,17.567
3,Burkina Faso,5.534,7.054,1.893,7.071,8.431,13.125,11.419,12.01,16.345,...,60.257,70.139,65.029,78.134,82.762,127.51,91.64,106.9,126.316,104.666
4,Cabo Verde,1.67,7.119,17.5,9.968,8.026,12.591,13.053,7.706,8.742,...,11.342,10.558,12.326,11.682,11.226,11.107,12.259,13.529,13.943,11.236


In [198]:
# Sort the table's country names into two lists: names that also appear in
# the map JSON file (match) and names that do not (not_match).
match, not_match = [], []

for name in import_df["Country (In thousand 60-kg bags)"]:
    bucket = match if name in country_list else not_match
    bucket.append(name)
print(f"match: {match}\n not match: {not_match}")

match: ['Algeria', 'Benin', 'Botswana', 'Burkina Faso', 'Chad', 'Djibouti', 'Egypt', 'Gambia', 'Lesotho', 'Libya', 'Mali', 'Mauritania', 'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Senegal', 'Somalia', 'South Africa', 'Sudan', 'Afghanistan', 'Armenia', 'Australia', 'Azerbaijan', 'Bangladesh', 'Cambodia', 'China', 'China', 'China', 'North Korea', 'Fiji', 'Iran', 'Iraq', 'Israel', 'Jordan', 'Kazakhstan', 'Kuwait', 'Kyrgyzstan', 'Lebanon', 'Malaysia', 'Mongolia', 'Myanmar', 'New Zealand', 'Oman', 'Pakistan', 'Qatar', 'South Korea', 'Saudi Arabia', 'Solomon Islands', 'Taiwan', 'Tajikistan', 'Turkey', 'United Arab Emirates', 'Uzbekistan', 'Vanuatu', 'Netherlands', 'Puerto Rico', 'Belize', 'Bermuda', 'Albania', 'Belarus', 'Bosnia and Herzegovina', 'Georgia', 'Iceland', 'Montenegro', 'Macedonia', 'Moldova', 'Republic of Serbia', 'Ukraine', 'Canada', 'Argentina', 'Chile', 'Suriname', 'Uruguay', 'Austria', 'Belgium', 'Luxembourg', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 

#### Some of the countries are not in the map, so they were skipped, but they will still be stored in the database

In [199]:
# Remove the "," characters from every text cell so the year columns can be
# converted to floats afterwards.
# The .str accessor only works on text columns, so numeric columns are
# skipped: a column pandas already parsed as a number, or a re-run of this
# cell after the float conversion, would otherwise raise AttributeError.
for column in import_df.columns:
    if pd.api.types.is_numeric_dtype(import_df[column]):
        continue
    import_df[column] = import_df[column].str.replace(",", "", regex=False)

import_df

Unnamed: 0,Country (In thousand 60-kg bags),1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Algeria,1058.887,1782.264,862.266,1470.025,1697.675,897.735,1229.068,1461.023,475.395,...,2021.446,1942.379,2116.647,2124.726,2153.974,2159.120,2323.130,2190.164,1819.133,2259.792
1,Benin,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.780,...,8.087,12.718,7.674,6.309,10.640,15.769,11.973,15.570,22.396,12.339
2,Botswana,12.974,11.536,18.099,22.070,20.425,9.249,26.427,13.261,29.537,...,22.017,23.347,30.700,22.127,20.760,32.991,23.295,21.018,5.776,17.567
3,Burkina Faso,5.534,7.054,1.893,7.071,8.431,13.125,11.419,12.010,16.345,...,60.257,70.139,65.029,78.134,82.762,127.510,91.640,106.900,126.316,104.666
4,Cabo Verde,1.670,7.119,17.500,9.968,8.026,12.591,13.053,7.706,8.742,...,11.342,10.558,12.326,11.682,11.226,11.107,12.259,13.529,13.943,11.236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,Russia,0.000,0.000,1381.858,1794.946,1725.730,1728.144,1127.972,2185.288,1731.918,...,4155.257,4217.683,4174.745,4410.193,4746.950,4710.038,5232.881,5467.783,5287.542,5916.633
139,Switzerland,1170.508,1130.292,1101.251,1059.878,1099.406,1016.829,1111.627,970.772,1111.835,...,2317.606,2497.914,2477.957,2666.751,2643.106,2747.859,2816.026,2903.618,3086.019,3228.383
140,Tunisia,101.000,90.201,110.583,100.624,137.267,136.783,143.650,164.617,147.417,...,304.090,428.782,438.924,438.967,460.028,458.707,465.553,546.156,532.967,508.332
141,United Kingdom,2898.298,2805.914,3088.172,3196.981,3465.860,2806.723,2903.453,2929.338,3142.408,...,4301.908,4183.230,4126.054,4206.091,4318.696,4895.407,5052.293,4989.038,5687.860,5554.364


In [200]:
# Cast every column except the country name to float.
country_column = "Country (In thousand 60-kg bags)"
dic_columns_type = {
    column: "float"
    for column in import_df.columns
    if column != country_column
}

import_df = import_df.astype(dic_columns_type)
import_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 31 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country (In thousand 60-kg bags)  143 non-null    object 
 1   1990                              137 non-null    float64
 2   1991                              137 non-null    float64
 3   1992                              141 non-null    float64
 4   1993                              141 non-null    float64
 5   1994                              141 non-null    float64
 6   1995                              141 non-null    float64
 7   1996                              141 non-null    float64
 8   1997                              141 non-null    float64
 9   1998                              141 non-null    float64
 10  1999                              142 non-null    float64
 11  2000                              142 non-null    float64
 12  2001    

In [201]:
# Collapse rows that share a country name into a single row per country by
# taking the mean of each year column.
# NOTE(review): these are import volumes - confirm that averaging duplicate
# country rows (rather than summing them) is the intended aggregation.
import_by_country = import_df.groupby("Country (In thousand 60-kg bags)")
import_grp = import_by_country.mean().reset_index()
import_grp

Unnamed: 0,Country (In thousand 60-kg bags),1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,0.250,0.250,0.250,0.250,0.250,0.250,0.867,2.900,1.000,...,9.665,1.040,4.637,3.033,7.792,11.539,10.681,34.357,38.556,8.158
1,Albania,28.417,9.283,33.333,59.685,67.802,31.302,11.275,1.651,6.248,...,113.653,116.812,123.346,131.379,123.046,143.464,154.180,154.907,125.143,144.686
2,Algeria,1058.887,1782.264,862.266,1470.025,1697.675,897.735,1229.068,1461.023,475.395,...,2021.446,1942.379,2116.647,2124.726,2153.974,2159.120,2323.130,2190.164,1819.133,2259.792
3,Anguilla,0.009,0.055,0.079,0.330,0.300,0.037,0.032,0.349,0.841,...,0.981,0.390,0.506,0.400,0.494,0.319,0.315,0.215,0.359,0.354
4,Antigua and Barbuda,0.718,0.602,0.303,0.261,0.321,0.332,0.107,1.103,0.170,...,1.950,3.443,3.459,2.165,2.090,2.833,1.862,4.492,4.445,5.282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,United States of America,21006.786,19839.630,22939.033,19328.721,16171.245,17106.857,19444.533,20342.746,21030.401,...,24378.013,26093.395,26056.163,27015.540,27565.498,27707.664,28837.838,29487.886,28918.207,30854.073
135,Uruguay,42.261,30.191,45.544,32.313,39.978,38.922,42.059,41.978,47.829,...,61.693,63.086,59.816,59.983,59.113,59.956,55.148,56.313,56.841,56.628
136,Uzbekistan,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,74.823,119.627
137,Vanuatu,0.411,0.269,0.578,0.352,0.454,0.318,0.417,0.530,0.211,...,1.568,1.558,0.941,1.778,1.580,1.483,1.615,3.637,6.124,2.180


#### Re-export

In [202]:
# Load the re-exports table and preview its first rows.
reexport_csv = "../Dataset/international_coffee_organization/Trade/all_reexport.csv"
reexport_df = pd.read_csv(reexport_csv)
reexport_df.head()

Unnamed: 0,Country (In thousand 60-kg bags),1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Austria,414.5930184,756.1850347,960.0820413,535.4880203,380.9950083,228.6660055,162.6780051,222.1238083,209.6159091,...,463.3459702,365.1472623,303.589908,310.8708473,386.3713576,377.5517359,307.6914777,308.2931411,257.4135488,274.1619791
1,Belgium,,,,,,,,,,...,5053.064814,4893.973633,4753.595285,4256.978879,3952.812466,4459.903535,4550.04536,4377.324264,4377.326748,5013.520188
2,Luxembourg,890.8440337,1038.068039,1131.478041,1273.823045,1304.258046,1328.483041,1487.186551,1658.115555,2270.271439,...,,,,,,,,,,
3,Bulgaria,166.4,183.459,7.3530002,70.4880001,109.9040006,122.0100008,25.685,19.2360001,17.9619999,...,133.0140903,121.9904582,183.8838149,190.8798212,244.4727217,275.4994882,326.8140544,291.8707366,311.5245159,361.8300649
4,Croatia,,,33.6450014,30.8840013,29.5880012,31.6510013,17.367,23.9010009,19.321001,...,23.9868757,23.7208153,24.1262407,26.2867961,51.2135581,62.5539035,99.0814808,77.9684384,85.5762211,85.9117545


In [203]:
# Sort the table's country names into two lists: names that also appear in
# the map JSON file (match) and names that do not (not_match).
match, not_match = [], []

for name in reexport_df["Country (In thousand 60-kg bags)"]:
    bucket = match if name in country_list else not_match
    bucket.append(name)
print(f"match: {match}\n not match: {not_match}")

match: ['Austria', 'Belgium', 'Luxembourg', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Japan', 'Norway', 'Russia', 'Switzerland', 'Tunisia', 'United Kingdom', 'United States of America', 'Algeria', 'Botswana', 'Burkina Faso', 'Djibouti', 'Egypt', 'Gambia', 'Lesotho', 'Mali', 'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Senegal', 'Somalia', 'South Africa', 'Sudan', 'Armenia', 'Australia', 'Azerbaijan', 'Bangladesh', 'Cambodia', 'China', 'China', 'China', 'Fiji', 'Iran', 'Iraq', 'Israel', 'Jordan', 'Kazakhstan', 'Kuwait', 'Kyrgyzstan', 'Lebanon', 'Malaysia', 'Mongolia', 'Myanmar', 'New Zealand', 'Oman', 'Pakistan', 'Qatar', 'South Korea', 'Saudi Arabia', 'Solomon Islands', 'Taiwan', 'Turkey', 'United Arab Emirates', 'Uzbekistan', 'Vanuatu', 'Puerto R

In [99]:
# Inspect the dtype and non-null count of every column.
reexport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 31 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country (In thousand 60-kg bags)  131 non-null    object 
 1   1990                              118 non-null    object 
 2   1991                              120 non-null    object 
 3   1992                              127 non-null    object 
 4   1993                              128 non-null    object 
 5   1994                              128 non-null    object 
 6   1995                              128 non-null    object 
 7   1996                              128 non-null    object 
 8   1997                              128 non-null    object 
 9   1998                              128 non-null    object 
 10  1999                              130 non-null    object 
 11  2000                              130 non-null    object 
 12  2001    

In [101]:
# Remove the "," characters from every text cell so the year columns can be
# converted to floats afterwards.
# Numeric columns are skipped because the .str accessor only works on text
# columns. This dtype check replaces the hard-coded skip of column "2008"
# (presumably the one column pandas had already parsed as a number) and also
# keeps the cell safe to re-run.
for column in reexport_df.columns:
    if pd.api.types.is_numeric_dtype(reexport_df[column]):
        continue
    reexport_df[column] = reexport_df[column].str.replace(",", "", regex=False)

reexport_df.head()

Unnamed: 0,Country (In thousand 60-kg bags),1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Austria,414.5930184,756.1850347,960.0820413,535.4880203,380.9950083,228.6660055,162.6780051,222.1238083,209.6159091,...,463.3459702,365.1472623,303.589908,310.8708473,386.3713576,377.5517359,307.6914777,308.2931411,257.4135488,274.1619791
1,Belgium,,,,,,,,,,...,5053.064814,4893.973633,4753.595285,4256.978879,3952.812466,4459.903535,4550.04536,4377.324264,4377.326748,5013.520188
2,Belgium/Luxembourg,890.8440337,1038.068039,1131.478041,1273.823045,1304.258046,1328.483041,1487.186551,1658.115555,2270.271439,...,,,,,,,,,,
3,Bulgaria,166.4,183.459,7.3530002,70.4880001,109.9040006,122.0100008,25.685,19.2360001,17.9619999,...,133.0140903,121.9904582,183.8838149,190.8798212,244.4727217,275.4994882,326.8140544,291.8707366,311.5245159,361.8300649
4,Croatia,,,33.6450014,30.8840013,29.5880012,31.6510013,17.367,23.9010009,19.321001,...,23.9868757,23.7208153,24.1262407,26.2867961,51.2135581,62.5539035,99.0814808,77.9684384,85.5762211,85.9117545


In [102]:
# Cast every column except the country name to float and keep the result in
# a new frame.
country_column = "Country (In thousand 60-kg bags)"
dic_columns_type = {
    column: "float"
    for column in reexport_df.columns
    if column != country_column
}

reexport_cleaned_df = reexport_df.astype(dic_columns_type)
reexport_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 31 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country (In thousand 60-kg bags)  131 non-null    object 
 1   1990                              118 non-null    float64
 2   1991                              120 non-null    float64
 3   1992                              127 non-null    float64
 4   1993                              128 non-null    float64
 5   1994                              128 non-null    float64
 6   1995                              128 non-null    float64
 7   1996                              128 non-null    float64
 8   1997                              128 non-null    float64
 9   1998                              128 non-null    float64
 10  1999                              130 non-null    float64
 11  2000                              130 non-null    float64
 12  2001    

#### Export

In [103]:
# Load the exports table and preview its first rows.
export_csv = "../Dataset/international_coffee_organization/Trade/Export.csv"
export_df = pd.read_csv(export_csv)
export_df.head()

Unnamed: 0,Country (In thousand 60kg bags),1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Angola,84.35,70.501,80.25,38.878,8.302,40.559,51.831,50.494,53.929,...,4.37,7.575,8.375,5.52,9.375,10.515,10.945,9.055,9.323397,23.357178
1,Bolivia (Plurinational State of),156.442,73.523,96.204,47.319,84.321,93.958,123.445,110.955,97.039,...,78.268006,74.308883,62.67578,54.850533,61.751267,30.280158,22.456342,26.119992,22.459634,20.341955
2,Brazil,16935.7876,21182.7614,18790.7192,17837.748,17273.1476,14468.4322,15250.609,16801.26001,18144.38833,...,33166.64159,33806.00933,28549.42589,31650.56295,37335.17283,37562.84675,34269.15025,30924.56785,35637.35859,40697.86371
3,Burundi,584.773,687.851,645.858,417.609,507.803,528.202,224.076,528.764,373.841,...,307.118958,217.845799,392.006917,194.715883,252.178,230.18855,204.270831,168.876264,201.725236,292.887291
4,Cameroon,2611.259,1752.179,1645.851,704.53,545.889,407.269,563.549,1368.03,745.718,...,793.845667,490.283067,621.8128,271.949217,375.033867,390.142717,281.128967,245.017117,287.41525,249.9024


In [104]:
# Inspect the dtype and non-null count of every column.
export_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 31 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country (In thousand 60kg bags)  55 non-null     object 
 1   1990                             55 non-null     float64
 2   1991                             55 non-null     float64
 3   1992                             55 non-null     float64
 4   1993                             55 non-null     float64
 5   1994                             55 non-null     float64
 6   1995                             55 non-null     float64
 7   1996                             55 non-null     float64
 8   1997                             55 non-null     float64
 9   1998                             55 non-null     float64
 10  1999                             55 non-null     float64
 11  2000                             55 non-null     float64
 12  2001                    

### Export all the dataframes into JSON format

In [None]:
# Write each frame to its own JSON file as a list of row records
# (orient="records": one JSON object per dataframe row).
json_exports = {
    "./output_json/retail_price.json": retail_price_df,
    "./output_json/total_production.json": production_df,
}
for json_path, frame in json_exports.items():
    frame.to_json(path_or_buf=json_path, orient="records")

