In [4]:
import pandas as pd
import pycountry
import numpy as np

# Load CSV file into a pandas DataFrame
df = pd.read_csv('data/gdp(1972-2021).csv')
print(len(df))

# Get unique categories from the first column of the CSV file
categories = df.iloc[:, 0].unique()
#print some of the categories present in the dataset


# Create a dictionary to store dataframes for each category
category_dfs = {}

# Loop through each category and create a dataframe
for category in categories:
    # Filter the rows in the DataFrame based on the category
    category_df = df[df.iloc[:, 0] == category] 
    # Add the category dataframe to the dictionary
    category_dfs[category] = category_df
    


11072


The csv file contains data for all the indicators of the development like (population, gdp, inflation, net FDI etc.) . So we only take the features we need.


In [64]:

#Extract only the fetures that can act as indicator to gdp
popDf = category_dfs['Population, total']
popDf.reset_index(inplace=True,drop=True)

infDf = category_dfs['Inflation, consumer prices (annual %)']
infDf.reset_index(inplace=True,drop=True)
gdpDf = category_dfs['GDP per capita (current US$)']
gdpDf.reset_index(inplace=True,drop=True)
remitDf = category_dfs['Personal remittances, paid (current US$)']
remitDf.reset_index(inplace=True,drop=True)
exportsDf = category_dfs['Exports of goods and services (% of GDP)']
exportsDf.reset_index(inplace=True,drop=True)
importsDf = category_dfs['Imports of goods and services (% of GDP)']
importsDf.reset_index(inplace=True,drop=True)
fdiDf = category_dfs['Foreign direct investment, net inflows (BoP, current US$)']
fdiDf.reset_index(inplace=True,drop=True)
energyDf = category_dfs['Energy use (kg of oil equivalent per capita)']
energyDf.reset_index(inplace=True,drop=True)
incLow20Df = category_dfs['Income share held by lowest 20%']
incLow20Df.reset_index(inplace=True,drop=True)
gniDf = category_dfs['GNI per capita, Atlas method (current US$)']
gniDf.reset_index(inplace=True,drop=True)

#militaryExpenseDf = category_dfs['Military expenditure (% of GDP)']

#marketCapDf = category_dfs['Market capitalization of listed domestic companies (% of GDP)']
#mobileSubDf = category_dfs['Mobile cellular subscriptions (per 100 people)']


industryValDf = category_dfs['Industry (including construction), value added (% of GDP)']
industryValDf.reset_index(inplace=True,drop=True)
agriValDf = category_dfs['Agriculture, forestry, and fishing, value added (% of GDP)']
agriValDf.reset_index(inplace=True,drop=True)
elecConsDf = category_dfs["Electric power consumption (kWh per capita)"]
elecConsDf.reset_index(inplace=True,drop=True)





In [65]:

#Merge all the features to the dataframe of a a particular year
def merge_features(year):
    mergedDf = pd.DataFrame()
    mergedDf = gdpDf.iloc[:,3:4]
    mergedDf.reset_index(inplace=True,drop=True)
    mergedDf.loc[:,'population'] = popDf[year]
    mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
    mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
    mergedDf.loc[:,'imports'] = importsDf.loc[:,year]
    mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
    mergedDf.loc[:,'income'] = incLow20Df.loc[:,year]
    mergedDf.loc[:,'gni'] = gniDf.loc[:,year]
    mergedDf.loc[:,'fdi'] = fdiDf.loc[:,year]
    mergedDf.loc[:,'industry'] = industryValDf.loc[:,year]
    mergedDf.loc[:,'agri'] = agriValDf.loc[:,year]
    mergedDf.loc[:,'gdp'] = gdpDf.loc[:,year]
    mergedDf.loc[:,'remit'] = remitDf.loc[:,year]
    mergedDf.loc[:,'energy'] = energyDf.loc[:,year]
    mergedDf.loc[:,'electric'] = elecConsDf.loc[:,year]
    return mergedDf;



In [25]:
mergedDF = merge_features(2012)
print(mergedDf[mergedDf['fdi']==0])

KeyError: 2012

In [57]:
print(mergedDf.columns)


Index(['Country Code', 'population', 'inflation', 'exports', 'imports',
       'income', 'gni', 'fdi', 'industry', 'agri', 'gdp', 'iso_code',
       'netExport'],
      dtype='object')


Now that we have brought all the desired indicators of the gdp in one datatset, we can check for the missing values in the dataframe and deal with them.

In [17]:
#Create  a new column in dataframe with iso code of country -> because BACI data deals using this code, for uniformity



In [66]:



NUM_PRODS = 20
k=10
BACI_FORMAT = 'data/BACI/BACI_HS92_Y{}_V202301.csv'
OUTPUT_PREFIX = '/Users/macbook/Documents/7091CEM/gitVersion/GNN/'
INPUT_NODE_FEATURES = OUTPUT_PREFIX + 'data/x_country_{}.csv'
OUTPUT_EDGE_INDEX = OUTPUT_PREFIX + 'data/connection_{}.csv'
OUTPUT_EDGE_TARGET = OUTPUT_PREFIX + 'data/y_country_{}.csv'
import numpy as np

def create_edge_features(year):
    """
    Extract edge features from BACI dataset. There are 10 features for each edge.
    The i-th feature corresponds to the amount, in metric tons, of the i-th most
    traded product in the world.
    """

    # Read in BACI if not done so already
    baci = pd.read_csv(BACI_FORMAT.format(year))
    baci = baci.fillna(0)

    # Get top NUM_PRODS = 10 products traded in the world (in metric tons)
    top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']
    k_to_idx = {id: i for (i, id) in enumerate(top_products)}

    # Get edge features associated with each pair of countries
    feature_dict = {}
    baci = baci[baci['k'].isin(top_products)].filter(['i', 'j', 'k', 'q'])

    d = {}
    def update_dict(i, j, k, q):
        """ Assigns edge features to each pair of countries """

        i = int(i)
        j = int(j)
        if (i, j) not in d:
            d[(i,j)] = [0] * NUM_PRODS
        r = d[(i,j)] # get vec for a specific edge (i,j)
        r[k_to_idx[k]] = q # update vec for edge (i,j) and product k
        d[(i,j)] = r # rewrite the vec

    baci.apply(lambda r: update_dict(int(r['i']), int(r['j']), int(r['k']), r['q']), axis=1)
    edge_features = np.vstack([d[(r['i'], r['j'])] for _, r in baci.iterrows()]) # create matrix of edge feature vecs we will write to the files.
    feature_names = ['f'+ str(i) for i in range(NUM_PRODS)] # name our edge features 'f0,...f9'
    baci[feature_names] = edge_features # write in the data to the baci dataframe
    return baci




def prepareData(mergedDf,year):
    
    
    # Read in the BACI trade-flow dataset
    baci = pd.read_csv(BACI_FORMAT.format(year))
    baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
    baci = baci.sort_values(['v'], ascending=False).groupby(['i']).head(k).reset_index().filter(['i','j']) # keep only top k exporters to each country
    
    #print(baci)

    # Extract edge features from BACI dataset
    edge_baci = create_edge_features(year) # add edge features (ie. the amount, in metric tons, of the top 10 products traded internationally)
    baci = pd.merge(baci, edge_baci,how='left') # merge these features with the edges we've already filtered by export value above.
    baci.replace(r'^\s+|\s+$','', inplace=True,regex=True)
    baci.replace('NA', np.nan, inplace=True)
    baci = baci.fillna(0)
    print(len(mergedDf))
    mergedDf = mergedDf[mergedDf['iso_code'].isin(baci['i'])] # keep rows corresponding to countries that are in the BACI dataset
    print(len(mergedDf))
    mergedDf = mergedDf[mergedDf['gdp'].astype(float) > 0] # remove rows with zero GDP data
    print(len(mergedDf))
    baci = baci[baci['i'].isin(mergedDf['iso_code']) & baci['j'].isin(mergedDf['iso_code'])] # exclude edges connected to countries that do not have GDP data
    assert(mergedDf.shape[0] == baci['i'].nunique()) # GDP and exporters should be of same cardinality
    
    
    yValue = mergedDf[['iso_code','gdp']]
    yValue.to_csv(OUTPUT_EDGE_TARGET.format(year),index=False)
    features_to_drop = ['gdp','Country Code']
    mergedDf = mergedDf.drop(features_to_drop,axis=1)
    mergedDf.to_csv(INPUT_NODE_FEATURES.format(year), index=False)
    baci.to_csv(OUTPUT_EDGE_INDEX.format(year), index=False)
    




In [67]:
def convert_row(row):
    country = pycountry.countries.get(alpha_3=row['Country Code'])
    if country is None:
        return -1
    else:
        if country.alpha_3 =='USA':
            return 842    # for some reason the USA sometimes uses 840 and other times 842
    return int(country.numeric)
for i in range(1995,2022):
    mergedDf = merge_features(str(i)+' ' +'[YR'+str(i)+']')
    
    mergedDf['iso_code'] = mergedDf.apply(convert_row, axis=1)
    mergedDf.head()

    #Replace .. with NaN values for ease
    mergedDf.replace('..', np.nan, inplace=True)
    #Fill missing values using median approach
    mergedDf.fillna(mergedDf.median(),inplace=True)
    #Replace 0 with 1 otherwise Nan value will be produced when computed
    mergedDf['fdi'] = mergedDf['fdi'].replace('0','1')
    #print(mergedDf[mergedDf['fdi']=='0'])
    #feature engineering
    mergedDf['netExport'] = mergedDf['exports'].astype(float) - mergedDf['imports'].astype(float)
    
    prepareData(mergedDf,i)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
187
187


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
187
187


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
187
187


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
187
187


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
189
189


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
198
198


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
198
198


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
198
198


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
198
198


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
198
198


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
198
198


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
200
200


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
200
200


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
200
200


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
200
200


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
201
201


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
202
202


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'population'] = popDf[year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'inflation'] = infDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mergedDf.loc[:,'exports'] = exportsDf.loc[:,year]
A value is trying to be set on a copy of a slice from a DataFram

  baci = baci.groupby(['i','j']).sum() # sum together all trade-flows between each pair of countries
  top_products = baci.groupby(['k']).sum().sort_values(['v'],ascending=False).head(NUM_PRODS).reset_index()['k']


217
204
204


In [None]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-geometric

In [104]:

def create_data(year):
    '''
    For given year, pull in node features, edge features, and edge index and
    save in a PyG Data object.
    '''
    
    assert(year in range(1995, 2021 + 1))
    edges = pd.read_csv(f'data/input_files/connection_{year}.csv')
    
    # generate map from iso_code to ids of form [0, ..., num_unique_iso_codes - 1]
    iso_codes = set(edges['i'])
    iso_codes = iso_codes.union(set(edges['j']))
    iso_code_to_id = {code : i for (i, code) in enumerate(iso_codes)}
    
    # load in edge index
    edges['i_id'] = edges['i'].map(iso_code_to_id)
    edges['j_id'] = edges['j'].map(iso_code_to_id)
    edge_index = torch.from_numpy(edges[['i_id', 'j_id']].to_numpy(np.long)).t()
    edge_attr = torch.from_numpy(edges[EDGE_FEATURES].to_numpy(np.float32)) #extract the features from the dataset.
    edge_attr = (edge_attr - edge_attr.mean(axis=0)) / (edge_attr.std(axis=0))
    print(edge_attr)
    
    x_df = pd.read_csv(f'data/input_files/x_country_{year}.csv')
    x_df['id'] = x_df['iso_code'].map(iso_code_to_id)
    x = torch.from_numpy(x_df.sort_values('id').loc[:,:].to_numpy(np.float32))
    x = (x - x.mean(axis=0)) / (x.std(axis=0))  # scale and center data
    print(x)
    
    # load in target values
    y_df = pd.read_csv(f'data/input_files/y_country_{year}.csv')
    y_df['id'] = y_df['iso_code'].map(iso_code_to_id)
    y = torch.from_numpy(y_df.sort_values('id')[f'{year+1}'].to_numpy(np.float32)).unsqueeze(1)# get labels as tensor
    y = y.log() # log scale since spread of GDP is large

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

def evaluate_model(model, data_iter):
    '''
    Accumulate MSE over a data list or loader.
    '''
    return sum([F.mse_loss(model(data), data.y).item() for data in data_iter])

def get_data():
    '''
    Generate data_lists for train, val, and test. These lists can be either loaded into data_loaders
    or indexed directly.
    '''

    data_list = [create_data(year) for year in range(FIRST_YEAR, LAST_YEAR)]
    random.shuffle(data_list)
    data_train = data_list[:NUM_TRAIN]
    data_val = data_list[NUM_TRAIN:NUM_TRAIN+NUM_VAL+1]
    data_test = data_list[NUM_TRAIN+NUM_VAL:]
    return (data_train, data_val, data_test)

In [105]:
d = create_data(2020)

NameError: name 'torch' is not defined

In [None]:
    # generate map from iso_code to ids of form [0, ..., num_unique_iso_codes - 1]
    iso_codes = set(edges['i'])
    iso_codes = iso_codes.union(set(edges['j']))
    iso_code_to_id = {code : i for (i, code) in enumerate(iso_codes)}

    # load in edge index
    edges['i_id'] = edges['i'].map(iso_code_to_id)
    edges['j_id'] = edges['j'].map(iso_code_to_id)
    edge_index = torch.from_numpy(edges[['i_id', 'j_id']].to_numpy(np.long)).t()
    edge_attr = torch.from_numpy(edges[EDGE_FEATURES].to_numpy(np.float32)) #extract the features from the dataset.
    edge_attr = (edge_attr - edge_attr.mean(axis=0)) / (edge_attr.std(axis=0))
    
    # load in target values
    y_df = pd.read_csv(f'{DOWNLOAD_PREFIX}/output/Y_{year}.csv')
    y_df['id'] = y_df['iso_code'].map(iso_code_to_id)
    y = torch.from_numpy(y_df.sort_values('id')[f'{year+1}'].to_numpy(np.float32)).unsqueeze(1)# get labels as tensor
    y = y.log() # log scale since spread of GDP is large
    
    # load in input features
    x_df = pd.read_csv(f'{DOWNLOAD_PREFIX}/output/X_NODE_{year}.csv')
    x_df['id'] = x_df['iso_code'].map(iso_code_to_id)
    features = ['pop', 'cpi', 'emp']
    x = torch.from_numpy(x_df.sort_values('id').loc[:,:].to_numpy(np.float32))
    x = (x - x.mean(axis=0)) / (x.std(axis=0))  # scale and center data
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)



In [106]:
data = {'name': ['John', 'Bob', 'Alice'], 'age': [25, 30, 35]}
df = pd.DataFrame(data)
prefix = 'https://raw.githubusercontent.com/wagles3/GNN/main/'
df.to_csv(prefix+'output.csv', index=False)

HTTPError: HTTP Error 404: Not Found