# Import to AWS DB

1) Run `CalculateEverything` in the `InternationalityIndex.InternationalityCalculations.py`

2) Copy the output xlsx file into the same folder as this notebook.

3) Edit the first two lines of the following cell (`topData` and `bottomData`) and run it.

4) Copy all files from the AWS_Import directory using WinSCP

    a. Connect to AWS EC2 IDEA (ubuntu@ec2-18-188-88-0.us-east-2.compute.amazonaws.com)
    
    b. Copy the csv files from the `AWS_Import` directory to `/home/ubuntu/db-admin/csv`
    
5) Using Putty, run the import to AWS

    a. Connect to AWS EC2 IDEA (ubuntu@ec2-18-188-88-0.us-east-2.compute.amazonaws.com)
    
    b. Go to `db-admin` directory
    
    c. run: `psql --host=science-internationality-dbinstance.c3aa5fkeiz2h.us-east-2.rds.amazonaws.com --port=5432 --username=root --password --dbname=scienceInternationalitydb -f drop_generate_schema.sql`
    
    d. run: `psql --host=science-internationality-dbinstance.c3aa5fkeiz2h.us-east-2.rds.amazonaws.com --port=5432 --username=root --password --dbname=scienceInternationalitydb -f psql-import-csvs.txt`
    
    
In case of problems, check:
    a. Variable names - they must match all the way from the original Excel file in `additionalData`, through the table schema in `drop_generate_schema.sql`, to the variable names in `psql-import-csvs.txt`.
    
    b. Data validity in the CSVs.
    
    c. The Prisma query in `fetcher.js` must also contain valid variable names! If they change, Prisma has to be rerun as follows:
        1. docker-compose down
        2. change the datamodel.yml
        3. docker-compose up -d prisma
        4. prisma deploy

In [1]:
# --- Configuration: the first two values are the ones to edit for a new data drop ---
topData = '20181218_AllFieldsCountriesMethods_TOP.xlsx' ## OUTPUT OF CalculateEverything() in InternationalityIndex.InternationalityCalculations.py
bottomData = '20181218_AllFieldsCountriesMethods_bot_all.csv'
additionalData = 'populateAmazon.xlsx'

import pandas as pd

# load data
# The first four columns are read as the index and immediately turned back
# into ordinary columns.
tops = pd.read_excel(topData,index_col=[0,1,2,3]).reset_index()
bottoms = pd.read_csv(bottomData,index_col=[0,1,2,3]).reset_index()

# Drop the 'All' rows of the bottom-level file before stacking both inputs
# (presumably because the top-level file already carries them - TODO confirm).
bottoms = bottoms[bottoms.Field != 'All']

df = pd.concat([tops,bottoms],ignore_index=True)

countries = pd.read_excel(additionalData,sheet_name='country')
# read_excel() has no `index` parameter: older pandas silently ignored the
# stray `index=False`, current pandas raises TypeError, so it is not passed.
index = pd.read_excel(additionalData,sheet_name='index')
methods = pd.read_excel(additionalData,sheet_name='method')

# Swap the full country name for its country_code and keep only the five
# columns that make up the index table.
merged = pd.merge(df,countries,how='left',left_on='Country',right_on='full_name').loc[:,['country_code','Field','Method','Period','Internationality']]
# Rename those columns to the headers of the 'index' sheet; the code below
# relies on that sheet providing `period`, `value`, `method_code`, ...
merged.columns = index.columns

# Keep periods after 2004 only.
merged = merged.loc[merged.period > 2004]

  mask |= (ar1 == a)


# Minimax

In [2]:
# Sign per method_code: -1 where the method sheet says 'min', +1 where it says 'max'.
minimaxs = methods.set_index('method_code')['minmax'].map({'min':-1,'max':1})

In [3]:
# Multiply every value by the sign of its method (see `minimaxs`), so that
# methods flagged 'min' are negated and methods flagged 'max' stay as they are.
merged = merged.assign(value=merged['value'] * merged['method_code'].map(minimaxs))


## filter displayed globalizations

In [4]:
import pandas as pd
import sqlite3
import numpy as np

# A country/field/year combination is kept only when its journal count
# reaches this threshold.
jrnThreshold = 30
# Every distinct field code present in the merged index data.
fields = merged['field_code'].unique()

def getDocsJournalsForField(field,conn):
    """Return document and journal counts per country and year for one field.

    Parameters
    ----------
    field : str
        ``'All'`` to aggregate over every row, otherwise the name of a column
        of the ``issns`` table; only rows where that column equals 1 are used.
    conn : DB connection accepted by ``pandas.read_sql_query``
        Database holding the ``ArticleCountries``, ``countries``, ``periods``
        and ``issns`` tables.

    Returns
    -------
    pandas.DataFrame
        Columns ``Country``, ``Year``, ``Documents`` (sum of ``Articles``),
        ``Journals`` (number of non-null ``Articles`` rows in the group -
        presumably one row per journal, TODO confirm) and ``field``.

    Raises
    ------
    ValueError
        If ``field`` is not a plain identifier. Column names cannot be bound
        as SQL parameters, so the name is validated before it is placed in
        the query text.
    """
    if field == 'All':
        field_filter = ''
    else:
        if not (isinstance(field, str) and field.isidentifier()):
            raise ValueError('Invalid field name for SQL query: {!r}'.format(field))
        field_filter = 'WHERE i.{} = 1'.format(field)

    # One query template for both cases; only the optional WHERE clause differs.
    query = '''
        SELECT
       c.name as Country,
       p.name as Year,
       Sum(A.Articles) AS Documents,
       Count(A.Articles) as Journals

        FROM ArticleCountries as A
        INNER JOIN countries c on A.FacetID = c.ID
        INNER JOIN periods p on A.PeriodID = p.ID
        INNER JOIN issns i on A.ISSNID = i.ID
        {}
        GROUP BY Country,Year

        '''.format(field_filter)

    df = pd.read_sql_query(query,conn)
    # Tag every row with the field it was computed for.
    df['field'] = field
    return df

conn = sqlite3.connect('D:/Dropbox/Python/AllScopusJournals/180802_1611_AllJournals_ArReCp_2001_2017.sqlite')
#conn = sqlite3.connect('C:/Users/vitekzkytek/Dropbox/Python/AllScopusJournals/180802_1611_AllJournals_ArReCp_2001_2017.sqlite')
dfs =[]

# Query the counts for every field; close the connection even if one of the
# queries fails, so the database file is not left open.
try:
    for field in fields:
        dfs.append(getDocsJournalsForField(field,conn))
finally:
    conn.close()

# Attach country codes to the counts (inner join on the full country name).
filters = pd.concat(dfs).merge(countries.loc[:,['country_code','full_name']],left_on='Country',right_on='full_name').drop('full_name',axis=1)
# A combination is displayed only when it has at least jrnThreshold journals.
filters['include'] = filters['Journals'] >= jrnThreshold
filters.Year = pd.to_numeric(filters.Year)

merged = merged.merge(filters,left_on=['country_code','field_code','period'],right_on=['country_code','field','Year'],how='left')
# After the left join `include` is missing for rows without counts; the
# explicit comparison with True drops those rows as well.
merged = merged[merged['include'] == True]
# Remove the helper columns again so `merged` keeps its original layout.
merged = merged.drop(['Documents','Journals','field','Country','Year','include'],axis=1)

## Calculate group averages

In [5]:
# Frames that are concatenated into the final `merged` at the end of this cell:
# the per-country rows come first, the group averages are appended below.
avgs = [merged]
def calcGroupAverage(mergedDF,countriesDF,dimension,new_country_codes):
    """Average `value` over the country groups defined by one countries column.

    mergedDF is left-joined with countriesDF on `country_code`, then `value`
    is averaged per (field_code, method_code, period, <dimension>). Each group
    label is translated through `new_country_codes` into the country_code of
    the aggregate; groups whose label is not in that mapping are dropped.

    Returns a DataFrame with the same columns, in the same order, as mergedDF.
    """
    joined = mergedDF.merge(countriesDF,on='country_code',how='left')
    group_keys = ['field_code','method_code','period',dimension]
    means = joined.groupby(group_keys)['value'].mean().reset_index()

    # Group label -> aggregate code; labels without a code become NaN here
    # and are removed by the dropna() below.
    means['country_code'] = means[dimension].map(new_country_codes,na_action='ignore')

    result = means.drop(columns=dimension)
    return result[mergedDF.columns].dropna()

# Country tables that are concatenated into the final `countries` at the end of
# this cell: the original sheet plus one small frame per set of aggregates.
cntrs = [countries]
def appendToCountries(countries,d):
    """Build country-table rows for the aggregates described by `d`.

    `d` maps a group label to its aggregate country_code. Every entry becomes
    one row whose `full_name` and `name` are the label and whose `Type` is
    'aggregate'. The `countries` argument is not used by the function.
    """
    records = []
    for label, code in d.items():
        records.append({'country_code':code,'full_name':label,'name':label,'Type':'aggregate'})
    return pd.DataFrame(records)

# Every entry is (column of `countries` that defines the groups,
#                 {group label in that column: country_code of the aggregate}).
groupings = [
    #regions
    ('region', {
        'Europe':'_Europe',
        'North America':'_NAmer',
        'South America':'_SAmer',
        'Central Asia':'_CAsia',
        'Middle East':'_MEast',
        'East Asia':'_EAsia',
        'South Asia':'_SAsia',
        'Pacific':'_Pac',
        'North Africa':'_NAfr',
        'Sub-Saharan Africa':'_SSAfr'
    }),
    # Income Level
    ('incomelevel', {'Upper middle income':'_UMI','High income':'_HI','Lower middle income':'_LMI','Low income':'_LI'}),
    #EU
    ('eu_sub', {'EU-15':'_EU15','EU-13':'_EU13'}),
    # whole EU
    ('eu', {'EU-28':'_EU'}),
    #OECD
    ('oecd', {'OECD':'_OECD'}),
    #IMF 2003
    ('imf2003', {
        'Advanced countries':'_ADV',
        'Transition countries':'_TRA',
        'Developing countries':'_DEV'
    }),
]

# For each grouping: average the per-country values and register the new
# aggregate codes in the country table. `merged` still holds countries only
# here, because the averages are collected in `avgs` and concatenated last.
for dimension, d in groupings:
    avgs.append(calcGroupAverage(merged,countries, dimension,d))
    cntrs.append(appendToCountries(countries,d))

# World
# Plain average over all countries. numeric_only=True keeps the string column
# country_code out of the mean: older pandas dropped it silently, current
# pandas raises TypeError without it.
wld = merged.groupby(['field_code','method_code','period']).mean(numeric_only=True).reset_index()
wld['country_code'] = '_AV'
wld = wld[merged.columns]
avgs.append(wld)


cntrs.append(appendToCountries(countries,{'World':'_AV'}))

merged = pd.concat(avgs,ignore_index=True)
countries = pd.concat(cntrs,ignore_index=True)

# Normalize between 0 and 1

In [6]:
# Min-max scale `value` to [0, 1] separately for every method.
for method in merged.method_code.unique():
    rows = merged.method_code == method
    vals = merged.loc[rows,'value']
    # Write straight into `merged` through .loc instead of modifying an
    # intermediate slice, which triggered SettingWithCopyWarning.
    # If all values of a method are equal the range is 0 and the result is NaN.
    merged.loc[rows,'value'] = (vals - vals.min())/(vals.max() - vals.min())
merged

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,country_code,field_code,method_code,period,value
0,DZA,top_Life,euclid,2005,0.761474
1,ARG,top_Life,euclid,2005,0.772071
2,ARM,top_Life,euclid,2005,0.517155
3,AUS,top_Life,euclid,2005,0.760902
4,AUT,top_Life,euclid,2005,0.804374
5,BGD,top_Life,euclid,2005,0.723061
6,BLR,top_Life,euclid,2005,0.614812
7,BEL,top_Life,euclid,2005,0.824800
8,BEN,top_Life,euclid,2005,0.778632
9,BOL,top_Life,euclid,2005,0.786342


### Write CSVs

In [7]:
# Country table, including the aggregate rows added above.
countries.to_csv('AWS_Import/country.csv',index=False)

# The 'method' and 'field' sheets are exported as they are. to_csv() returns
# None when it is given a path, so its result is deliberately not assigned.
pd.read_excel(additionalData,sheet_name='method').to_csv('AWS_Import/method.csv',index=False)
pd.read_excel(additionalData,sheet_name='field').to_csv('AWS_Import/field.csv',index=False)
# The index values themselves: countries plus group averages.
merged.to_csv('AWS_Import/index.csv',index=False)

# Generate JSONs with controlling data

1. Run both cells in the notebook

2. Copy the file `controls_data.js` from this notebook's directory into the `main/public/javascripts/` directory

In [8]:
import pandas as pd

# All sheets come from the workbook configured in `additionalData`, so the
# file name only has to be changed in one place.
methods = pd.read_excel(additionalData,sheet_name='method')

df_methods = pd.read_excel(additionalData,sheet_name='method',index_col='method_code')
# Only these six methods are kept for the controls.
df_methods = df_methods.loc[['euclid','weightGini','localShare','shareEnglish','top3','GiniSimpson'],:]
df_fields = pd.read_excel(additionalData,sheet_name='field',index_col='field_code')
df_countries = pd.read_excel(additionalData,sheet_name='country',index_col='country_code')
# Take the rows from the extended `countries` table (it includes the
# aggregates) but keep only the columns of the original 'country' sheet.
df_countries = countries.set_index('country_code')[df_countries.columns]
# Keep only countries/aggregates that actually have data in `merged`.
df_countries = df_countries.loc[df_countries.index.isin(merged.country_code.unique())]


In [9]:
# {'id': country_code, 'text': name} records for the aggregates ...
d_aggr = (
    df_countries.loc[df_countries.Type == 'aggregate','name']
    .reset_index()
    .rename(columns={'country_code':'id','name':'text'})
    .to_dict(orient='records')
)
# ... and for the individual countries, ordered by display name.
d_cntrs = (
    df_countries.loc[df_countries.Type == 'country','name']
    .reset_index()
    .rename(columns={'country_code':'id','name':'text'})
    .sort_values('text')
    .to_dict(orient='records')
)
# Two option groups: aggregates first, then countries.
d_countries = {
    'results':[
        {'text':'Country Groups','children':d_aggr},
        {'text':'Countries','children':d_cntrs},
    ]
}


In [10]:
# Aggregate codes belonging to each option group of the country selector.
regions = ['_NAmer', '_EAsia', '_Europe', '_SAsia', '_Pac', '_SAmer', '_CAsia', '_MEast', '_SSAfr', '_NAfr']
incomes = ['_HI','_UMI', '_LMI', '_LI']
status = ['_ADV', '_TRA', '_DEV']
others = [ '_EU15', '_EU13', '_EU', '_OECD']

# Individual countries as {'id': country_code, 'text': name}, sorted by name.
d_cntrs = df_countries.loc[df_countries.Type == 'country','name'].reset_index().rename(columns={'country_code':'id','name':'text'}).sort_values('text').to_dict(orient='records')

# All aggregates in the same id/text shape; the groups below are subsets of it.
aggr = df_countries.loc[df_countries.Type == 'aggregate','name'].reset_index().rename(columns={'country_code':'id','name':'text'})
d_regions = aggr.loc[aggr.id.isin(regions)].to_dict(orient='records')
d_status = aggr.loc[aggr.id.isin(status)].to_dict(orient='records')
d_incomes = aggr.loc[aggr.id.isin(incomes)].to_dict(orient='records')

# Income groups are listed in the order of `incomes`. df_countries only keeps
# codes that have data, so a group may be missing; such a group is skipped
# instead of failing with IndexError on an empty lookup.
income_text = {}
for rec in d_incomes:
    income_text.setdefault(rec['id'], rec['text'])
d_incomes_sorted = [{'id':el,'text':income_text[el]} for el in incomes if el in income_text]
d_others = aggr.loc[aggr.id.isin(others)].to_dict(orient='records')

d_countries = {'results':
               [
                   {'children':[{'id':'_AV','text':'World'}]},
                   {'text':'Development Status','children':d_status},
                   {'text':'Income','children':d_incomes_sorted},
                   {'text':'Regions','children':d_regions},
                   {'text':'Other','children':d_others},
                   {'text':'Countries','children':d_cntrs}
               ]
              }


In [13]:
# Every field as an id/text pair: id is the field_code, text its `leg_name`.
disc_sel2 = df_fields['leg_name'].reset_index().rename(columns={'field_code':'id','leg_name':'text'})

# Field codes of the broad clusters ...
ltops = ['top_Life', 'top_Physical', 'top_Health', 'top_Social']
# ... and of the major subject areas.
lbottoms = [
    'bot_General',
    'bot_AgriculturalAndBiological',
    'bot_ArtsHumanities',
    'bot_BiochemistryGeneticsMolecularBiology',
    'bot_BusinessManagementAccounting',
    'bot_ChemicalEngineering',
    'bot_Chemistry',
    'bot_ComputerScience',
    'bot_DecisionSciences',
    'bot_EarthPlanetarySciences',
    'bot_EconomicsEconometricsFinance',
    'bot_Energy',
    'bot_Engineering',
    'bot_EnvironmentalScience',
    'bot_ImmunologyMicrobiology',
    'bot_Materials',
    'bot_Mathematics',
    'bot_Medicine',
    'bot_Neuroscience',
    'bot_Nursing',
    'bot_PharmacologyToxicologyPharmaceutics',
    'bot_PhysicsAstronomy',
    'bot_Psychology',
    'bot_SocialSciences',
    'bot_Veterinary',
    'bot_Dentistry',
    'bot_HealthProfessions',
]

d_tops = disc_sel2[disc_sel2['id'].isin(ltops)].to_dict(orient='records')
d_bottoms = disc_sel2[disc_sel2['id'].isin(lbottoms)].to_dict(orient='records')

# Three option groups: the 'All' entry, the clusters in table order, and the
# subject areas sorted alphabetically by their display text.
d_fields = {
    'results':[
        {'children':[{'id':'All','text':'All disciplines'}]},
        {'text':'Broad Subject Clusters','children':d_tops},
        {'text':'Major Subject Areas','children':sorted(d_bottoms, key=lambda k: k['text'])},
    ]
}


In [None]:
import json

# Put the methods into their display order: attach a temporary rank per
# method_code, sort by it and remove the helper column again.
df_methods = (
    df_methods
    .assign(rank=pd.Series({'euclid':0,'weightGini':2,'localShare':4,'shareEnglish':5,'top3':3,'GiniSimpson':1}))
    .sort_values('rank')
    .drop(columns='rank')
)

# Methods as id/text records, wrapped together with the pagination flag.
d_methods = {
    'results':df_methods.reset_index().rename(columns={'method_code':'id','name':'text'}).to_dict(orient='records'),
    'pagination':{'more':True},
}

d = {'methods': d_methods,'fields':d_fields,'countries':d_countries}

# Emit a JavaScript file that defines `controllers` as the JSON document.
s = 'var controllers = ' + json.dumps(d)

with open("controls_data.js", "w") as f:
    f.write(s)

In [14]:
d_fields

{'results': [{'children': [{'id': 'All', 'text': 'All disciplines'}]},
  {'children': [{'id': 'top_Life', 'text': 'Life Sciences'},
    {'id': 'top_Physical', 'text': 'Physical Sciences'},
    {'id': 'top_Health', 'text': 'Health Sciences'},
    {'id': 'top_Social', 'text': 'Social Sciences'}],
   'text': 'Broad Subject Clusters'},
  {'children': [{'id': 'bot_AgriculturalAndBiological',
     'text': 'Agricultural and Biological Sciences'},
    {'id': 'bot_ArtsHumanities', 'text': 'Arts and Humanities'},
    {'id': 'bot_BiochemistryGeneticsMolecularBiology',
     'text': 'Biochemistry, Genetics and Molecular Biology'},
    {'id': 'bot_BusinessManagementAccounting',
     'text': 'Business, Management and Accounting'},
    {'id': 'bot_ChemicalEngineering', 'text': 'Chemical Engineering'},
    {'id': 'bot_Chemistry', 'text': 'Chemistry'},
    {'id': 'bot_ComputerScience', 'text': 'Computer Science'},
    {'id': 'bot_DecisionSciences', 'text': 'Decision Sciences'},
    {'id': 'bot_Dentistry