In [None]:
from InternationalityCalculations import CalculateEverything
from InternationalityCalculations import DB_joinJournals

def _run_top(csv_name, exclusion):
    """Call CalculateEverything for the 'TOP' set with one exclusion setting."""
    return CalculateEverything(csv_name, 'TOP', excludeMultiDiscipline=exclusion)


# Three runs that differ only in how multi-discipline journals are excluded:
# not at all (False), by 'broad' discipline, or by 'narrow' discipline.
# NOTE(review): the cell below reads these file names from
# 'DisciplineRobustness/' -- confirm that is where the results end up.
_run_top('globalization_TOP_NOLIMIT.csv', False)
_run_top('globalization_TOP_BROADLIMIT.csv', 'broad')
_run_top('globalization_TOP_NARROWLIMIT.csv', 'narrow')

In [None]:
import pandas as pd

# Column label -> CSV holding that variant of the calculation.
_robustness_files = {
    'nolimit': 'DisciplineRobustness/globalization_TOP_NOLIMIT.csv',
    'broadlimit': 'DisciplineRobustness/globalization_TOP_BROADLIMIT.csv',
    'narrowlimit': 'DisciplineRobustness/globalization_TOP_NARROWLIMIT.csv',
}

# One 'Internationality' series per variant, aligned side by side on the
# shared (Period, Method, Field, Country) index.
df = pd.DataFrame({
    label: pd.read_csv(
        csv_path,
        index_col=['Period', 'Method', 'Field', 'Country'],
    ).Internationality
    for label, csv_path in _robustness_files.items()
})


In [None]:

# Pairwise correlation between the nolimit / broadlimit / narrowlimit columns
# (pandas' default correlation method).
df.corr()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 200 evenly spaced bin edges covering deviations from -1 to 1.
bins = np.linspace(-1, 1, 200)

# Overlay, for each restricted variant, the distribution of
# (unrestricted value - restricted value).
for _variant, _legend in (
    ('broadlimit', 'Broad disciplines excl.'),
    ('narrowlimit', 'Narrow disciplines excl.'),
):
    _deviation = df.nolimit - df[_variant]
    plt.hist(_deviation, bins, alpha=0.5, label=_legend)

plt.legend(loc='upper right')
plt.title('Deviation of globalization caused by \n removing journals belonging to multiple disciplines')

plt.show()

In [None]:
# Rank the rows within every (Field, Method, Period) group, separately for
# each variant column.
_rank_groups = ['Field', 'Method', 'Period']
ranks = df.groupby(_rank_groups).rank()

# Histogram of the rank shift between the unrestricted and the
# broad-discipline-restricted variant.
_rank_shift = ranks['nolimit'] - ranks['broadlimit']
_rank_shift.hist(bins=100)

In [None]:
# Deviation of each restricted variant from the unrestricted values.
devs = pd.DataFrame({
    'broad': df['nolimit'] - df['broadlimit'],
    'narrow': df['nolimit'] - df['narrowlimit'],
})

# Fraction of all rows whose 'broad' deviation lies in [-0.15, 0.15];
# rows with a missing deviation count as outside the band.
_within_band = (devs['broad'] <= 0.15) & (devs['broad'] >= -0.15)
len(devs[_within_band]) / len(devs)

In [None]:
from InternationalityData import DB_joinJournals
import pandas as pd
conn = DB_joinJournals()

# Total number of articles per country for one bundle, restricted to
# journals with exactly one broad field.
#
# Fix: the `AND i.broadFieldsNum = 1` predicate used to come *after*
# `group by c.name`, which made it part of the GROUP BY expression instead of
# a row filter. It now goes into the WHERE clause through the second
# placeholder, which previously received only a newline.
_bundle_id = 1
_extra_filter = 'AND i.broadFieldsNum = 1'

pd.read_sql_query('''
        SELECT c.name as Country, sum(Articles) as Documents
        FROM ArticleCountries
        inner join countries c on ArticleCountries.FacetID = c.ID
        inner join v_issns i on ArticleCountries.ISSNID = i.ID
        where
            BundleID = {}
            {}
        group by c.name
        '''.format(_bundle_id, _extra_filter), conn, index_col='Country')


In [None]:
from plotting import plotJournalDistsCountries
def _plot_euclid_2017(field_name):
    """Run plotJournalDistsCountries for 2017 / 'euclid' with the shared flags."""
    return plotJournalDistsCountries(field_name, 2017, 'euclid', True, False)


# Same settings for each of the four top-level fields; the returned tables
# are combined in the next cell.
social = _plot_euclid_2017('top_Social')
health = _plot_euclid_2017('top_Health')
life = _plot_euclid_2017('top_Life')
physical = _plot_euclid_2017('top_Physical')

In [None]:
import pandas as pd
# Shorthand for building MultiIndex slices in the cells below.
idx = pd.IndexSlice

# Stack each field's table into a long series and place the four series
# side by side, one column per field.
_tables_by_field = {
    'top_Life': life,
    'top_Health': health,
    'top_Physical': physical,
    'top_Social': social,
}
dist = pd.DataFrame({
    field_name: table.stack()
    for field_name, table in _tables_by_field.items()
})

In [None]:
# The 27 countries kept for the EU subset (the United Kingdom is not listed).
eu = [
    'Austria', 'Italy', 'Belgium', 'Latvia', 'Bulgaria', 'Lithuania',
    'Croatia', 'Luxembourg', 'Cyprus', 'Malta', 'Czech Republic',
    'Netherlands', 'Denmark', 'Poland', 'Estonia', 'Portugal', 'Finland',
    'Romania', 'France', 'Slovakia', 'Germany', 'Slovenia', 'Greece',
    'Spain', 'Hungary', 'Sweden', 'Ireland',
]

# Keep only rows whose first index level is one of those countries.
_eu_rows = idx[eu, :]
disteu = dist.loc[_eu_rows, :]

In [None]:
# Select the 'Q4' rows for every country, pivot the second index level into
# columns and draw one bar group per country.
_q4_rows = disteu.loc[idx[:, ['Q4']], :]
_q4_wide = _q4_rows.unstack(1)
_q4_wide.plot.bar()

In [None]:
from InternationalityData import DB_joinJournals, DB_GetInternationalityData, maxOrMin
from InternationalityCalculations import SubsetJournalsByMinDocuments, CalcJournalInternationality
import pandas as pd
def getJournalDistsCountries(field, period,method,quantiles=4, conn=None):
    """Spread each country's journal output over quantiles of a journal indicator.

    Journals for ``field``/``period`` are loaded, filtered with
    ``SubsetJournalsByMinDocuments(d, 30)`` and cut into ``quantiles``
    equal-frequency bins of ``CalcJournalInternationality(d, method)``.

    Parameters
    ----------
    field, period
        Passed through to ``DB_GetInternationalityData``.
    method
        Indicator name passed to ``CalcJournalInternationality``; it is also
        looked up in ``maxOrMin`` to decide the orientation of the quantiles.
    quantiles : int, default 4
        Number of bins handed to ``pd.qcut``.
    conn : optional
        Database connection to reuse. When ``None`` (the default, matching the
        previous behaviour) a new one is opened with ``DB_joinJournals()``.
        Passing a connection avoids opening one per call inside loops.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('Country', 'field', 'method', 'period'), with columns
        'Q<quantiles>' ... 'Q1' (each column total divided out, i.e. the
        fraction per quantile), plus 'Documents' (column sums) and 'Journals'
        (number of journals with a value > 0).
        Presumably the values of ``d['countries']`` are document counts per
        journal and country -- TODO confirm.
    """
    if conn is None:
        conn = DB_joinJournals()

    # NOTE(review): GlobalizationJournalIndicator calls this loader with five
    # positional arguments (..., True, False, conn); confirm that the
    # connection really belongs in fourth position here.
    d = DB_GetInternationalityData(field, period,True, conn)
    d = SubsetJournalsByMinDocuments(d, 30)

    # Integer bin label per journal: 0 = lowest indicator values.
    qu = pd.qcut(CalcJournalInternationality(d, method), quantiles, labels=False)

    if maxOrMin[method] == 'min':
        # Reverse the bin order for 'min' methods. The labels become
        # 1..quantiles rather than 0..quantiles-1, which is harmless because
        # only their sort order is used below.
        qu = quantiles - qu

    # Work on a copy so the loaded data is not left with an extra 'qu' column.
    df = d['countries'].copy()
    df.loc[:, 'qu'] = qu

    per_quantile = df.groupby('qu').sum()
    df2 = per_quantile / per_quantile.sum()
    # Groups are sorted ascending, so the first group is labelled Q<quantiles>
    # and the last one Q1.
    df2.index = ['Q{}'.format(x) for x in range(quantiles, 0, -1)]

    # Assignment aligns on column labels, so the helper 'qu' entry is dropped.
    df2.loc['Documents',:] = df.sum()
    df2.loc['Journals',:] = df.apply(lambda x: x[x>0].shape[0])

    df3 = df2.T.reset_index()
    df3['period'] = period
    df3['field'] = field
    df3['method'] = method
    return df3.set_index(['Country','field','method','period'])
# Example run for a single field / year / method to inspect the result.
getJournalDistsCountries(field='top_Life', period=2017, method='euclid')


In [None]:
from tqdm import tqdm

# Write one CSV per indicator method, covering every year 2005-2017 and every
# field.
# NOTE: `df` and `dfs` are reassigned here, replacing any frame of the same
# name defined by earlier cells.
for method in tqdm(['euclid','cosine','GiniSimpson','weightGini','top3','instTOP3','shareEnglish','localShare']):
    dfs = []
    for yr in range(2005,2018):
        for field in ['top_Social','top_Life','top_Health','top_Physical','All']:
            # Fix: pass the loop's `method`. It used to be hard-coded to
            # 'euclid', so every country_dists_<method>.csv held the euclid
            # results regardless of its file name.
            # NOTE(review): each method must be a key of `maxOrMin` and be
            # supported by CalcJournalInternationality -- confirm for all eight.
            dfs.append(getJournalDistsCountries(field,yr,method))

    df = pd.concat(dfs)
    df.to_csv('country_dists_{}.csv'.format(method))

## Journal-level globalization indicators

In [2]:
from InternationalityCalculations import DB_joinJournals,SubsetJournalsByMinDocuments,CalcJournalInternationality
from InternationalityData import DB_joinJournals,DB_GetInternationalityData
import pandas as pd
from tqdm import tqdm
def GlobalizationJournalIndicator(field,period,method,conn=None):
    """Compute one indicator for every journal of a field and period.

    Parameters
    ----------
    field, period
        Passed through to ``DB_GetInternationalityData``.
    method
        Indicator name handed to ``CalcJournalInternationality``.
    conn : optional
        Database connection; a new one is opened with ``DB_joinJournals()``
        when omitted.

    Returns
    -------
    pandas.DataFrame
        The indicator values as a single column, indexed by
        ('method', 'field', 'period', 'ISSN').
    """
    connection = DB_joinJournals() if conn is None else conn

    data = DB_GetInternationalityData(field, period, True, False, connection)
    data = SubsetJournalsByMinDocuments(data, 30)

    # The method name is also stored in the data dict itself.
    data['method'] = method

    if method == 'localShare':
        # ISSNs removed before computing 'localShare' (by the variable name,
        # journals whose publishing country is unknown). ISSNs not present in
        # the data are silently skipped.
        unknownPubCountry = ['1696-2737', '1881-8366', '1604-7982', '1735-4331', '0367-5793', '1738-3102', '1790-8140',
                            '1813-8586', '0478-3522', '1732-8705', '2084-3925', '1897-1059']
        for table_name in ('total', 'countries'):
            data[table_name] = data[table_name].drop(unknownPubCountry, axis='index', errors='ignore')

    indicator = CalcJournalInternationality(data, method).to_frame()
    indicator.index = indicator.index.rename('ISSN')

    # Tag each row with the run parameters; these become index levels below.
    for column, value in (('method', method), ('field', field), ('period', period)):
        indicator.loc[:, column] = value

    flat = indicator.reset_index()
    return flat.set_index(['method','field','period','ISSN'])

# One shared connection for all indicator computations below.
db = DB_joinJournals()

# NOTE(review): `glob` and `flds` are not used by the loop below, and `flds`
# is immediately overwritten; kept in case other cells rely on them.
glob = pd.read_csv('data/index.csv').set_index(['country_code','field_code','method_code','period'])['value']
flds = [col for col in glob.index.get_level_values('field_code').unique()]
flds = ['bro']

# Write one CSV per (method, year) with the journal-level indicator for
# every field.
for method in ['euclid','cosine','GiniSimpson','top3','instTOP3','shareEnglish','localShare']:
    print(f'starting {method}')
    for yr in [2017]:#list(range(2005,2018)):
        # Fix: start a fresh list for every output file. The list used to be
        # created once before the loops, so each file also contained the rows
        # of every previously processed method, and merging the files
        # afterwards duplicated those rows.
        dfs = []
        for fld in tqdm(['top_Life','top_Health','top_Social','top_Physical','All']):
            dfs.append(GlobalizationJournalIndicator(fld,yr,method,db))
        pd.concat(dfs).to_csv(f'data/AllJournalIndicators_{method}_{yr}.csv')

starting euclid

  0%|          | 0/5 [00:00<?, ?it/s]
 20%|██        | 1/5 [00:55<03:41, 55.47s/it]
 40%|████      | 2/5 [02:07<03:01, 60.41s/it]
 60%|██████    | 3/5 [03:37<02:18, 69.31s/it]
 80%|████████  | 4/5 [05:03<01:14, 74.46s/it]
100%|██████████| 5/5 [06:30<00:00, 77.96s/it]starting cosine

  0%|          | 0/5 [00:00<?, ?it/s]
 20%|██        | 1/5 [01:18<05:13, 78.38s/it]
 40%|████      | 2/5 [02:34<03:53, 77.77s/it]
 60%|██████    | 3/5 [03:40<02:28, 74.29s/it]
 80%|████████  | 4/5 [04:53<01:13, 73.64s/it]
100%|██████████| 5/5 [06:26<00:00, 79.67s/it]starting GiniSimpson

  0%|          | 0/5 [00:00<?, ?it/s]
 20%|██        | 1/5 [01:21<05:27, 81.99s/it]
 40%|████      | 2/5 [02:45<04:07, 82.44s/it]
 60%|██████    | 3/5 [04:16<02:49, 84.99s/it]
 80%|████████  | 4/5 [05:50<01:27, 87.69s/it]
100%|██████████| 5/5 [07:13<00:00, 86.31s/it]starting top3

  0%|          | 0/5 [00:00<?, ?it/s]
 20%|██        | 1/5 [01:44<06:58, 104.60s/it]
 40%|████      | 2/5 [03:46<05:29, 109.86s/

In [3]:
import os

# Merge every per-method / per-year indicator file into one public CSV.
# Sorting the names makes the row order of the output reproducible;
# os.listdir returns entries in arbitrary order.
_indicator_files = sorted(f for f in os.listdir('data') if 'AllJournalIndicators' in f)

# Fix: read_csv returns header labels as strings, so an unnamed indicator
# column comes back as '0'. The previous integer key 0 never matched and the
# column was silently left unrenamed.
pd.concat(
    [pd.read_csv(f'data/{f}').rename({'0': 'globalization'}, axis=1) for f in _indicator_files]
).to_csv('../public_data/globalization_journals.csv')