This script merges the previous regression data with the publication counts, the diversity of the next time period, and the average normalized density of newly activated disciplines.

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
# --- Input files (all relative to the notebook's working directory) ---
# Regression panel that the rest of the notebook extends.
regpath = "../../data/dropbox/Data/Derived/Publication_based/Regression/Normalized/full/reg_data_panel_1973-2017.csv"
# Tab-separated country lookup; read below for its 'Code' and 'WoS' columns.
flagpath = "../../data/dropbox/Data/Additional_data/country_WoS_ECI_WB_flags.tsv"
# Publication counts, aggregated to periods further down.
pubcntpath = "../../data/dropbox/Data/Derived/Cleaned_raw_data/pubcnt_full.csv"
# Gini values, merged in as the next-period value.
ginipath = "../../data/dropbox/Data/Derived/Publication_based/Gini/Normalized/agg_gini_full.csv"
# Average normalized density values, merged in on the current period.
densitypath = "../../data/dropbox/Data/Derived/Publication_based/Transition/cntry_densitynormed_avg.csv"

# --- Output file ---
# Destination of the extended regression table written by the last cell.
outputpath = "../../data/dropbox/Data/Derived/Publication_based/Regression/Normalized/full/reg_data_new.csv"

In [16]:
# The four comma-separated inputs need no special parsing options, so they
# are loaded in one pass and unpacked in the same order as the path tuple.
reg_df, pub_cnt, gini_df, density_df = (
    pd.read_csv(csv_path)
    for csv_path in (regpath, pubcntpath, ginipath, densitypath)
)

# The country lookup table is tab-separated, hence the explicit separator.
flag_df = pd.read_csv(flagpath, sep="\t")

In [18]:
# Attach each country's 'WoS' name to the regression rows via the shared
# 'Code' key. This is an inner join (the merge default), so rows whose code
# is absent from the lookup table are dropped.
reg_df = pd.merge(
    reg_df,
    flag_df.loc[:, ['Code', 'WoS']],
    on=['Code'],
)

In [19]:
# Ordered list of the period labels present in the regression data, with
# '2013-2017' added at the end so the last observed period has a successor.
# NOTE(review): this presumes '2013-2017' is not already among reg_df's
# periods and that the labels sort chronologically as strings - confirm.
yearlist = sorted(reg_df.period.unique().tolist()) + ['2013-2017']

Assign the next time period to each row of the current regression data.

In [20]:
# For every row, find its period in the ordered list and store the label
# that comes immediately after it as the "next period" (t1).
reg_df['period_t1'] = reg_df['period'].map(
    lambda current: yearlist[yearlist.index(current) + 1]
)

Aggregate the annual publication counts into publication counts per time period.

In [21]:
# Total paper count per (country, year); collapses any finer-grained rows
# that share the same country and year.
cntry_cnt = pub_cnt.groupby(['COUNTRY', 'YEAR'])['PAPER_CNT'].sum().reset_index()

# Build one small frame per period and concatenate them once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies everything
# accumulated so far on each iteration and needs an empty seed frame.
period_frames = []
for year in yearlist:
    # Period labels have the form "<start>-<end>"; both bounds are inclusive
    # because Series.between is inclusive by default.
    start, end = year.split("-")
    data = cntry_cnt[cntry_cnt['YEAR'].between(int(start), int(end))]
    data = data.groupby(['COUNTRY'])['PAPER_CNT'].sum().reset_index()
    data['period'] = year
    period_frames.append(data)

# Reverse so the last period comes first, which is the row order the previous
# prepend-in-loop implementation produced (see the preview below).
if period_frames:
    cntry_period_df = pd.concat(period_frames[::-1])
else:
    # Keep the old result for an empty period list instead of raising.
    cntry_period_df = pd.DataFrame()
cntry_period_df.head()

Unnamed: 0,COUNTRY,PAPER_CNT,period
0,ALL COUNTRIES,7710529,2013-2017
1,Afghanistan,308,2013-2017
2,Albania,898,2013-2017
3,Algeria,14520,2013-2017
4,Andorra,58,2013-2017


Merge the publication counts with the regression data.

In [22]:
# Use the same country key name as reg_df so the frames can be joined on it.
cntry_period_df = cntry_period_df.rename(columns={'COUNTRY': 'WoS'})

# Join the period counts twice: first on the current period, then on the
# next period. Both joins bring in a 'PAPER_CNT' column, so pandas suffixes
# them with _x (first join, t0) and _y (second join, t1); the final rename
# gives those suffixed columns their descriptive names.
reg_df = (
    reg_df
    .merge(cntry_period_df, on=['WoS', 'period'])
    .merge(
        cntry_period_df.rename(columns={'period': 'period_t1'}),
        on=['WoS', 'period_t1'],
    )
    .rename(columns={'PAPER_CNT_x': 'pub_cnt_t0', 'PAPER_CNT_y': 'pub_cnt_t1'})
)
reg_df.head()

Unnamed: 0,Code,date,period,nm_change,shm_change,ne_change,Income_t0,sum_adv_t0,growth_rate,IncomeGroup,gini,ECI,diversity,growth,Income_t0_log,WoS,period_t1,pub_cnt_t0,pub_cnt_t1
0,AFG,0,1973-1977,-2.0,-4.0,-2.0,2352889000.0,23.0,0.176126,L,0.910166,,0.089834,0.176126,9.371601,Afghanistan,1978-1982,44,39
1,AFG,5,1998-2002,15.0,6.0,0.0,4055180000.0,3.0,0.207178,L,0.987081,,0.012919,0.207178,9.60801,Afghanistan,2003-2007,3,44
2,AFG,6,2003-2007,11.0,8.0,0.0,6534128000.0,24.0,0.367859,L,0.914178,,0.085822,0.367859,9.815188,Afghanistan,2008-2012,44,175
3,AFG,7,2008-2012,-9.0,0.0,1.0,15242160000.0,43.0,0.120182,L,0.842701,,0.157299,0.120182,10.183046,Afghanistan,2013-2017,175,308
4,AGO,1,1978-1982,2.0,1.0,-2.0,5677156000.0,5.0,0.086293,LM,0.977281,-1.566572,0.022719,0.086293,9.754131,Angola,1983-1987,5,11


Merge the Gini value of the next period with the regression data.

In [23]:
# Relabel the Gini table so that its period column lines up with the rows'
# *next* period; the joined value therefore lands in 'gini_t1'.
reg_df = pd.merge(
    reg_df,
    gini_df.rename(columns={'COUNTRY': 'WoS', 'YEAR': 'period_t1', 'GINI': 'gini_t1'}),
    on=['WoS', 'period_t1'],
)
reg_df.head()

Unnamed: 0,Code,date,period,nm_change,shm_change,ne_change,Income_t0,sum_adv_t0,growth_rate,IncomeGroup,...,ECI,diversity,growth,Income_t0_log,WoS,period_t1,pub_cnt_t0,pub_cnt_t1,gini_t1,ST
0,AFG,0,1973-1977,-2.0,-4.0,-2.0,2352889000.0,23.0,0.176126,L,...,,0.089834,0.176126,9.371601,Afghanistan,1978-1982,44,39,0.936461,Others
1,AFG,5,1998-2002,15.0,6.0,0.0,4055180000.0,3.0,0.207178,L,...,,0.012919,0.207178,9.60801,Afghanistan,2003-2007,3,44,0.914178,Others
2,AFG,6,2003-2007,11.0,8.0,0.0,6534128000.0,24.0,0.367859,L,...,,0.085822,0.367859,9.815188,Afghanistan,2008-2012,44,175,0.842701,Others
3,AFG,7,2008-2012,-9.0,0.0,1.0,15242160000.0,43.0,0.120182,L,...,,0.157299,0.120182,10.183046,Afghanistan,2013-2017,175,308,0.824282,Others
4,AGO,1,1978-1982,2.0,1.0,-2.0,5677156000.0,5.0,0.086293,LM,...,-1.566572,0.022719,0.086293,9.754131,Angola,1983-1987,5,11,0.979213,Lagging


In [24]:
# Join the density table on country and *current* period: its 'CRRT_TIME'
# column is renamed to 'period' so it matches reg_df's key.
reg_df = pd.merge(
    reg_df,
    density_df.rename(columns={'COUNTRY': 'WoS', 'CRRT_TIME': 'period'}),
    on=['WoS', 'period'],
)

In [25]:
# Order rows by country code and date, then append the derived columns in a
# single chained step (keyword order determines the resulting column order).
reg_df = (
    reg_df
    .sort_values(by=['Code', 'date'])
    .assign(
        # log10 of the ratio between next-period and current-period counts
        pub_growth=lambda frame: np.log10(frame['pub_cnt_t1'] / frame['pub_cnt_t0']),
        # log10 of the publication counts themselves
        pub_cnt_t0_log=lambda frame: np.log10(frame['pub_cnt_t0']),
        pub_cnt_t1_log=lambda frame: np.log10(frame['pub_cnt_t1']),
        # next-period diversity, defined here as one minus the next-period Gini
        diversity_t1=lambda frame: 1 - frame['gini_t1'],
    )
)

In [26]:
# Preview the first five rows of the finished table.
reg_df.head(n=5)

Unnamed: 0,Code,date,period,nm_change,shm_change,ne_change,Income_t0,sum_adv_t0,growth_rate,IncomeGroup,...,period_t1,pub_cnt_t0,pub_cnt_t1,gini_t1,ST,Density_norm,pub_growth,pub_cnt_t0_log,pub_cnt_t1_log,diversity_t1
0,AFG,0,1973-1977,-2.0,-4.0,-2.0,2352889000.0,23.0,0.176126,L,...,1978-1982,44,39,0.936461,Others,1.271279,-0.052388,1.643453,1.591065,0.063539
1,AFG,5,1998-2002,15.0,6.0,0.0,4055180000.0,3.0,0.207178,L,...,2003-2007,3,44,0.914178,Others,0.75277,1.166331,0.477121,1.643453,0.085822
2,AFG,6,2003-2007,11.0,8.0,0.0,6534128000.0,24.0,0.367859,L,...,2008-2012,44,175,0.842701,Others,0.854465,0.599585,1.643453,2.243038,0.157299
3,AFG,7,2008-2012,-9.0,0.0,1.0,15242160000.0,43.0,0.120182,L,...,2013-2017,175,308,0.824282,Others,0.621515,0.245513,2.243038,2.488551,0.175718
4,AGO,1,1978-1982,2.0,1.0,-2.0,5677156000.0,5.0,0.086293,LM,...,1983-1987,5,11,0.979213,Lagging,0.417888,0.342423,0.69897,1.041393,0.020787


In [None]:
# Write the extended regression table to disk without the row index.
reg_df.to_csv(path_or_buf=outputpath, index=False)