In [None]:
import os
import re
import pickle
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical runtime warnings
# (e.g. divide-by-zero) raised by the cells below -- confirm that is intended.
warnings.filterwarnings('ignore')
# Render all figure text with the Arial font family.
plt.rcParams['font.family'] = ['Arial']

In [None]:
def _load_pickle(filename):
    """Unpickle and return the object stored in ./process_data/<filename>.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files produced by this project's own preprocessing.
    """
    with open(os.path.join('.','process_data',filename),'rb') as handle:
        return pickle.load(handle)


# Per-paper lookups, all keyed by paper id (see how the later cells index them).
paper_classify=_load_pickle('paper_classify_union.pkl')          # paper -> class index (used as 0/1 below)
paper_year=_load_pickle('paper_year.pkl')                        # paper -> publication year
paper_root_fos=_load_pickle('paper_root_fos_level0_multi.pkl')   # paper -> iterable of root field ids
paper_citation=_load_pickle('paper_citation_year.pkl')           # paper -> per-year citation counts
paper_venue=_load_pickle('paper_venue.pkl')                      # paper -> venue id (matched against JournalId below)

# Selections of papers; only membership (`paper in ...`) is used in this notebook.
paper_selected_dict_title_abstract=_load_pickle('paper_selected_dict_title_abstract.pkl')
paper_selected_dict_venue=_load_pickle('paper_selected_dict_venue.pkl')

# Root field id -> field name.
root_fos=_load_pickle('root_fos_level0.pkl')

In [None]:
# Root fields of study included in the analysis; kept as a set so it can be
# intersected directly with each paper's field names.
fos_list={'geology','chemistry','materials science','biology','physics','medicine'}
# Wider alternative (not used): 'geology','geography','chemistry','materials science','mathematics','biology',
# 'computer science','engineering','environmental science','physics','medicine'

# Inclusive range of publication years considered.
year_start,year_end=1990,2015
years=range(year_start,year_end+1)

In [None]:
def interval(data,confidence=0.99):
    """Return the Student-t confidence interval for the mean of `data`.

    Parameters
    ----------
    data : sequence of numbers
        Sample values.
    confidence : float, default 0.99
        Confidence level of the interval.

    Returns
    -------
    tuple
        (lower, upper) bounds centred on the sample mean, using the standard
        error of the mean as scale and len(data)-1 degrees of freedom.
    """
    centre=np.mean(data)
    std_err=stats.sem(data)
    dof=len(data)-1
    # The confidence level is the first positional argument of t.interval.
    return stats.t.interval(confidence, dof, loc=centre, scale=std_err)

def interval_diff(data1,data2,confidence=0.99):
    """Return the confidence interval for mean(data1) - mean(data2).

    The two samples are not assumed to share a variance: the scale is the
    square root of the summed squared standard errors and the degrees of
    freedom follow the Welch-Satterthwaite approximation.

    Parameters
    ----------
    data1, data2 : sequence of numbers
        The two independent samples.
    confidence : float, default 0.99
        Confidence level of the interval.

    Returns
    -------
    tuple
        (lower, upper) bounds for the difference of the means.
    """
    # Squared standard error of each sample mean.
    var_mean1=stats.sem(data1)**2
    var_mean2=stats.sem(data2)**2
    pooled=var_mean1+var_mean2

    # Welch-Satterthwaite effective degrees of freedom.
    dof=pooled**2/(var_mean1**2/(len(data1)-1)+var_mean2**2/(len(data2)-1))
    mean_gap=np.mean(data1)-np.mean(data2)

    return stats.t.interval(confidence, dof, loc=mean_gap, scale=pooled**0.5)

In [None]:
# One entry per publication-year bucket (0-4); each holds two independent lists
# of per-paper citation counts, indexed by the paper's class index (0 or 1).
result_citation={period:[[],[]] for period in range(5)}

In [None]:
# Collect, for every selected paper in the studied fields and years, the number
# of citations received in the publication year plus the `window_len` following
# years, grouped by publication-year bucket and class index.
count=0
window_len=5
for paper,classify in tqdm(paper_classify.items()):
    # Only papers present in both selection dictionaries are analysed.
    if not ((paper in paper_selected_dict_title_abstract) and (paper in paper_selected_dict_venue)):
        continue

    # Skip papers with missing metadata. `except Exception` (rather than a bare
    # `except`) keeps this best-effort skip but lets KeyboardInterrupt/SystemExit
    # through, so the long loop can still be interrupted.
    try:
        year=paper_year[paper]
        foses=paper_root_fos[paper]
    except Exception:
        continue

    if year not in years:
        continue

    # Keep the paper only if at least one of its root fields is in fos_list.
    fos_names={root_fos[fos] for fos in foses}
    if not (fos_names&fos_list):
        continue

    # Map the publication year to its bucket (matches the x tick labels of the plot).
    if year<1995:
        year_index=0
    elif year<2000:
        year_index=1
    elif year<2005:
        year_index=2
    elif year<2010:
        year_index=3
    else:
        year_index=4

    # Papers without any citation record count as zero citations.
    if paper not in paper_citation:
        result_citation[year_index][classify].append(0)
        continue

    citation_count=0
    citation_year=paper_citation[paper]
    # The range is inclusive of year+window_len, i.e. it spans window_len+1 calendar years.
    for i in range(year,year+window_len+1):
        try:
            citation_count+=citation_year[i]
        except Exception:
            # No entry for this calendar year: contributes nothing.
            continue
    result_citation[year_index][classify].append(citation_count)

In [None]:
# Long-format table: one row per paper with its period index ('Year'), class
# label ('Type') and citation count. Periods 0-4 are the publication-year
# buckets; period 5 pools all five buckets for the 'Total' bars.
# The frame is assembled with a single pd.concat: DataFrame.append was removed
# in pandas 2.0 and re-copies the whole table on every call.
type_labels=('without AI','AI-enabled')  # position = class index used in result_citation
frames=[]
for period in range(6):
    source_periods=range(5) if period==5 else (period,)
    for source in source_periods:
        for type_index,type_label in enumerate(type_labels):
            citations=result_citation[source][type_index]
            if len(citations)>0:
                frames.append(pd.DataFrame({'Year':period,'Type':type_label,'Citation':citations}))
if frames:
    df=pd.concat(frames,ignore_index=True)
else:
    df=pd.DataFrame(columns=['Year','Type','Citation'])

plt.figure(figsize=(10.5,6.5))
# Bars show the mean citation count; error bars come from interval().
g=sns.barplot(data=df, x='Year', y='Citation', hue='Type', errorbar=interval, palette=['royalblue','red'],alpha=0.6,capsize=0.2,errwidth=1.5)
g.legend_.remove()
plt.xticks(range(6),['1990-1994','1995-1999','2000-2004','2005-2009','2010-2015','Total'],fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel('5-year citation per paper',fontsize=20)
plt.xlabel('Publish year',fontsize=20)
# plt.legend(fontsize=20)
axis=plt.axis()
ymin=axis[-2]
ymax=axis[-1]
# Dashed separators between neighbouring period groups.
plt.vlines(x=np.array(range(1,6))-0.5,ymin=ymin,ymax=ymax*1.1,color='lightgrey',linestyle='--')
for period in range(6):
    df_perd=df[df['Year']==period]
    without_ai=df_perd[df_perd['Type']=='without AI']['Citation'].to_list()
    ai_enabled=df_perd[df_perd['Type']=='AI-enabled']['Citation'].to_list()
    # NOTE(review): ttest_ind is called with its default variance assumption while
    # interval_diff() uses the unequal-variance form -- confirm this is intended.
    p=stats.ttest_ind(without_ai,ai_enabled).pvalue
    # The first two labels are drawn lower than the rest.
    plt.text(x=period,y=(ymax if period>1 else ymax*0.7),s=('p=%.3f'%p if p>=0.001 else 'p<0.001'),va='center',ha='center',fontsize=15)
    # Printed columns: AI-enabled mean and CI, without-AI mean and CI,
    # (without AI - AI-enabled) difference and CI, p-value.
    print('%.3f\t%.3f-%.3f\t%.3f\t%.3f-%.3f\t%.3f\t%.3f-%.3f\tp=%.3f'%((np.mean(ai_enabled),)+interval(ai_enabled)+(np.mean(without_ai),)+interval(without_ai)+(np.mean(without_ai)-np.mean(ai_enabled),)+interval_diff(without_ai,ai_enabled)+(p,)))
plt.xlim(-0.5,5.5)
plt.ylim(ymin,ymax*1.1)
plt.tight_layout()
plt.show()

In [None]:
# JCR spreadsheet (columns used below: issn, eissn, category, 2021JIF).
JCR_data_dir=os.path.join('.','JCRdata')
JCR_data=pd.read_excel(os.path.join(JCR_data_dir,'JCR2022.xlsx'))

# Header-less MAG journal table; column names are supplied here.
MAG_data_dir=os.path.join('.','MAGdata')
journal_columns=['JournalId','Rank','NormalizedName','DisplayName','Issn','Publisher','Webpage','PaperCount','PaperFamilyCount','CitationCount','CreatedDate']
journal_data=pd.read_table(os.path.join(MAG_data_dir,'Journals.txt'),header=None,names=journal_columns)

# Match each MAG journal's Issn against the JCR issn and, separately, the JCR
# eissn; rows lacking the join key on either side are dropped beforehand.
journals_with_issn=journal_data.dropna(subset=['Issn'])
journalsJCR1=journals_with_issn.merge(JCR_data.dropna(subset=['issn']),left_on='Issn',right_on='issn',how='inner')
journalsJCR2=journals_with_issn.merge(JCR_data.dropna(subset=['eissn']),left_on='Issn',right_on='eissn',how='inner')

# Stack both match sets and keep the first row found for each JournalId.
journalsJCR=pd.concat([journalsJCR1,journalsJCR2]).drop_duplicates(subset='JournalId')

In [None]:
# Derive each journal's best (numerically smallest) JCR quartile from the
# '(Qn)' markers in its 'category' text, capped at 4; -1 marks journals
# without any quartile marker.
# A raw string is required: '\(' and '\d' are invalid escapes in a normal string literal.
quartile_pattern=re.compile(r'\(Q(\d)\)')

Qn=list()
for category in journalsJCR['category']:
    # A missing category (NaN is not a str) is treated as "no quartile" instead
    # of raising inside the regex search.
    ranks=[int(digit) for digit in quartile_pattern.findall(category)] if isinstance(category,str) else []
    Qn.append(min(ranks+[4]) if ranks else -1)

# Drop a 'Qn' column left by a previous run so the cell can be re-executed;
# drop() returns a new frame, so the insert below does not touch the merge result.
journalsJCR=journalsJCR.drop(columns='Qn',errors='ignore')
journalsJCR.insert(0,'Qn',Qn)
# Keep only journals that have a quartile.
journalsJCR=journalsJCR[journalsJCR['Qn']!=-1]

# JournalId -> best quartile.
journal_Qn=dict(zip(journalsJCR['JournalId'],journalsJCR['Qn']))

# JournalId -> value of the '2021JIF' column.
journal_IF=dict(zip(journalsJCR['JournalId'],journalsJCR['2021JIF']))

In [None]:
# Per publication year, a 4x2 integer table of paper counts:
# rows are indexed by quartile-1, columns by class index.
result_venue={year:np.zeros((4,2),dtype=int) for year in range(1990,2020)}

In [None]:
# Count, per publication year, how many selected papers of each class index
# appear in journals of each quartile.
count=0
for paper,classify in tqdm(paper_classify.items()):
    # Only papers present in both selection dictionaries are analysed.
    if not ((paper in paper_selected_dict_title_abstract) and (paper in paper_selected_dict_venue)):
        continue

    # Skip papers lacking a year, a venue with a known quartile, or field data.
    # `except Exception` (rather than a bare `except`) keeps the best-effort skip
    # but no longer swallows KeyboardInterrupt/SystemExit.
    try:
        year=paper_year[paper]
        quartile=journal_Qn[paper_venue[paper]]
        foses=paper_root_fos[paper]
    except Exception:
        continue

    if year not in years:
        continue

    # Keep the paper only if at least one of its root fields is in fos_list.
    fos_names={root_fos[fos] for fos in foses}
    if not (fos_names&fos_list):
        continue

    # Table rows are indexed by quartile-1, columns by class index.
    result_venue[year][quartile-1][classify]+=1
    count+=1

In [None]:
# Stack the yearly 4x2 count tables: axis 0 = year, axis 1 = quartile row, axis 2 = class column.
counts=np.stack([result_venue[year] for year in years])
class1_counts=counts[:,:,1]
quartile_totals=counts[:,:,0]+counts[:,:,1]

# Percentage of class-1 papers within each (year, quartile) cell and its complement.
AI_ratio_quantile=100*(class1_counts/quartile_totals)
NonAI_ratio_quantile=100-AI_ratio_quantile
# Percentage of class-1 papers per year over all quartiles, kept as a column vector.
AI_ratio_all=(100*(class1_counts.sum(axis=1)/counts.sum(axis=(1,2))))[:,np.newaxis]
NonAI_ratio_all=100-AI_ratio_all

# Per quartile: mean over years of (quartile share - overall share) and its interval().
AI_value_list=(AI_ratio_quantile-AI_ratio_all).mean(axis=0)
AI_interval_list=[interval(AI_ratio_quantile[:,q]-AI_ratio_all[:,0]) for q in range(4)]
NonAI_value_list=(NonAI_ratio_quantile-NonAI_ratio_all).mean(axis=0)
NonAI_interval_list=[interval(NonAI_ratio_quantile[:,q]-NonAI_ratio_all[:,0]) for q in range(4)]


def _error_lengths(values,intervals):
    """Turn (lower, upper) bounds into the 2xN distances from `values` that plt.errorbar expects."""
    return np.abs(np.vstack(intervals).T-values)


fig=plt.figure(figsize=(10.5,6.5))
x_positions=np.arange(4)
# Styling shared by both series.
shared_style=dict(markersize=8,markerfacecolor='white',markeredgewidth=2,elinewidth=2,capsize=10,capthick=2)
plt.errorbar(x_positions,NonAI_value_list,yerr=_error_lengths(NonAI_value_list,NonAI_interval_list),
             color='royalblue',marker='s',markeredgecolor='royalblue',ecolor='royalblue',label='without AI',zorder=2,**shared_style)
plt.errorbar(x_positions,AI_value_list,yerr=_error_lengths(AI_value_list,AI_interval_list),
             color='red',marker='o',markeredgecolor='red',ecolor='red',label='AI',zorder=1,**shared_style)

plt.xlabel('Journal quantile', fontsize = 20)
plt.ylabel('Relative paper share', fontsize  = 20)
plt.xticks(x_positions,[f'Q{i}' for i in range(1,5)],fontsize = 15)
plt.yticks(fontsize = 15)
# Draw the zero reference line across the current x extent, then pin the limits.
axis=plt.axis()
xmin,xmax=axis[0],axis[1]
plt.hlines(y=0,xmin=xmin,xmax=xmax,color='black',linestyle='--')
plt.xlim(xmin,xmax)
plt.tight_layout()
plt.show()