In [None]:
import os

import pickle
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

import warnings
# Suppress every warning category for the rest of the session. This also hides
# deprecation warnings emitted by the libraries used in the cells below.
warnings.filterwarnings('ignore')
# Use Arial for all text drawn by matplotlib.
plt.rcParams['font.family'] = ['Arial']

In [None]:
def _load_processed(file_name):
    """Unpickle ``./processed_data/<file_name>`` and return the stored object.

    NOTE: ``pickle.load`` can execute arbitrary code; only load trusted files.
    """
    with open(os.path.join('.','processed_data',file_name),'rb') as handle:
        return pickle.load(handle)


# Per-author career tuple (indexed 0..5 by the aggregation cell) and the
# per-author collection of selected fields of study.
author_career=_load_processed('author_career.pkl')
author_selected_dict_fos=_load_processed('author_selected_dict_fos.pkl')

# Per-author yearly publication and citation records, indexed by year.
author_paper_year=_load_processed('author_paper_year.pkl')
author_citation_year=_load_processed('author_citation_year.pkl')

In [None]:
# Fields of study that are analysed, and the display names used as bar-chart
# tick labels (the extra 'Total' entry is the pooled category).
fos_list=['geology','chemistry','materials science','biology','physics','medicine']
fos_name_list=['Geology','Chemistry','Materials','Biology','Physics','Medicine','Total']


def _new_field_year_buckets():
    """Return ``{field: {year: []}}`` for every field in ``fos_list`` and each year 1990-2015."""
    return {field: {yr: list() for yr in range(1990,2016)} for field in fos_list}


# Yearly publication counts, split by AI status; each call builds independent lists.
AI_pub_fos=_new_field_year_buckets()
NonAI_pub_fos=_new_field_year_buckets()

# Yearly citation counts, split the same way.
AI_cite_fos=_new_field_year_buckets()
NonAI_cite_fos=_new_field_year_buckets()

In [None]:
# Distribute each eligible author's yearly publication and citation counts into
# the per-field / per-year buckets, split by whether the year falls before the
# author's ``AI`` year ("without AI") or not ("AI-enabled").
count=0  # number of authors that pass the career filter below
for author,career in tqdm(author_career.items()):
    # Authors without yearly records are skipped. Only KeyError is caught: a bare
    # ``except`` would also swallow KeyboardInterrupt and genuine bugs.
    try:
        publication=author_paper_year[author]
        citation=author_citation_year[author]
    except KeyError:
        continue

    # Career tuple layout as consumed here; the exact meaning of each field is
    # defined by the preprocessing step that wrote the pickle -- TODO confirm.
    paper_num=int(career[0])  # not used below
    start=int(career[1])
    first=int(career[2])
    last=int(career[3])
    AI=int(career[4])
    drop=int(career[5])
    author_fos=author_selected_dict_fos[author]

    # Career filter: the career must extend past both `first` and `start`,
    # and `drop` must lie after `start` and no later than 2015.
    if last>first and last>start and drop>start and drop<=2015:
        count+=1
        # An author listed under several fields contributes to each of them.
        for fos in author_fos:
            # Years are clipped to the 1990-2015 window; `last` itself is excluded.
            for year in range(max(1990,start),min(2016,last)):
                if year<AI:
                    NonAI_pub_fos[fos][year].append(publication[year])
                    NonAI_cite_fos[fos][year].append(citation[year])
                else:
                    AI_pub_fos[fos][year].append(publication[year])
                    AI_cite_fos[fos][year].append(citation[year])

In [None]:
# Inclusive bounds of the plotting window and the matching range of years.
year_start,year_end=1990,2015
years=range(year_start,year_end+1)

In [None]:
def interval(data,confidence=0.99):
    """Two-sided Student-t confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : 1-D sequence of numbers (list, array or pandas Series).
    confidence : float, coverage probability of the interval (default 0.99).

    Returns
    -------
    (low, high) tuple.  With fewer than two observations the interval is
    undefined and ``(nan, nan)`` is returned.  When every observation is
    identical the standard error is 0, which ``stats.t.interval`` rejects as an
    invalid scale (yielding NaNs); the degenerate interval ``(mean, mean)`` is
    returned instead so that plots still show the point.
    """
    values=np.asarray(data,dtype=float)
    n=values.size
    if n<2:
        return (np.nan,np.nan)
    mean=values.mean()
    sem=stats.sem(values)
    if sem==0:
        # Zero spread: the interval collapses onto the mean.
        return (mean,mean)
    return stats.t.interval(confidence=confidence, df=n-1, loc=mean, scale=sem)

def interval_diff(data1,data2,confidence=0.99):
    """Student-t confidence interval for ``mean(data1) - mean(data2)``.

    The two samples are treated as independent with possibly unequal variances:
    the standard error is the root of the summed squared standard errors and the
    degrees of freedom follow the Welch-Satterthwaite approximation.

    Returns a ``(low, high)`` tuple.
    """
    sq_err_1=stats.sem(data1)**2
    sq_err_2=stats.sem(data2)**2
    sq_err_total=sq_err_1+sq_err_2

    # Welch-Satterthwaite effective degrees of freedom.
    welch_df=sq_err_total**2/(sq_err_1**2/(len(data1)-1)+sq_err_2**2/(len(data2)-1))
    mean_gap=np.mean(data1)-np.mean(data2)

    return stats.t.interval(confidence=confidence, df=welch_df, loc=mean_gap, scale=sq_err_total**0.5)

In [None]:
# ---- Publications: long-format table, yearly trend plots, per-field bar chart ----

# Build the long-format table with a single DataFrame construction.
# (Growing a frame with DataFrame.append inside a loop is quadratic, and
# DataFrame.append was removed in pandas 2.0.)  Row order matches the original:
# first all per-field rows, then the same observations relabelled as 'Total'.
# Because an author is bucketed once per field, an author with several fields
# contributes one row per field to 'Total'.
_pub_hue_order=['without AI','AI-enabled']  # pins royalblue -> without AI, red -> AI-enabled
_pub_sources=(('without AI',NonAI_pub_fos),('AI-enabled',AI_pub_fos))
_pub_rows=[]
for _as_total in (False,True):
    for year in years:
        for fos in sorted(fos_list):
            _field_label='Total' if _as_total else fos.capitalize()
            for _type_label,_source in _pub_sources:
                _pub_rows.extend((_field_label,_type_label,year,_value) for _value in _source[fos][year])
df_publication=pd.DataFrame(_pub_rows,columns=['Field','Type','Year','Value'])

# Make sure the output directory exists before saving figures into it.
_pub_fig_dir=os.path.join('..','figure_0108')
os.makedirs(_pub_fig_dir,exist_ok=True)


def _save_publication_trend(field_label,file_name):
    """Save the yearly mean-publication line plot (with t-interval band) for one 'Field' label."""
    plt.figure(figsize=(7,5))
    sns.lineplot(data=df_publication[df_publication['Field']==field_label], x='Year', y='Value', hue='Type',hue_order=_pub_hue_order,errorbar=interval,palette=['royalblue','red'],legend=False)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.ylabel('Number of annual publications',fontsize=18)
    plt.xlabel('Year',fontsize=20)
    plt.xlim(year_start,year_end)
    plt.ylim(0,1)
    plt.grid(True, linestyle="--", alpha=1)
    plt.tight_layout()
    plt.savefig(os.path.join(_pub_fig_dir,file_name),dpi=300)
    plt.close()


# One figure per field, then one for the pooled 'Total' category.
for fos in fos_list:
    _save_publication_trend(fos.capitalize(),f'3-publication_plot_{fos}.pdf')
_save_publication_trend('Total',f'3-publication_plot.pdf')

# Bar chart of the mean annual publications per field, AI-enabled vs without AI.
# The bar order is pinned so that it always lines up with the tick labels below.
_pub_field_order=[fos.capitalize() for fos in sorted(fos_list)]+['Total']
plt.figure(figsize=(10.5,6.5))
g=sns.barplot(data=df_publication, x='Field', y='Value', hue='Type',order=_pub_field_order,hue_order=_pub_hue_order,errorbar=interval, palette=['royalblue','red'],alpha=0.6,capsize=0.2,errwidth=1.5)
if g.legend_ is not None:
    g.legend_.remove()
plt.xticks(range(len(fos_name_list)),sorted(fos_name_list),fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel('Annual publication per researcher',fontsize=20)
plt.xlabel('Field of study',fontsize=20)
axis=plt.axis()
ymin=axis[-2]
ymax=axis[-1]
# Dashed separators between neighbouring field groups.
plt.vlines(x=np.array(range(1,len(fos_list)+1))-0.5,ymin=ymin,ymax=1.1*ymax,color='lightgrey',linestyle='--')
for i,fos in enumerate(sorted(fos_list)+['total']):
    df_fos=df_publication[df_publication['Field']==fos.capitalize()]
    AI_data=df_fos[df_fos['Type']=='AI-enabled']['Value'].to_list()
    NonAI_data=df_fos[df_fos['Type']=='without AI']['Value'].to_list()
    # NOTE(review): ttest_ind defaults to the pooled-variance test, while
    # interval_diff uses Welch degrees of freedom -- confirm this mix is intended.
    p=stats.ttest_ind(AI_data,NonAI_data).pvalue
    # Test `p<0.001` first so an undefined (NaN) p-value is shown as 'p=nan'
    # instead of being mislabelled as 'p<0.001'.
    plt.text(x=i,y=ymax,s=('p<0.001' if p<0.001 else 'p=%.3f'%p),va='center',ha='center',fontsize=15)
    # Columns: mean without AI, its CI, mean AI-enabled, its CI, (AI - without AI), its CI, p-value.
    print('%.3f\t%.3f-%.3f\t%.3f\t%.3f-%.3f\t%.3f\t%.3f-%.3f\t%.3f'%((np.mean(NonAI_data),)+interval(NonAI_data)+(np.mean(AI_data),)+interval(AI_data)+(np.mean(AI_data)-np.mean(NonAI_data),)+interval_diff(AI_data,NonAI_data)+(p,)))
plt.xlim(-0.5,len(fos_list)+0.5)
# Extra headroom so the p-value labels placed at `ymax` stay inside the axes.
plt.ylim(ymin,ymax*1.1)
plt.tight_layout()
plt.show()

In [None]:
# ---- Citations: long-format table, yearly trend plots, per-field bar chart ----

# Build the long-format table with a single DataFrame construction.
# (Growing a frame with DataFrame.append inside a loop is quadratic, and
# DataFrame.append was removed in pandas 2.0.)  Row order matches the original:
# first all per-field rows, then the same observations relabelled as 'Total'.
# Because an author is bucketed once per field, an author with several fields
# contributes one row per field to 'Total'.
_cite_hue_order=['without AI','AI-enabled']  # pins royalblue -> without AI, red -> AI-enabled
_cite_sources=(('without AI',NonAI_cite_fos),('AI-enabled',AI_cite_fos))
_cite_rows=[]
for _as_total in (False,True):
    for year in years:
        for fos in sorted(fos_list):
            _field_label='Total' if _as_total else fos.capitalize()
            for _type_label,_source in _cite_sources:
                _cite_rows.extend((_field_label,_type_label,year,_value) for _value in _source[fos][year])
df_citation=pd.DataFrame(_cite_rows,columns=['Field','Type','Year','Value'])

# Make sure the output directory exists before saving figures into it.
_cite_fig_dir=os.path.join('..','figure_0108')
os.makedirs(_cite_fig_dir,exist_ok=True)


def _save_citation_trend(field_label,file_name):
    """Save the yearly mean-citation line plot (with t-interval band) for one 'Field' label."""
    plt.figure(figsize=(7,5))
    sns.lineplot(data=df_citation[df_citation['Field']==field_label], x='Year', y='Value', hue='Type',hue_order=_cite_hue_order,errorbar=interval,palette=['royalblue','red'],legend=False)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.ylabel('Number of annual citations',fontsize=18)
    plt.xlabel('Year',fontsize=20)
    plt.xlim(year_start,year_end)
    plt.ylim(0,16)
    plt.grid(True, linestyle="--", alpha=1)
    plt.tight_layout()
    plt.savefig(os.path.join(_cite_fig_dir,file_name),dpi=300)
    plt.close()


# One figure per field, then one for the pooled 'Total' category.
for fos in fos_list:
    _save_citation_trend(fos.capitalize(),f'3-citation_plot_{fos}.pdf')
_save_citation_trend('Total',f'3-citation_plot.pdf')

# Bar chart of the mean annual citations per field, AI-enabled vs without AI.
# The bar order is pinned so that it always lines up with the tick labels below.
_cite_field_order=[fos.capitalize() for fos in sorted(fos_list)]+['Total']
plt.figure(figsize=(10.5,6.5))
g=sns.barplot(data=df_citation, x='Field', y='Value', hue='Type',order=_cite_field_order,hue_order=_cite_hue_order,errorbar=interval, palette=['royalblue','red'],alpha=0.6,capsize=0.2,errwidth=1.5)
if g.legend_ is not None:
    g.legend_.remove()
plt.xticks(range(len(fos_name_list)),sorted(fos_name_list),fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel('Annual citation per researcher',fontsize=20)
plt.xlabel('Field of study',fontsize=20)
axis=plt.axis()
ymin=axis[-2]
ymax=axis[-1]
# Dashed separators between neighbouring field groups.
plt.vlines(x=np.array(range(1,len(fos_list)+1))-0.5,ymin=ymin,ymax=1.1*ymax,color='lightgrey',linestyle='--')
for i,fos in enumerate(sorted(fos_list)+['total']):
    df_fos=df_citation[df_citation['Field']==fos.capitalize()]
    AI_data=df_fos[df_fos['Type']=='AI-enabled']['Value'].to_list()
    NonAI_data=df_fos[df_fos['Type']=='without AI']['Value'].to_list()
    # NOTE(review): ttest_ind defaults to the pooled-variance test, while
    # interval_diff uses Welch degrees of freedom -- confirm this mix is intended.
    p=stats.ttest_ind(AI_data,NonAI_data).pvalue
    # Test `p<0.001` first so an undefined (NaN) p-value is shown as 'p=nan'
    # instead of being mislabelled as 'p<0.001'.
    plt.text(x=i,y=ymax,s=('p<0.001' if p<0.001 else 'p=%.3f'%p),va='center',ha='center',fontsize=15)
    # Columns: mean without AI, its CI, mean AI-enabled, its CI, (AI - without AI), its CI, p-value.
    # The difference is reported as AI minus without-AI, the same direction as
    # the publication table (it was previously computed the other way round here).
    print('%.3f\t%.3f-%.3f\t%.3f\t%.3f-%.3f\t%.3f\t%.3f-%.3f\t%.3f'%((np.mean(NonAI_data),)+interval(NonAI_data)+(np.mean(AI_data),)+interval(AI_data)+(np.mean(AI_data)-np.mean(NonAI_data),)+interval_diff(AI_data,NonAI_data)+(p,)))
plt.xlim(-0.5,len(fos_list)+0.5)
# Extra headroom so the p-value labels placed at `ymax` stay inside the axes.
plt.ylim(ymin,ymax*1.1)
plt.tight_layout()
plt.show()