In [None]:
import os
import pickle
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy import stats
import pandas as pd
import seaborn as sns

import warnings
# Suppress all warnings from this point on.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Default font family for all matplotlib text.
plt.rcParams['font.family'] = ['Arial']

In [None]:
def _load_pickle(filename):
    """Unpickle and return the object stored in ./process_data/<filename>."""
    with open(os.path.join('.','process_data',filename),'rb') as handle:
        return pickle.load(handle)


# Per-paper lookups keyed by paper id (as used by the aggregation loop further down):
# class index, publication year, fields of study and team composition.
paper_classify=_load_pickle('paper_classify_union.pkl')
paper_year=_load_pickle('paper_year.pkl')
paper_root_fos=_load_pickle('paper_root_fos_level0_multi.pkl')
paper_team=_load_pickle('paper_team_last.pkl')

# Not referenced in the cells shown here; presumably per-author career data (verify).
author_career=_load_pickle('author_career.pkl')

# Paper selections; only membership (`paper in ...`) is tested later on.
paper_selected_dict_title_abstract=_load_pickle('paper_selected_dict_title_abstract.pkl')
paper_selected_dict_venue=_load_pickle('paper_selected_dict_venue.pkl')

# Lookup applied to each field of a paper before intersecting with the fields under study.
root_fos=_load_pickle('root_fos_level0.pkl')

In [None]:
fos_list=['geology','chemistry','materials science','biology','physics','medicine']
fos_name_list=['Geology','Chemistry','Materials','Biology','Physics','Medicine','Total']


def _year_bins():
    """Return {year: [[], []]} for every year 1800..2021, with fresh lists for each year.

    The two inner lists are indexed by the paper's class index; the plotting cell
    labels index 0 as 'NonAI' and index 1 as 'AI'.
    """
    return {yr: [[], []] for yr in range(1800, 2022)}


def _fos_year_bins():
    """Return {field: {year: [[], []]}} with an independent year table per field in fos_list."""
    return {field: _year_bins() for field in fos_list}


# Whole-team sizes (team[0]+team[1]), per field and overall.
result=_fos_year_bins()
result_total=_year_bins()

# team[0] head-counts, per field and overall.
result_support=_fos_year_bins()
result_support_total=_year_bins()

# team[1] head-counts, per field and overall.
result_lead=_fos_year_bins()
result_lead_total=_year_bins()

In [None]:
fos_set=set(fos_list)
# Bucket every selected paper's team composition by publication year and class index,
# both overall and per root field of study.
for paper,classify in tqdm(paper_classify.items()):
    # Only papers present in both selection dictionaries are analysed.
    if (paper in paper_selected_dict_title_abstract) and (paper in paper_selected_dict_venue):
        # Papers missing field, year or team metadata are skipped. Only KeyError is
        # caught: a bare `except:` would also swallow KeyboardInterrupt (making this
        # long loop impossible to stop) and hide genuine bugs.
        try:
            foses=paper_root_fos[paper]
            year=paper_year[paper]
            team=paper_team[paper]
        except KeyError:
            continue

        # team[0] feeds the *_support tables and team[1] the *_lead tables.
        team_size=team[0]+team[1]
        if team_size>0:
            # Map each field to its root field and keep only the fields under study.
            foses=set([root_fos[fos] for fos in foses])&fos_set
            if len(foses)>0:
                # Overall tables count the paper once, even when it spans several fields.
                result_total[year][classify].append(team_size)
                result_support_total[year][classify].append(team[0])
                result_lead_total[year][classify].append(team[1])
                # Per-field tables count the paper once for every matching field.
                for fos in foses:
                    result[fos][year][classify].append(team_size)
                    result_support[fos][year][classify].append(team[0])
                    result_lead[fos][year][classify].append(team[1])

In [None]:
# Inclusive publication-year window iterated when the plotting table is built.
year_start,year_end=1990,2015
years=range(year_start,year_end+1)

In [None]:
def interval(data,confidence=0.99):
    """Two-sided Student-t confidence interval for the mean of `data`.

    Parameters
    ----------
    data : sequence of numbers
        Sample values; `len(data)` must be defined.
    confidence : float, default 0.99
        Coverage probability of the interval.

    Returns
    -------
    tuple
        `(lower, upper)` bounds centred on the sample mean.
    """
    centre=np.mean(data)
    std_err=stats.sem(data)
    dof=len(data)-1
    return stats.t.interval(confidence=confidence, df=dof, loc=centre, scale=std_err)

def interval_diff(data1,data2,confidence=0.99):
    """Welch (unequal-variance) t confidence interval for mean(data1) - mean(data2).

    The standard error is the root of the summed squared standard errors of the two
    sample means, and the degrees of freedom follow the Welch-Satterthwaite formula.

    Parameters
    ----------
    data1, data2 : sequence of numbers
        The two independent samples.
    confidence : float, default 0.99
        Coverage probability of the interval.

    Returns
    -------
    tuple
        `(lower, upper)` bounds for the difference of means.
    """
    se_sq1=stats.sem(data1)**2
    se_sq2=stats.sem(data2)**2
    se_sq_sum=se_sq1+se_sq2

    # Welch-Satterthwaite approximation of the effective degrees of freedom.
    dof=se_sq_sum**2/(se_sq1**2/(len(data1)-1)+se_sq2**2/(len(data2)-1))
    mean_gap=np.mean(data1)-np.mean(data2)

    return stats.t.interval(confidence=confidence, df=dof, loc=mean_gap, scale=se_sq_sum**0.5)

In [None]:
# Long-format table with one row per stored head-count: 'Role' names the source table,
# 'Type' the class index (0 -> 'NonAI', 1 -> 'AI'), 'Year' the publication year.
# The columns are accumulated in plain lists and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and re-copying the frame on every
# append is quadratic in the number of rows.
# 'Seinor' (sic) is only an internal key; the tick label drawn for it is 'Established'.
role_tables=[('Total',result_total),('Seinor',result_lead_total),('Junior',result_support_total)]
type_labels=[('NonAI',0),('AI',1)]
role_col,type_col,year_col,value_col=[],[],[],[]
for year in years:
    for role_name,table in role_tables:
        for type_name,class_idx in type_labels:
            counts=table[year][class_idx]
            role_col.extend([role_name]*len(counts))
            type_col.extend([type_name]*len(counts))
            year_col.extend([year]*len(counts))
            value_col.extend(counts)
df_size_total=pd.DataFrame({'Role':role_col,'Type':type_col,'Year':year_col,'Value':value_col},columns=['Role','Type','Year','Value'])

plt.figure(figsize=(10.5,6))
# order/hue_order are pinned so the bars always line up with the hard-coded tick labels
# and palette, regardless of which combination happens to appear first in the data.
g=sns.barplot(data=df_size_total, x='Role', y='Value', hue='Type',order=['Total','Seinor','Junior'],hue_order=['NonAI','AI'],errorbar=interval, palette=['royalblue','red'], alpha=0.6,capsize=0.2,errwidth=1.5)
g.legend_.remove()
plt.xticks(range(3),['Total','Established','Junior'],fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel('Number in each team',fontsize=20)
plt.xlabel('Role of researcher',fontsize=20)
# plt.legend(fontsize=20,ncol=2,loc='upper center',bbox_to_anchor=(0.5,1.15))
# plt.axis() returns (xmin, xmax, ymin, ymax); the y-limits are reused to place the
# p-value labels and to leave 10% headroom above the bars.
axis=plt.axis()
ymin=axis[-2]
ymax=axis[-1]
# Dashed separators between the role groups.
plt.vlines(x=np.array(range(3))-0.5,ymin=ymin,ymax=1.1*ymax,color='lightgrey',linestyle='--')
for i,role in enumerate(['Total','Seinor','Junior']):
    df_role=df_size_total[df_size_total['Role']==role]
    AI_data=df_role[df_role['Type']=='AI']['Value'].to_list()
    NonAI_data=df_role[df_role['Type']=='NonAI']['Value'].to_list()
    # NOTE(review): ttest_ind defaults to the pooled-variance (equal_var=True) test, while
    # interval_diff reports a Welch unequal-variance interval; confirm the mismatch is intended.
    p=stats.ttest_ind(AI_data,NonAI_data).pvalue
    plt.text(x=i,y=ymax,s=('p=%.3f'%p if p>=0.001 else 'p<0.001'),va='center',ha='center',fontsize=15)
    # One tab-separated row per role: NonAI mean and CI, AI mean and CI,
    # NonAI-AI difference and its CI, then the p-value.
    print('%.3f\t%.3f-%.3f\t%.3f\t%.3f-%.3f\t%.3f\t%.3f-%.3f\tp=%.3f'%((np.mean(NonAI_data),)+interval(NonAI_data)+(np.mean(AI_data),)+interval(AI_data)+(np.mean(NonAI_data)-np.mean(AI_data),)+interval_diff(NonAI_data,AI_data)+(p,)))
plt.xlim(-0.5,2.5)
plt.ylim(ymin,ymax*1.1)
plt.tight_layout()
plt.show()