In [1]:

import itertools
import re
import sys, time, json
from glob import glob

import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import seaborn as sns

In [2]:
# Load the curated benchmarking scores (gzip-compressed CSV) and preview
# the first rows.
benchmarking_m10_1 = pd.read_csv(
    "../data/openAI_result_curated/Interferone/Benchmarking_M10.1.csv.gz",
    compression="gzip",
)
benchmarking_m10_1.head()


Unnamed: 0.1,Unnamed: 0,GeneSymbol,Statements,API_3x,API_5x,Manual_US,Manual_Thailand,Manual_Qatar,Claude-3
0,0,DDX58,a. Association with type I interferon responses,10.0,10.0,10.0,10.0,9.0,10.0
1,1,DDX58,b. Association with type II interferon responses,3.666667,5.8,5.666667,4.333333,3.333333,4.0
2,2,DDX58,c. Association with type III interferon responses,5.666667,6.2,7.333333,7.666667,5.333333,7.0
3,3,DDX58,d. Relevance to circulating leukocytes immune ...,8.666667,8.4,8.666667,6.0,6.0,8.0
4,4,DDX58,e. Used as a biomarker in clinical settings,1.666667,3.0,6.0,2.333333,2.0,3.0


In [43]:
# Distinct statement labels, in order of first appearance.
benchmarking_m10_1["Statements"].unique()

array(['a. Association with type I interferon responses',
       'b. Association with type II interferon responses',
       'c. Association with type III interferon responses',
       'd. Relevance to circulating leukocytes immune biology',
       'e. Used as a biomarker in clinical settings',
       'f. Potential value as a blood transcriptional biomarker',
       'g. Known drug target',
       'h. Therapeutically relevant for immune system diseases'],
      dtype=object)

In [44]:
## question color
# Fixed hex colour for each statement label; the key order below is the
# order used for colour-scale domains built from this mapping.
q_color = dict([
    ('a. Association with type I interferon responses', '#de2d26'),
    ('b. Association with type II interferon responses', '#fc9272'),
    ('c. Association with type III interferon responses', '#fee5d9'),
    ('d. Relevance to circulating leukocytes immune biology', '#756bb1'),
    ('e. Used as a biomarker in clinical settings', '#74c476'),
    ('f. Potential value as a blood transcriptional biomarker', '#c7e9c0'),
    ('g. Known drug target', '#c994c7'),
    ('h. Therapeutically relevant for immune system diseases', '#dd1c77'),
])

In [40]:
# Pairwise correlation matrix (DataFrame.corr default) across the six
# score columns, over all rows of the benchmarking table.
corr_inter = benchmarking_m10_1[
    ['API_3x', 'API_5x', 'Manual_US', 'Manual_Thailand', 'Manual_Qatar', 'Claude-3']
].corr()

# Long format for plotting: one row per (callMode1, callMode2) pair.
corr_interX = (
    corr_inter
    .unstack()
    .reset_index()
    .rename(columns={'level_0':'callMode1','level_1':'callMode2',0:'pearsonCorr'})
)

# Heatmap of the correlation matrix.
alt.Chart(corr_interX).mark_rect().encode(
    x=alt.X("callMode1", axis=alt.Axis(title='', labelFontSize=14)),
    y=alt.Y("callMode2", axis=alt.Axis(title='', labelFontSize=14)),
    color=alt.Color(
        'pearsonCorr',
        scale=alt.Scale(scheme="tealblues", clamp=True),
        legend=alt.Legend(title="Pearson Correlation", titleOrient="right"),
    ),
    tooltip=['pearsonCorr'],
)

In [23]:
# Every unordered pair of score columns to compare.
mode_pair = list(itertools.combinations(['API_3x', 'API_5x', 'Manual_US',
       'Manual_Thailand', 'Manual_Qatar', 'Claude-3'],2))

# Per-statement Pearson correlation for each pair of score columns.
# For a pair "A:B" the column "A:B" holds r rounded to 2 decimals and
# "A:B_p" holds the unrounded p-value. Results are collected as one record
# per statement and turned into a DataFrame once, instead of enlarging the
# frame one cell at a time with .loc.
pair_rows = {}
for kname, kgrp in benchmarking_m10_1.groupby("Statements"):
    pair_rows[kname] = {}
    for kpair in mode_pair:
        kapr_name = ":".join(kpair)
        col1 = kgrp[kpair[0]].values
        col2 = kgrp[kpair[1]].values
        r, p = sp.stats.pearsonr(col1,col2)
        pair_rows[kname][kapr_name] = round(r,2)
        pair_rows[kname][kapr_name+"_p"] = p

# Rows follow the order in which statements first appear in the table
# (groupby iterates in sorted order, so reindex explicitly).
statement_rel = pd.DataFrame(
    list(pair_rows.values()), index=list(pair_rows.keys())
).reindex(benchmarking_m10_1.Statements.unique())

In [5]:
# Persist the per-statement correlation table (r and p columns) as CSV.
statement_rel.to_csv(
    "../data/openAI_result_curated/Interferone/Benchmarking_M10.1_scoreCorr.csv"
)

In [37]:
# Long-format table of the correlation coefficients only.
# P-value columns are named "<pair>_p", so match the full "_p" suffix;
# matching a bare "p" would also drop any correlation column whose mode
# name happens to end in the letter p.
statement_relPT = (
    statement_rel.loc[:, ~statement_rel.columns.str.endswith("_p")]
    .unstack()
    .reset_index()
    .rename(columns={'level_0':'modePair','level_1':'statements',0:'pearson_r'})
)
statement_relPT

Unnamed: 0,modePair,statements,pearson_r
0,API_3x:API_5x,a. Association with type I interferon responses,0.98
1,API_3x:API_5x,b. Association with type II interferon responses,0.95
2,API_3x:API_5x,c. Association with type III interferon responses,0.93
3,API_3x:API_5x,d. Relevance to circulating leukocytes immune ...,0.97
4,API_3x:API_5x,e. Used as a biomarker in clinical settings,0.87
...,...,...,...
115,Manual_Qatar:Claude-3,d. Relevance to circulating leukocytes immune ...,0.90
116,Manual_Qatar:Claude-3,e. Used as a biomarker in clinical settings,0.68
117,Manual_Qatar:Claude-3,f. Potential value as a blood transcriptional ...,0.90
118,Manual_Qatar:Claude-3,g. Known drug target,0.73


In [38]:
# Short key for each statement: the first character of its label
# (e.g. 'a' for 'a. Association with ...').
statement_relPT['statement_key'] = [
    label[0] for label in statement_relPT['statements'].values
]
statement_relPT.head()

Unnamed: 0,modePair,statements,pearson_r,statement_key
0,API_3x:API_5x,a. Association with type I interferon responses,0.98,a
1,API_3x:API_5x,b. Association with type II interferon responses,0.95,b
2,API_3x:API_5x,c. Association with type III interferon responses,0.93,c
3,API_3x:API_5x,d. Relevance to circulating leukocytes immune ...,0.97,d
4,API_3x:API_5x,e. Used as a biomarker in clinical settings,0.87,e


In [39]:

# Heatmap of Pearson r: one column per mode pair, one row per statement key.
(
    alt.Chart(statement_relPT, width=300, height=150)
    .mark_rect()
    .encode(
        x=alt.X("modePair", axis=alt.Axis(labelFontSize=14, labelLimit=500, title='')),
        y=alt.Y("statement_key", axis=alt.Axis(labelFontSize=14, labelLimit=20, title='Statement Key')),
        color=alt.Color(
            'pearson_r',
            scale=alt.Scale(scheme='blueorange', clamp=True),
            legend=alt.Legend(title="Pearson Correlation", titleOrient="right"),
        ),
        tooltip=['modePair', 'statements', 'pearson_r'],
    )
    .configure_view(strokeWidth=0)
    .configure_axis(grid=False, domain=False)
)

In [7]:
# Display the long-format correlation table.
# NOTE(review): the saved output under this cell shows modePair labels with
# an "_r" suffix, which the current code building statement_rel does not
# produce -- the output appears to come from an earlier run; re-execute.
statement_relPT

Unnamed: 0,modePair,statements,pearson_r
0,API_3x:API_5x_r,a. Association with type I interferon responses,0.98
1,API_3x:API_5x_r,b. Association with type II interferon responses,0.95
2,API_3x:API_5x_r,c. Association with type III interferon responses,0.93
3,API_3x:API_5x_r,d. Relevance to circulating leukocytes immune ...,0.97
4,API_3x:API_5x_r,e. Used as a biomarker in clinical settings,0.87
...,...,...,...
115,Manual_Qatar:Claude-3_r,d. Relevance to circulating leukocytes immune ...,0.90
116,Manual_Qatar:Claude-3_r,e. Used as a biomarker in clinical settings,0.68
117,Manual_Qatar:Claude-3_r,f. Potential value as a blood transcriptional ...,0.90
118,Manual_Qatar:Claude-3_r,g. Known drug target,0.73


In [8]:
    
def get_lmplot(df,x,y,height=3, aspect=1):
    """Draw a seaborn lmplot of ``y`` against ``x``, coloured by statement.

    The plot is split by the "Statements" column via ``hue`` and annotated
    once with the Pearson correlation computed over all rows of ``df`` that
    have both ``x`` and ``y`` present.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``x``, ``y`` and "Statements".
    x, y : str
        Column names plotted on the x and y axes.
    height, aspect : numeric
        Passed through to ``sns.lmplot``.

    Returns
    -------
    The grid object returned by ``sns.lmplot``.
    """
    g = sns.lmplot(x=x, y=y, data=df, height=height, aspect=aspect,hue="Statements")

    # Annotate once from the full data. Going through g.map_dataframe would
    # invoke the callback once per hue level and overprint several r/p
    # labels at the same axes position.
    valid = df[x].notna() & df[y].notna()
    r, p = sp.stats.pearsonr(df.loc[valid, x], df.loc[valid, y])
    label = 'r={:.2f}, p={:.2g}'.format(r, p)
    for ax in g.axes.flat:
        ax.text(.05, .8, label, transform=ax.transAxes)
    return g

In [49]:
def giveCorrPlot(x_data, y_data):
    """Scatter two score columns of ``benchmarking_m10_1`` with an r/p label.

    Points are coloured by "Statements" using the ``q_color`` mapping, and
    the Pearson r and p-value over all rows are written as text in the
    top-left corner of the plot.

    Parameters
    ----------
    x_data, y_data : str
        Names of the columns plotted on the x and y axes.

    Returns
    -------
    The layered Altair chart (scatter layer plus text layer).
    """
    r, p = sp.stats.pearsonr(benchmarking_m10_1[x_data], benchmarking_m10_1[y_data])
    stat_label = 'r={:.2f}, p={:.2g}'.format(r, p)

    statement_scale = alt.Scale(
        domain=list(q_color.keys()),
        range=list(q_color.values()),
    )

    scatter = (
        alt.Chart(benchmarking_m10_1, width=150, height=180)
        .mark_point(filled=True)
        .encode(
            x=alt.X(x_data, axis=alt.Axis(labelFontSize=14)),
            y=alt.Y(y_data, axis=alt.Axis(labelFontSize=14)),
            color=alt.Color(
                "Statements",
                scale=statement_scale,
                legend=alt.Legend(labelLimit=500),
            ),
            tooltip=['GeneSymbol', 'Statements'],
        )
    )

    # Text layer positioned in pixel coordinates, not data coordinates.
    annotation = (
        alt.Chart()
        .mark_text(
            align="left",
            baseline="top",
            fontSize=12,
            fontWeight='normal',
            color='black',
        )
        .encode(
            x=alt.value(10),  # pixels from left
            y=alt.value(5),  # pixels from top
            text=alt.value([stat_label]),
        )
    )

    return scatter + annotation

In [51]:
# Three scatter panels, each comparing API_3x against another scoring mode.
c1, c2, c3 = [
    giveCorrPlot('API_3x', other_mode)
    for other_mode in ('API_5x', 'Manual_Qatar', 'Manual_US')
]

In [52]:

# Place the three panels side by side and apply shared view/axis styling.
corrPlot = (
    alt.hconcat(c1, c2, c3)
    .configure_view(strokeWidth=3)
    .configure_axis(grid=False, domain=False)
)
corrPlot

In [58]:
# API_3x scores per gene and statement, with the score column renamed.
dx = benchmarking_m10_1[['GeneSymbol', 'Statements', 'API_3x']].rename(
    columns={'API_3x': 'score'}
)
# Total score per gene (sum over its statements).
dx_sum = dx.groupby('GeneSymbol')['score'].sum()
dx_sum

GeneSymbol
DDX58     47.666667
DDX60     36.333333
DHX58     37.666667
FBXO6     18.000000
GBP1      44.666667
GBP4      35.333333
GBP5      42.000000
IFI35     37.000000
IFIH1     42.333333
IFIT2     44.666667
IFIT5     39.666667
IRF7      47.666667
LAP3      20.333333
OAS2      47.666667
PARP12    33.666667
PARP14    42.000000
SAMD9L    32.666667
SCO2       9.000000
STAT1     62.666667
TRIM22    44.000000
ZBP1      41.333333
Name: score, dtype: float64

In [69]:
# Bar chart of scores per gene, coloured by statement; genes are ordered
# by ascending total score taken from dx_sum.
barplot_Scores = (
    alt.Chart(dx)
    .mark_bar()
    .encode(
        y=alt.Y(
            'GeneSymbol',
            sort=list(dx_sum.sort_values().index),
            axis=alt.Axis(labelFontSize=14),
        ),
        x=alt.X('score', axis=alt.Axis(title="Total Score", labelFontSize=14)),
        color=alt.Color(
            'Statements',
            scale=alt.Scale(domain=list(q_color.keys()), range=list(q_color.values())),
            legend=alt.Legend(labelLimit=500),
        ),
    )
)

In [70]:
# Render the per-gene score bar chart.
barplot_Scores