In [1]:
# test azure
import sys, time, json
# from openai import OpenAI
import pandas as pd
import re
from glob import glob

import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt

### Comparing Automated run anad manual runs for Erythrocytes

In [2]:

# define key dictionary for each question for concrete formatting

mainkey_qestion_key = {'symbol':'geneSymbol','description':'description',
               'associated with erythroid cells or erythropoiesis':'assoc_erythropoiesis',
               'biomarker in clinical settings':'bio_marker_clinical',
                'blood transcriptional biomarker':'bio_marker_blood',
                'circulating leukocytes':'cir_leuko_immBio',
                'known drug target':'grug_target_known',
                'therapeutically relevant for immune-mediated diseases':'theraput_immune_disease',
               }


def find_keyword(sline, keyLib):
    for mk in keyLib.keys():

        # Regular expression pattern to find all combinations of the letters in 'gene'
        pattern = r'{}'.format(mk)

        # Finding all matches in the sample text
        matches = re.findall(pattern, sline, re.IGNORECASE)
        if matches:
            return keyLib[mk]
        else:
            next
    return False



def convert_stringtodict(lines, keylib):
    dict_line = {}
    for k in lines:
        ksplit = k.split(":")

        if len(ksplit) ==2:
            key_tmp = find_keyword(ksplit[0].strip("\'|\"|', |").strip(), keylib)
            val_tmp = ksplit[1].strip("\'|\"|',|{|} ").strip()
            if key_tmp and val_tmp:
                if key_tmp == "Summary":
                    dict_line[key_tmp] = val_tmp
                else:
                    try:
                        dict_line[key_tmp] = float(val_tmp)
                    except:
                        dict_line[key_tmp] = 0
            else:
                next
                # print ("error in ", ksplit)

    return dict_line

def get_qScore(q, question_dict, subkey):
    q_scores = []
    for gname in q.keys():
        for model in q[gname].keys():
            kx = convert_stringtodict(q[gname][model],question_dict)
            kx.update({'gene_name':gname,
                        'runID':model,
                        "model_version":model.lstrip("datasvc-openai-compsci-poc-").split("_")[0],
                        "subjectKey":subkey,})
            q_scores.append(kx)
    print (len(q_scores))
    return pd.DataFrame(q_scores)

In [3]:
def getScores(dfX):
    _scores = dfX.set_index("gene_name").unstack().reset_index()
    _scores = _scores.rename({"level_0":'prompt',0:'response_score'},axis=1)
    score_aggregate = _scores.groupby(['gene_name','prompt'])['response_score'].agg(['mean','std'])
    return score_aggregate.reset_index()

def get_plot(df, moduleName):
    score_aggregatePT_mean = df.pivot_table(index="gene_name",columns="prompt",values="mean")
    genesorted_Score = list(score_aggregatePT_mean.sum(axis=1).sort_values().index)

    barplot_Scores = alt.Chart(df).mark_bar().encode(
        y=alt.Y('gene_name',sort=genesorted_Score),
        x='sum(mean)',
        color='prompt'
    ).properties(title="{} scores".format(moduleName))

    scoreDetailPlot =alt.Chart(df).mark_point(filled=True).encode(
        x= alt.X("prompt"),
        y= alt.Y("gene_name",sort=genesorted_Score),
        color=alt.Color("mean", scale=alt.Scale(scheme="lightgreyred",reverse=False)),
        size=alt.Size("std",scale=alt.Scale(reverse=True)),
        tooltip=['mean','std','gene_name','prompt']
    ).properties(title="{} Mean scores : STD".format(moduleName))

    scoresX = barplot_Scores|scoreDetailPlot

    return scoresX

In [4]:
key_set = {
    'symbol':'geneSymbol',
    'summary':'description',
    'function':'description',
    'description':'description',
    'associated with erythroid cells or erythropoiesis':'assoc_erythropoiesis',
    'biomarker in clinical settings':'bio_marker_clinical',
    'blood transcriptional biomarker':'bio_marker_blood',
    'circulating leukocytes':'cir_leuko_immBio',
    'known drug target':'drug_target_known',
    'therapeutically relevant for immune-mediated diseases':'theraput_immune_disease',
}

In [5]:
allGene = []

response_dir = "../../../../g4API_app/bloodgen3Gensets/output/"

for fname in glob(response_dir+"/resp_*.json"):
    tmp = json.load(open(fname,'r'))
    fox = {}
    for key, value in tmp.items():
        key_tmp = find_keyword(key, key_set)
        if (key_tmp == "geneSymbol") or(key_tmp=='description'):
            fox[key_tmp] = value
            
        else:
            try:
                fox[key_tmp] = float(value)
            except:
                fox[key_tmp] = 0
    if len(fox) < 4:
        print ("error in ", fname)
    else:
        allGene.append(fox)

In [6]:
dxV = pd.DataFrame(allGene)
dxV_filter = dxV[~dxV.geneSymbol.isna()].sort_values(by="geneSymbol")
dxV_filter = dxV_filter.sort_values(by="assoc_erythropoiesis").set_index("geneSymbol")
score_cols = dxV_filter.columns[-6:]

print (score_cols)

Index(['assoc_erythropoiesis', 'bio_marker_clinical', 'bio_marker_blood',
       'cir_leuko_immBio', 'drug_target_known', 'theraput_immune_disease'],
      dtype='object')


In [2]:
geneBase = pd.read_csv("../../../../g4API_app/bloodgen3Gensets/data/ModuleTranscript_geneList.csv.gzip",compression="gzip").set_index("geneSymbol")
geneBase = geneBase.drop("Unnamed: 0",axis=1)

In [5]:
geneBase.index.nunique()

11465

In [3]:
geneBase.head()

Unnamed: 0_level_0,ModuleID,AggregateNumber,ModuleTitle
geneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALPP,M3.1,A29,Cell cycle
ALS2CR14,M3.1,A29,Cell cycle
ANKRD30B,M3.1,A29,Cell cycle
ARL16,M3.1,A29,Cell cycle
BCYRN1,M3.1,A29,Cell cycle


In [12]:
dxV_filter_annot = dxV_filter.join(geneBase)
dxV_filter_annot.head()

Unnamed: 0_level_0,False,description,assoc_erythropoiesis,bio_marker_clinical,bio_marker_blood,cir_leuko_immBio,drug_target_known,theraput_immune_disease,ModuleID,AggregateNumber,ModuleTitle
geneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ABCC13,0.0,The ABCC13 gene is a pseudogene in humans and ...,0.0,0.0,0.0,0.0,0.0,0.0,M13.30,A37,Erythrocytes
PLVAP,,{'official_full_name': 'plasmalemma vesicle as...,0.0,0.0,0.0,0.0,0.0,0.0,M15.100,A37,Erythrocytes
PRDX2,0.0,{'function': 'PRDX2 encodes for a member of th...,0.0,0.0,0.0,0.0,0.0,0.0,M12.11,A37,Erythrocytes
PTMS,,"Parathymosin is a small, acidic, and nuclear p...",0.0,0.0,0.0,0.0,0.0,0.0,M13.30,A37,Erythrocytes
PTPLA,0.0,PTPLA is a gene that encodes a protein with ho...,0.0,0.0,0.0,0.0,0.0,0.0,M16.96,A38,Erythrocytes


In [13]:
mod_X = pd.melt(dxV_filter_annot, id_vars='ModuleID',value_vars=['assoc_erythropoiesis', 'bio_marker_clinical', 'bio_marker_blood',
       'cir_leuko_immBio', 'drug_target_known', 'theraput_immune_disease'],value_name='score',var_name='query')
module_agg = mod_X .groupby(['ModuleID','query'])['score'].agg(['mean','std']).reset_index().fillna(0)
module_agg

Unnamed: 0,ModuleID,query,mean,std
0,M10.2,assoc_erythropoiesis,10.000000,0.000000
1,M10.2,bio_marker_blood,3.000000,0.000000
2,M10.2,bio_marker_clinical,1.000000,0.000000
3,M10.2,cir_leuko_immBio,1.000000,0.000000
4,M10.2,drug_target_known,0.000000,0.000000
...,...,...,...,...
139,M9.2,bio_marker_blood,1.037037,1.720498
140,M9.2,bio_marker_clinical,0.592593,1.047314
141,M9.2,cir_leuko_immBio,0.481481,0.935224
142,M9.2,drug_target_known,0.444444,1.050031


In [14]:
alt.Chart(module_agg).mark_point(filled=True).encode(
    x= alt.X('query'),
    y=alt.Y("ModuleID"),
    color=alt.Color("mean"),
    size=alt.Size('std',scale=alt.Scale(reverse=True)),
    tooltip=['mean','std']

)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [15]:
scrCol = ['assoc_erythropoiesis', 'bio_marker_clinical', 'bio_marker_blood',
       'cir_leuko_immBio', 'drug_target_known', 'theraput_immune_disease']
dxV_filter_annot['totalScore'] = dxV_filter_annot[scrCol].sum(axis=1)

In [16]:
# reading data from manual scoring
paper_scores = "../../../../g4API_app/bloodgen3Gensets/data/JMT_scores_EryModules.csv"
jtm_data = pd.read_csv(paper_scores).set_index("Gene symbol")
jtm_data_g4 = jtm_data[jtm_data.columns[jtm_data.columns.str.contains("GPT4")]]

In [17]:
_commonGenes = list(set(jtm_data_g4.index).intersection(set(dxV_filter_annot.index)))
print (len(_commonGenes))

141


In [18]:
jtm_data_g4['gName'] = [i.strip() for i in jtm_data_g4.index]
jtm_data_g4 = jtm_data_g4.drop_duplicates(subset="gName")
jtm_data_g4.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jtm_data_g4['gName'] = [i.strip() for i in jtm_data_g4.index]


(261, 7)

In [19]:
jtm_data_g4 = jtm_data_g4.set_index("gName")
jtm_data_g4.head()

Unnamed: 0_level_0,Score_Ery GPT4,Score_Clin BM GPT4,Score_BT BM GPT4,Score_Immuno GPT4,Score_drug target GPT4,Score_therapeutic GPT4
gName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ALAS2,10.0,0.0,4.0,0.0,4.0,2.0
BCL2L1,5.0,4.0,5.0,8.0,9.0,7.0
BPGM,10.0,0.0,5.0,0.0,0.0,1.0
C14ORF45,0.0,0.0,0.0,0.0,0.0,0.0
C1ORF128,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
_colRename = {'Score_Ery GPT4':'assoc_erythropoiesis_g4JTM', 
               'Score_Clin BM GPT4':'bio_marker_clinical_g4JTM',
               'Score_BT BM GPT4':'bio_marker_blood_g4JTM',
               'Score_Immuno GPT4':'cir_leuko_immBio_g4JTM',
               'Score_drug target GPT4':'drug_target_known_g4JTM',
               'Score_therapeutic GPT4':'theraput_immune_disease_g4JTM',}

jtm_data_g4 = jtm_data_g4.rename(_colRename,axis=1)
jtm_data_g4.head()

Unnamed: 0_level_0,assoc_erythropoiesis_g4JTM,bio_marker_clinical_g4JTM,bio_marker_blood_g4JTM,cir_leuko_immBio_g4JTM,drug_target_known_g4JTM,theraput_immune_disease_g4JTM
gName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ALAS2,10.0,0.0,4.0,0.0,4.0,2.0
BCL2L1,5.0,4.0,5.0,8.0,9.0,7.0
BPGM,10.0,0.0,5.0,0.0,0.0,1.0
C14ORF45,0.0,0.0,0.0,0.0,0.0,0.0
C1ORF128,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
jtm_data_g4.shape

(261, 6)

In [25]:
geneBase.loc[geneBase.index.intersection(jtm_data_g4.index)]

Unnamed: 0,ModuleID,AggregateNumber,ModuleTitle
MARCH8,M9.2,A37,Erythrocytes
ALAS2,M9.2,A37,Erythrocytes
BCL2L1,M9.2,A37,Erythrocytes
BPGM,M9.2,A37,Erythrocytes
C14ORF45,M9.2,A37,Erythrocytes
...,...,...,...
SLC6A9,M15.100,A37,Erythrocytes
SPTB,M15.100,A37,Erythrocytes
TBCEL,M15.100,A37,Erythrocytes
TMEM56,M15.100,A37,Erythrocytes


In [26]:
jtm_data_g4['totalScore'] = jtm_data_g4.sum(axis=1)
jtm_data_g4['ModuleID'] = geneBase.loc[geneBase.index.intersection(jtm_data_g4.index)]['ModuleID']

In [27]:
jtm_data_g4

Unnamed: 0_level_0,assoc_erythropoiesis_g4JTM,bio_marker_clinical_g4JTM,bio_marker_blood_g4JTM,cir_leuko_immBio_g4JTM,drug_target_known_g4JTM,theraput_immune_disease_g4JTM,totalScore,ModuleID
gName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ALAS2,10.0,0.0,4.0,0.0,4.0,2.0,20.0,M9.2
BCL2L1,5.0,4.0,5.0,8.0,9.0,7.0,38.0,M9.2
BPGM,10.0,0.0,5.0,0.0,0.0,1.0,16.0,M9.2
C14ORF45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M9.2
C1ORF128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M9.2
...,...,...,...,...,...,...,...,...
SHARPIN,0.0,2.0,2.0,7.0,5.0,7.0,23.0,M13.30
SLC25A39,7.0,0.0,3.0,0.0,0.0,0.0,10.0,M13.30
ST6GALNAC4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M13.30
TMEM86B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M13.30


In [29]:
dxV_filter_annot.columns

Index([                    False,             'description',
          'assoc_erythropoiesis',     'bio_marker_clinical',
              'bio_marker_blood',        'cir_leuko_immBio',
             'drug_target_known', 'theraput_immune_disease',
                      'ModuleID',         'AggregateNumber',
                   'ModuleTitle',              'totalScore'],
      dtype='object')

In [34]:
dxV_filter_annotX = dxV_filter_annot[['assoc_erythropoiesis','bio_marker_clinical','bio_marker_blood',\
                  'cir_leuko_immBio','drug_target_known', 'theraput_immune_disease',\
                      'ModuleID']]

dxV_filter_annotX_melt = dxV_filter_annotX.reset_index().melt(id_vars=['geneSymbol','ModuleID'],value_vars=['assoc_erythropoiesis','bio_marker_clinical','bio_marker_blood',\
                  'cir_leuko_immBio','drug_target_known', 'theraput_immune_disease',],var_name='question',value_name='score')

In [35]:
dxV_filter_annotX_melt.head()

Unnamed: 0,geneSymbol,ModuleID,question,score
0,ABCC13,M13.30,assoc_erythropoiesis,0.0
1,PLVAP,M15.100,assoc_erythropoiesis,0.0
2,PRDX2,M12.11,assoc_erythropoiesis,0.0
3,PTMS,M13.30,assoc_erythropoiesis,0.0
4,PTPLA,M16.96,assoc_erythropoiesis,0.0


In [None]:
get_plot(dxV_filter_annotX_melt,'M10.1')