In [1]:
from flask import render_template
from flask import request
import pandas as pd
import numpy as np
import os
import gensim
from nltk.stem import WordNetLemmatizer
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from ast import literal_eval

In [2]:
# Root of the OncoMatch data tree. Defaults to the original location; set the
# ONCOMATCH_BASE_DIR environment variable to run the notebook elsewhere.
base_dir=os.environ.get('ONCOMATCH_BASE_DIR','/Volumes/Yuchen_Drive/Insight/OncoMatch')


def load_pickle(path):
    """Unpickle ``path`` and close the file handle afterwards.

    NOTE: unpickling can execute arbitrary code; only use this on trusted files.
    """
    with open(path,"rb") as handle:
        return pickle.load(handle)


# Tabular inputs.
abstract_catalog=pd.read_csv(os.path.join(base_dir,'data/abstract_citation_annot_df_50930.csv'),index_col='pmid')
cancertype_df=pd.read_csv(os.path.join(base_dir, "data/cancerty_df2.csv"))
gene_df=pd.read_csv(os.path.join(base_dir, "data/HGNC_df2.csv"))
doctor2clinical_dict2_df=pd.read_csv(os.path.join(base_dir, 'data/doctor2clinical_dict2_df.csv'),index_col='name')

# Pickled lookup tables.
pmid2doctor_dict=load_pickle(os.path.join(base_dir, "data/pmid2doctor_dict.pkl"))
cancer2pmid_dict=load_pickle(os.path.join(base_dir, "data/cancer2pmid_dict.pkl"))
doctor2clinical_dict2=load_pickle(os.path.join(base_dir, "data/doctor2clinical_dict2.pkl"))

# Oncologist table, re-indexed by its own `name` column so rows can be selected by name.
onco_df=load_pickle(os.path.join(base_dir, "data/Oncologist_info_clinicaltrial_1470.pkl"))
onco_df.index=onco_df.name

# Sorted list of the distinct cancer types.
cancertype_list=list(cancertype_df.cancer_type.sort_values().unique())

In [3]:
cancertype_df.head()

Unnamed: 0,pmid,cancer_type
0,10079468,Leukemia
1,10079468,Chronic Myelogenous Leukemia (CML)
2,10581602,Prostate Cancer
3,10581602,Bone Cancer
4,11719088,Pancreatic Cancer


In [48]:
gene_df.head()

Unnamed: 0,pmid,HGNC
0,10037348,DNMT1
1,10079468,BCR
2,10079468,ABL1
3,10873802,AKT1
4,11050493,DNMT1


In [58]:
gene_list=list(gene_df.HGNC.unique())

In [359]:
print(len(gene_list))

205


In [3]:
# Cancer type under study and the lower-case, underscore-joined form of it.
cancer_type='Breast Cancer'
prefix="_".join(map(str.lower, cancer_type.split()))

In [331]:
cancerclinical2doctor_dict_df=pd.read_csv(os.path.join(base_dir, 'data/cancerclinical2doctor_dict_df.csv'),converters={"doctor":literal_eval},index_col='cancer_type')

In [332]:
len(cancerclinical2doctor_dict_df)

53

In [333]:
cancerclinical2doctor_dict_df.head()

Unnamed: 0_level_0,doctor
cancer_type,Unnamed: 1_level_1
Breast Cancer,"[Abenaa M. Brewster, Adam Brufsky, Aditya Bard..."
Lung Cancer,"[Abhimanyu Ghose, Adam Brufsky, Aditya Bardia,..."
Leukemia,"[Aaron Rapoport, Abdulraheem Yacoub, Abhinav D..."
Prostate Cancer,"[Abhishek A Solanki, Adam Brufsky, Aditya Bard..."
Colorectal Cancer,"[Adam Brufsky, Alok A. Khorana, Ana Maria Lope..."


In [6]:
class Oncomatch_model():
    """Score abstracts (indexed by pmid) against free text with a per-cancer-type
    LDA model and map the ranked pmids to oncologists.

    This variant reads its lookup tables from CSV files.

    NOTE(review): this notebook defines ``Oncomatch_model`` twice; whichever
    cell is executed last silently replaces the other.
    """

    def __init__(self,cancer_type,base_dir='/Volumes/Yuchen_Drive/Insight/OncoMatch'):
        """Load the models and lookup tables for ``cancer_type``.

        Parameters
        ----------
        cancer_type : str
            Cancer type label, e.g. ``'Breast Cancer'``. Its lower-cased,
            underscore-joined form is used as the model file prefix.
        base_dir : str, optional
            Root directory holding ``models_lda/`` and ``data/``. Defaults to
            the location the notebook was written against.
        """
        self.base_dir=base_dir
        self.cancer_type=cancer_type
        self.prefix="_".join(i.lower() for i in cancer_type.split())

        models_dir=os.path.join(self.base_dir,'models_lda')
        data_dir=os.path.join(self.base_dir,'data')

        self.bow_dict=gensim.corpora.Dictionary.load(os.path.join(models_dir,'{}_abstract_bow_dict.dict'.format(self.prefix)))
        self.tfidf=gensim.models.TfidfModel.load(os.path.join(models_dir,'{}_abstract_tfidf.tfidf'.format(self.prefix)))
        self.lda_model=gensim.models.LdaMulticore.load(os.path.join(models_dir,'{}_abstract_lda.lda'.format(self.prefix)))
        # Per-abstract LDA vectors; the 'pred' column is not part of the vector.
        self.lda_df=pd.read_csv(os.path.join(models_dir,"{}_abstract_ldavector_df.csv".format(self.prefix)), index_col='pmid').drop(['pred'],axis=1)
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(os.path.join(data_dir,"stop_words_lda.set"),"rb") as handle:
            self.stop_words=pickle.load(handle)

        self.onco_df=pd.read_csv(os.path.join(data_dir,"Oncologist_info_clinicaltrial_1470.csv"))
        # Index by name so oncologists can be selected with .loc[name].
        self.onco_df.index=self.onco_df.name
        # The 'doctor' column is stored as a Python-literal list in the CSV.
        self.cancerclinical2doctor_dict_df=pd.read_csv(os.path.join(data_dir,'cancerclinical2doctor_dict_df.csv'),converters={"doctor":literal_eval},index_col='cancer_type')
        self.doctor2clinical_dict2_df=pd.read_csv(os.path.join(data_dir,'doctor2clinical_dict2_df.csv'),index_col='name')
        self.pmid2doctor_dict_df=pd.read_csv(os.path.join(data_dir,'pmid2doctor_dict_df.csv'),converters={"doctor":literal_eval},index_col='pmid')

    @staticmethod
    def _pad_topics(vector, width):
        """Right-pad a 2-D array with zero columns so it has at least ``width`` columns."""
        missing=width-vector.shape[1]
        if missing>0:
            vector=np.pad(vector,((0,0),(0,missing)),mode='constant')
        return vector

    def preprocess(self, text):
        """Tokenise ``text``, lemmatise each token and drop one-character
        tokens and stop words. Returns a list of strings."""
        lemmatizer=WordNetLemmatizer()  # built once per call, not once per token
        result = []
        for token in gensim.utils.simple_preprocess(text):
            temp=lemmatizer.lemmatize(token).lower()
            if len(temp)>1 and temp not in self.stop_words:
                result.append(temp)
        return result

    def get_embedding_vector(self, text):
        """Return the LDA vector of a token list as a dense array with one row.

        ``text`` is the token list produced by :meth:`preprocess`.
        """
        text_corpus=[self.bow_dict.doc2bow(text)]
        text_tfidf=self.tfidf[text_corpus]

        text_lda=self.lda_model[text_tfidf]
        text_lda = gensim.matutils.corpus2csc(text_lda)
        text_lda = text_lda.T.toarray()

        # corpus2csc sizes its output from the ids it sees, so the vector can be
        # narrower than the stored abstract vectors when trailing topics are
        # absent; pad with zeros so the similarity computation lines up.
        return self._pad_topics(text_lda, self.lda_df.shape[1])

    def compute_similarity(self, embedding_matrix, text_vector):
        """Cosine similarity of every row of ``embedding_matrix`` with ``text_vector``."""
        return cosine_similarity(X=embedding_matrix,Y=text_vector,dense_output=False)

    def get_similarity_scores(self, text):
        """Return a DataFrame with columns ``pmid`` and ``lda_similarity`` for
        every abstract in ``self.lda_df``, sorted by descending similarity."""
        text_preprocess=self.preprocess(text)
        text_lda=self.get_embedding_vector(text_preprocess)
        text_lda_sim=self.compute_similarity(self.lda_df,text_lda)

        similarity_df=pd.DataFrame({'pmid':self.lda_df.index, 'lda_similarity':np.asarray(text_lda_sim).ravel()})
        return similarity_df.sort_values(by=['lda_similarity'],ascending=False).reset_index(drop=True)

    def get_onco_info(self, pmid_list,clinical_trial):
        """Collect the oncologists linked to a ranked list of pmids.

        Parameters
        ----------
        pmid_list : list
            pmids in ranked order; the position in the list is used as the rank.
        clinical_trial : bool
            When true and ``self.cancer_type`` is present in
            ``self.cancerclinical2doctor_dict_df``, keep only the doctors listed
            there, sort by ``pmid_rank_10th_percentile`` (descending) and add a
            ``clinical_trial_num2`` column.

        Returns
        -------
        DataFrame
            Rows of ``self.onco_df`` with the added columns ``query_pmid``,
            ``query_pmid_num``, ``pmid_rank`` and ``pmid_rank_10th_percentile``.
        """
        onco2pmid=defaultdict(list)
        onco2rank=defaultdict(list)
        for i,pmid in enumerate(pmid_list):
            for doctor in self.pmid2doctor_dict_df.loc[pmid,'doctor']:
                onco2pmid[doctor].append(pmid)
                onco2rank[doctor].append(i)

        # Copy so the column assignments below never touch (or warn about) self.onco_df.
        temp_df = self.onco_df.loc[np.array(list(onco2pmid.keys()))].copy()
        temp_df['query_pmid'] = temp_df['name'].apply(lambda x:onco2pmid[x])
        temp_df['query_pmid_num'] = temp_df['name'].apply(lambda x:len(onco2pmid[x]))
        temp_df['pmid_rank'] = temp_df['name'].apply(lambda x:onco2rank[x])
        # Cut-off is one tenth of the number of matched oncologists (rows), not
        # of len(pmid_list). NOTE(review): confirm this is the intended base.
        rank_cutoff=int(np.ceil(temp_df.shape[0]/10))
        temp_df['pmid_rank_10th_percentile'] = temp_df['pmid_rank'].apply(lambda x: sum(np.array(x)<=rank_cutoff))

        if clinical_trial and self.cancer_type in self.cancerclinical2doctor_dict_df.index:
            doctor_clin = np.intersect1d(np.array(self.cancerclinical2doctor_dict_df.loc[self.cancer_type,'doctor']), temp_df['name'].values)
            temp_df = temp_df.loc[np.array(doctor_clin)].sort_values(by=['pmid_rank_10th_percentile'],ascending=False)
            temp_df['clinical_trial_num2']=temp_df['name'].apply(lambda x: int(self.doctor2clinical_dict2_df.loc[x, self.cancer_type]))

        return temp_df

    def get_clinical_data(self, onco_name):
        """Return the rows of this cancer type's clinical-trial CSV whose
        ``name`` column equals ``onco_name``."""
        cancer_prefix="_".join(i.lower() for i in self.cancer_type.split())
        temp_df = pd.read_csv(os.path.join(self.base_dir, 'clinical_trial/{}_clinical_trial_info.csv'.format(cancer_prefix)))
        # Boolean mask instead of a formatted query string, so names that
        # contain quote characters are matched literally.
        temp_df_onco=temp_df[temp_df['name']==onco_name]
        return temp_df_onco

In [351]:
class Oncomatch_model():
    """Score abstracts (indexed by pmid) against free text with a per-cancer-type
    LDA model and map the ranked pmids to oncologists.

    This variant reads its lookup tables from pickle files.

    NOTE(review): this notebook defines ``Oncomatch_model`` twice; whichever
    cell is executed last silently replaces the other.
    """

    def __init__(self,cancer_type,base_dir='/Volumes/Yuchen_Drive/Insight/OncoMatch'):
        """Load the models and lookup tables for ``cancer_type``.

        Parameters
        ----------
        cancer_type : str
            Cancer type label, e.g. ``'Breast Cancer'``. Its lower-cased,
            underscore-joined form is used as the model file prefix.
        base_dir : str, optional
            Root directory holding ``models_lda/`` and ``data/``. Defaults to
            the location the notebook was written against.
        """
        self.base_dir=base_dir
        self.cancer_type=cancer_type
        self.prefix="_".join(i.lower() for i in cancer_type.split())

        models_dir=os.path.join(self.base_dir,'models_lda')
        data_dir=os.path.join(self.base_dir,'data')

        self.bow_dict=gensim.corpora.Dictionary.load(os.path.join(models_dir,'{}_abstract_bow_dict.dict'.format(self.prefix)))
        self.tfidf=gensim.models.TfidfModel.load(os.path.join(models_dir,'{}_abstract_tfidf.tfidf'.format(self.prefix)))
        self.lda_model=gensim.models.LdaMulticore.load(os.path.join(models_dir,'{}_abstract_lda.lda'.format(self.prefix)))
        # Per-abstract LDA vectors; the 'pred' column is not part of the vector.
        self.lda_df=pd.read_csv(os.path.join(models_dir,"{}_abstract_ldavector_df.csv".format(self.prefix)), index_col='pmid').drop(['pred'],axis=1)
        self.stop_words=self._load_pickle(os.path.join(data_dir,"stop_words_lda.set"))

        self.onco_df=self._load_pickle(os.path.join(data_dir,"Oncologist_info_clinicaltrial_1470.pkl"))
        # Index by name so oncologists can be selected with .loc[name].
        self.onco_df.index=self.onco_df.name
        self.cancerclinical2doctor_dict=self._load_pickle(os.path.join(data_dir,"cancerclinical2doctor_dict.pkl"))
        self.doctor2clinical_dict2=self._load_pickle(os.path.join(data_dir,"doctor2clinical_dict2.pkl"))
        self.pmid2doctor_dict=self._load_pickle(os.path.join(data_dir,"pmid2doctor_dict.pkl"))

    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path`` and close the file handle afterwards.

        NOTE: unpickling can execute arbitrary code; only load trusted files.
        """
        with open(path,"rb") as handle:
            return pickle.load(handle)

    @staticmethod
    def _pad_topics(vector, width):
        """Right-pad a 2-D array with zero columns so it has at least ``width`` columns."""
        missing=width-vector.shape[1]
        if missing>0:
            vector=np.pad(vector,((0,0),(0,missing)),mode='constant')
        return vector

    def preprocess(self, text):
        """Tokenise ``text``, lemmatise each token and drop one-character
        tokens and stop words. Returns a list of strings."""
        lemmatizer=WordNetLemmatizer()  # built once per call, not once per token
        result = []
        for token in gensim.utils.simple_preprocess(text):
            temp=lemmatizer.lemmatize(token).lower()
            if len(temp)>1 and temp not in self.stop_words:
                result.append(temp)
        return result

    def get_embedding_vector(self, text):
        """Return the LDA vector of a token list as a dense array with one row.

        ``text`` is the token list produced by :meth:`preprocess`.
        """
        text_corpus=[self.bow_dict.doc2bow(text)]
        text_tfidf=self.tfidf[text_corpus]

        text_lda=self.lda_model[text_tfidf]
        text_lda = gensim.matutils.corpus2csc(text_lda)
        text_lda = text_lda.T.toarray()

        # corpus2csc sizes its output from the ids it sees, so the vector can be
        # narrower than the stored abstract vectors when trailing topics are
        # absent; pad with zeros so the similarity computation lines up.
        return self._pad_topics(text_lda, self.lda_df.shape[1])

    def compute_similarity(self, embedding_matrix, text_vector):
        """Cosine similarity of every row of ``embedding_matrix`` with ``text_vector``."""
        return cosine_similarity(X=embedding_matrix,Y=text_vector,dense_output=False)

    def get_similarity_scores(self, text):
        """Return a DataFrame with columns ``pmid`` and ``lda_similarity`` for
        every abstract in ``self.lda_df``, sorted by descending similarity."""
        text_preprocess=self.preprocess(text)
        text_lda=self.get_embedding_vector(text_preprocess)
        text_lda_sim=self.compute_similarity(self.lda_df,text_lda)

        similarity_df=pd.DataFrame({'pmid':self.lda_df.index, 'lda_similarity':np.asarray(text_lda_sim).ravel()})
        return similarity_df.sort_values(by=['lda_similarity'],ascending=False).reset_index(drop=True)

    def get_onco_info(self, pmid_list,clinical_trial):
        """Collect the oncologists linked to a ranked list of pmids.

        Parameters
        ----------
        pmid_list : list
            pmids in ranked order; the position in the list is used as the rank.
        clinical_trial : bool
            When true and ``self.cancer_type`` is a key of
            ``self.cancerclinical2doctor_dict``, keep only the doctors listed
            there, sort by ``pmid_rank_10th_percentile`` (descending) and add a
            ``clinical_trial_num2`` column.

        Returns
        -------
        DataFrame
            Rows of ``self.onco_df`` with the added columns ``query_pmid``,
            ``query_pmid_num``, ``pmid_rank`` and ``pmid_rank_10th_percentile``.
        """
        onco2pmid=defaultdict(list)
        onco2rank=defaultdict(list)
        for i,pmid in enumerate(pmid_list):
            for doctor in self.pmid2doctor_dict[pmid]:
                onco2pmid[doctor].append(pmid)
                onco2rank[doctor].append(i)

        # Copy so the column assignments below never touch (or warn about) self.onco_df.
        temp_df = self.onco_df.loc[np.array(list(onco2pmid.keys()))].copy()
        temp_df['query_pmid'] = temp_df['name'].apply(lambda x:onco2pmid[x])
        temp_df['query_pmid_num'] = temp_df['name'].apply(lambda x:len(onco2pmid[x]))
        temp_df['pmid_rank'] = temp_df['name'].apply(lambda x:onco2rank[x])
        # Cut-off is one tenth of the number of matched oncologists (rows), not
        # of len(pmid_list). NOTE(review): confirm this is the intended base.
        rank_cutoff=int(np.ceil(temp_df.shape[0]/10))
        temp_df['pmid_rank_10th_percentile'] = temp_df['pmid_rank'].apply(lambda x: sum(np.array(x)<=rank_cutoff))

        if clinical_trial and self.cancer_type in self.cancerclinical2doctor_dict:
            doctor_clin = np.intersect1d(np.array(self.cancerclinical2doctor_dict[self.cancer_type]), temp_df['name'].values)
            temp_df = temp_df.loc[np.array(doctor_clin)].sort_values(by=['pmid_rank_10th_percentile'],ascending=False)
            temp_df['clinical_trial_num2']=temp_df['name'].apply(lambda x: self.doctor2clinical_dict2[x][self.cancer_type])

        return temp_df

    def get_clinical_data(self, onco_name):
        """Return the rows of this cancer type's clinical-trial CSV whose
        ``name`` column equals ``onco_name``."""
        cancer_prefix="_".join(i.lower() for i in self.cancer_type.split())
        temp_df = pd.read_csv(os.path.join(self.base_dir, 'clinical_trial/{}_clinical_trial_info.csv'.format(cancer_prefix)))
        # Boolean mask instead of a formatted query string, so names that
        # contain quote characters are matched literally.
        temp_df_onco=temp_df[temp_df['name']==onco_name]
        return temp_df_onco

In [6]:
class Get_info():
    """Dictionary-backed lookups between pmids, doctors and cancer types."""

    def __init__(self, base_dir='/Volumes/Yuchen_Drive/Insight/OncoMatch'):
        """Load the pickled lookup dictionaries from ``<base_dir>/data``.

        Parameters
        ----------
        base_dir : str, optional
            Root directory of the data tree. Defaults to the location the
            notebook was written against.
        """
        data_dir=os.path.join(base_dir, "data")
        self.cancer2pmid_dict=self._load_pickle(os.path.join(data_dir, "cancer2pmid_dict.pkl"))
        self.cancer2doctor_dict=self._load_pickle(os.path.join(data_dir, "cancer2doctor_dict.pkl"))
        self.pmid2doctor_dict=self._load_pickle(os.path.join(data_dir, "pmid2doctor_dict.pkl"))
        self.doctor2cancer_dict=self._load_pickle(os.path.join(data_dir, "doctor2cancer_dict.pkl"))

        # This attribute used to be loaded from pmid2doctor_dict.pkl, so
        # pmid_to_cancer() returned doctors. Use a dedicated pickle when one is
        # present (file name follows the sibling files — TODO confirm it
        # exists); otherwise derive the mapping by inverting cancer2pmid_dict.
        pmid2cancer_path=os.path.join(data_dir, "pmid2cancer_dict.pkl")
        if os.path.exists(pmid2cancer_path):
            self.pmid2cancer_dict=self._load_pickle(pmid2cancer_path)
        else:
            self.pmid2cancer_dict=self._invert(self.cancer2pmid_dict)

    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path`` and close the file handle afterwards.

        NOTE: unpickling can execute arbitrary code; only load trusted files.
        """
        with open(path,"rb") as handle:
            return pickle.load(handle)

    @staticmethod
    def _invert(key2values):
        """Turn ``{key: iterable of values}`` into ``{value: [keys...]}``.

        Assumes every value of ``key2values`` is an iterable of hashable items.
        """
        inverted={}
        for key,values in key2values.items():
            for value in values:
                inverted.setdefault(value, []).append(key)
        return inverted

    def gene_to_pmid(self,gene,data):
        """Return the unique ``pmid`` values of ``data`` whose ``HGNC`` column equals ``gene``."""
        # Boolean mask instead of a formatted query string, so the gene symbol
        # is always compared literally.
        pmid_array = data.loc[data['HGNC']==gene, 'pmid'].unique()
        return pmid_array

    def cancer_to_pmid(self,cancer):
        """Look up ``cancer`` in ``cancer2pmid_dict`` (KeyError if absent)."""
        return self.cancer2pmid_dict[cancer]

    def cancer_to_doctor(self,cancer):
        """Look up ``cancer`` in ``cancer2doctor_dict`` (KeyError if absent)."""
        return self.cancer2doctor_dict[cancer]

    def pmid_to_doctor(self,pmid):
        """Look up ``pmid`` in ``pmid2doctor_dict`` (KeyError if absent)."""
        return self.pmid2doctor_dict[pmid]

    def pmid_to_cancer(self,pmid):
        """Look up ``pmid`` in ``pmid2cancer_dict`` (KeyError if absent)."""
        return self.pmid2cancer_dict[pmid]

    def doctor_to_cancer(self,doctor):
        """Look up ``doctor`` in ``doctor2cancer_dict`` (KeyError if absent)."""
        return self.doctor2cancer_dict[doctor]

In [206]:
def get_onco_info(cancer_type, pmid_list,clinical_trial):
    """Collect the oncologists linked to a ranked list of pmids.

    Reads the notebook-level ``pmid2doctor_dict``, ``onco_df``,
    ``cancerclinical2doctor_dict_df`` and ``doctor2clinical_dict2``.

    Parameters
    ----------
    cancer_type : str
        Cancer type used for the optional clinical-trial filter.
    pmid_list : list
        pmids in ranked order; the position in the list is used as the rank.
    clinical_trial : bool
        When true and ``cancer_type`` is in the index of
        ``cancerclinical2doctor_dict_df``, keep only the doctors listed there,
        sort by ``pmid_rank_10th_percentile`` (descending) and add a
        ``clinical_trial_<cancer_type>_num`` column.

    Returns
    -------
    DataFrame
        Rows of ``onco_df`` with the added columns ``query_pmid``,
        ``query_pmid_num``, ``pmid_rank`` and ``pmid_rank_10th_percentile``.
    """
    onco2pmid=defaultdict(list)
    onco2rank=defaultdict(list)
    for i,pmid in enumerate(pmid_list):
        for doctor in pmid2doctor_dict[pmid]:
            onco2pmid[doctor].append(pmid)
            onco2rank[doctor].append(i)

    # Copy so the column assignments below never touch (or warn about) onco_df.
    temp_df = onco_df.loc[np.array(list(onco2pmid.keys()))].copy()
    temp_df['query_pmid'] = temp_df['name'].apply(lambda x:onco2pmid[x])
    temp_df['query_pmid_num'] = temp_df['name'].apply(lambda x:len(onco2pmid[x]))
    temp_df['pmid_rank'] = temp_df['name'].apply(lambda x:onco2rank[x])
    # Cut-off is one tenth of the number of matched oncologists (rows), not of
    # len(pmid_list). NOTE(review): confirm this is the intended base.
    rank_cutoff=int(np.ceil(temp_df.shape[0]/10))
    temp_df['pmid_rank_10th_percentile'] = temp_df['pmid_rank'].apply(lambda x: sum(np.array(x)<=rank_cutoff))

    # The notebook only ever loads the DataFrame form of this lookup
    # (cancerclinical2doctor_dict_df); the dict name used previously is not
    # defined anywhere in the notebook.
    if clinical_trial and cancer_type in cancerclinical2doctor_dict_df.index:
        doctor_clin = np.intersect1d(np.array(cancerclinical2doctor_dict_df.loc[cancer_type,'doctor']), temp_df['name'].values)
        temp_df = temp_df.loc[np.array(doctor_clin)].sort_values(by=['pmid_rank_10th_percentile'],ascending=False)
        temp_df['clinical_trial_{}_num'.format(cancer_type)]=temp_df['name'].apply(lambda x: doctor2clinical_dict2[x][cancer_type])

    return temp_df

In [15]:
cancer_type='Breast Cancer'
original_text = 'The patient is looking for therapies for breast cancer'
onco=Oncomatch_model(cancer_type)
similarity_df=onco.get_similarity_scores(original_text)   ## (8563, 2)

In [16]:
## Gene filtering
# 'NA' means "no gene filter": keep every pmid in similarity order.
gene_name = 'NA'
if gene_name != 'NA':
    # Filter gene_df directly; building Get_info() here would load several
    # pickles that this lookup never uses.
    pmid_gene=gene_df.loc[gene_df['HGNC']==gene_name, 'pmid'].unique()
    # Vectorised membership test; the similarity ordering is preserved.
    pmid_list=similarity_df.pmid[similarity_df.pmid.isin(pmid_gene)].tolist()
else:
    pmid_list=list(similarity_df.pmid)

In [17]:
## clinical trial filtering and get oncologist info
clinical_trial = False
test_df = onco.get_onco_info(pmid_list,clinical_trial)   ## (95, 16)

In [22]:
test_df.sort_values(by=['query_pmid_num'],ascending=False)

Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Gabriel N. Hortobagyi,Gabriel N. Hortobagyi,"MD, FACP, FASCO",(713) 792-4124,University of Texas MD Anderson Cancer Center,Unit 1354PO Box 301439,"Houston, TX 77230-1439, US","['Breast Cancer', 'Cancer Education', 'Drug De...","['Internal Medicine', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,569,1,"[11839684, 21607830, 21632501, 30139837, 20697...",506,"[37, 53, 102, 139, 144, 169, 174, 194, 219, 26...",2
Kelly Hunt,Kelly Hunt,MD,Search for Phone Number,University of Texas MD Anderson Cancer Center,Dept of Surgical Oncology - Unit 444P. O. Box ...,"Houston, TX 77230-1402, US","['Breast Cancer', 'Sarcoma', 'Clinical Trials/...",['Surgical Oncology'],The University of Texas MD Anderson Cancer Center,469,3,"[21695458, 19305161, 17028770, 15378473, 11884...",347,"[60, 62, 74, 160, 170, 258, 290, 311, 331, 335...",3
Fergus Couch,Fergus Couch,PhD,Search for Phone Number,Mayo Clinic,200 1st St SW,"Rochester, MN 55905-0001, US",[],[],Mayo Clinic Cancer Center,344,1,"[12464649, 23809231, 29884841, 21118973, 29492...",289,"[40, 105, 125, 157, 220, 235, 286, 325, 340, 3...",1
Monica Morrow,Monica Morrow,"MD, FASCO",(646) 888-5384,Memorial Sloan Kettering Cancer Center,300 E 66th St,"New York, NY 10065, US",['Breast Cancer'],['Surgical Oncology'],Memorial Sloan-Kettering Cancer Center,346,2,"[24777858, 27851913, 28130619, 26265365, 17510...",267,"[13, 36, 57, 112, 115, 122, 123, 141, 142, 162...",3
Lajos Pusztai,Lajos Pusztai,"MD, PhD",(203) 200-2328,Yale University,Breast CenterSmilow Cancer Hospital,"New Naven, CT , US","['Breast Cancer', 'Clinical Trials/Biostatisti...","['Internal Medicine', 'Medical Oncology']",Yale Cancer Center,274,11,"[16722773, 16293864, 25902916, 16948128, 22392...",240,"[50, 169, 243, 335, 502, 624, 629, 639, 829, 9...",1
Funda Meric-Bernstam,Funda Meric-Bernstam,MD,(713) 794-1226,University of Texas MD Anderson Cancer Center,1400 Holcombe BlvdUnit 455,"Houston, TX 77230, US","['Breast Cancer', 'Laboratory Research', 'Tumo...",['Surgical Oncology'],The University of Texas MD Anderson Cancer Center,337,9,"[16027034, 17028770, 15361028, 17911860, 21632...",230,"[72, 74, 91, 95, 102, 139, 160, 301, 311, 381,...",2
Eric P. Winer,Eric P. Winer,"MD, FASCO",(617) 632-3800,Dana-Farber Cancer Institute,450 Brookline Ave,"Boston, MA 02215, US","['Breast Cancer', 'Clinical Trials/Biostatisti...",['Medical Oncology'],Dana-Farber/Harvard Cancer Center,283,4,"[29109393, 12235222, 22833148, 19434504, 16457...",229,"[19, 48, 119, 153, 231, 236, 237, 282, 296, 29...",2
Edith A. Perez,Edith A. Perez,"MD, FASCO",(904) 953-2272,Mayo Clinic Florida,4500 San Pablo Rd S,"Jacksonville, FL 32224-1865, US","['Breast Cancer', 'Cancer Education', 'Cancer ...","['Hematology', 'Internal Medicine', 'Medical O...",Mayo Clinic Cancer Center,263,26,"[19901117, 19418493, 25605862, 21554044, 14702...",226,"[9, 106, 204, 244, 250, 339, 348, 369, 417, 46...",1
Vicente Valero,Vicente Valero,MD,(713) 792-4124,University of Texas MD Anderson Cancer Center,PO Box 301439,"Houston, TX 77230-1439, US","['Breast Cancer', 'Clinical Trials/Biostatisti...","['Hematology', 'Internal Medicine', 'Medical O...",The University of Texas MD Anderson Cancer Center,239,5,"[29928946, 26017070, 30139837, 16293864, 28860...",212,"[63, 69, 139, 169, 171, 194, 254, 258, 275, 34...",2
Banu Arun,Banu Arun,"MD, FASCO",(713) 792-7090,University of Texas MD Anderson Cancer Center,1515 Holcombe blvd,"Houston, TX 77030, US",['Breast Cancer'],"['Hematology', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,181,7,"[29044548, 18373644, 29733510, 16946209, 26017...",170,"[2, 12, 56, 66, 69, 194, 221, 281, 289, 301, 3...",5


In [None]:
onco=Oncomatch_model(cancer_type)
similarity_df=onco.get_similarity_scores(original_text)   ## (8563, 2)

In [8]:
# Load the doctor/cancer lookups with context managers so the file handles are
# closed. NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(os.path.join(base_dir, "data/doctor2cancer_dict.pkl"),"rb") as handle:
    doctor2cancer_dict=pickle.load(handle)
with open(os.path.join(base_dir, "data/cancer2doctor_dict.pkl"),"rb") as handle:
    cancer2doctor_dict=pickle.load(handle)
mskcc_text_df=pd.read_csv(os.path.join(base_dir, 'data/mskcc_text_dict_df.csv'),index_col='name')
mskcc_text_df.index

Index(['Kenneth H. Yu', 'Robert Sidlow', 'Lisa Marie Ruppert', 'Yukio Sonoda',
       'Marsha Reyngold', 'Andrea Veronica Barrio', 'Seth M. Cohen',
       'Sam S. Yoon', 'Atif J. Khan', 'Alexandra S. Heerdt',
       ...
       'm-lia-palomba', 'mario-leitao-jr', 'julia-glade-bender',
       'diane-reidy-lagunes', 'j-smith', 'sre-bhavani-chalasani',
       'david-paul-kelsen', 'linda-vahdat', 'mary-sue-brady', 'iris-zhi'],
      dtype='object', name='name', length=170)

### Jedd D. Wolchok

In [9]:
onco_name = 'Jedd D. Wolchok'
doctor2cancer_dict[onco_name]

['Breast Cancer',
 'Lung Cancer',
 'Leukemia',
 'Prostate Cancer',
 'Colorectal Cancer',
 'Lymphoma',
 'Bone Cancer',
 'Melanoma',
 'Liver Cancer',
 'Kidney Cancer',
 'Ovarian Cancer',
 'Pancreatic Cancer',
 'Brain Tumor',
 'Head and Neck Cancer',
 'Acute Myeloid Leukemia (AML)',
 'Sarcoma',
 'Bladder Cancer',
 'Skin Cancer',
 'Gastric Cancer',
 'Gastrointestinal Tumor',
 'Endometrial Cancer',
 'Cardiac Tumors',
 'Thyroid Cancer',
 'Neuroblastoma',
 'Testicular Cancer',
 'Osteosarcoma',
 'Anal Cancer',
 'Gallbladder Cancer',
 'Wilms Tumor',
 'Pituitary Tumor']

In [10]:
## Get rank of similarity scores
#original_text = 'The patient is looking for therapies for breast cancer'
#original_text = 'This tumor is very malignant and aggressive. It tends to metastasize to multiple organs. The patient may not respond to targeted therapy but may become sensitive to immunotherapy after themotherapy.'
cancer_type='Melanoma'
original_text=mskcc_text_df.loc[onco_name,'text']
onco=Oncomatch_model(cancer_type)
similarity_df=onco.get_similarity_scores(original_text)   ## (8563, 2)

In [11]:
original_text

'I am a medical oncologist who specializes in caring for people with melanoma. As a researcher, I am working to develop innovative ways to use the immune system to treat cancer. I have been at the forefront of cancer immunotherapy as a clinician-scientist and as a principal investigator of several pivotal clinical trials.Read moreOne of these was a large phase III trial that led to the FDA approval of ipilimumab (Yervoy®), an antibody now used as a first-line treatment for people with advanced melanoma. Ipilimumab was the first drug ever shown to improve survival in these patients. It has fundamentally changed this disease’s outcome — people who were once given a matter of months are living years longer in some cases. A number of patients I treated with ipilimumab in 2004 are still alive and free of cancer today. As a physician, nothing is more gratifying than being able to give someone his or her life back after cancer.Despite the successes of ipilimumab, only about 20 percent of mela

In [54]:
## Gene filtering
# 'NA' means "no gene filter": keep every pmid in similarity order.
gene_name = 'NA'
if gene_name != 'NA':
    # Filter gene_df directly; building Get_info() here would load several
    # pickles that this lookup never uses.
    pmid_gene=gene_df.loc[gene_df['HGNC']==gene_name, 'pmid'].unique()
    # Vectorised membership test; the similarity ordering is preserved.
    pmid_list=similarity_df.pmid[similarity_df.pmid.isin(pmid_gene)].tolist()
else:
    pmid_list=list(similarity_df.pmid)

In [55]:
## clinical trial filtering and get oncologist info
clinical_trial = True
test_df = onco.get_onco_info(pmid_list,clinical_trial)   ## (95, 16)

In [56]:
test_df.head()

Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile,clinical_trial_num2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Jedd D. Wolchok,Jedd D. Wolchok,"MD, PhD, FASCO",(646) 888-2395,Memorial Sloan Kettering Cancer Center,1275 York AveBox 340,"New York, NY 10021, US","['Melanoma', 'Clinical Trials/Biostatistics/Ep...","['Hematology', 'Medical Oncology']",Memorial Sloan-Kettering Cancer Center,279,7,"[19558193, 26317466, 15800947, 28841387, 19103...",202,"[7, 8, 17, 26, 45, 53, 77, 78, 82, 84, 115, 12...",5,6
Patrick Hwu,Patrick Hwu,MD,Search for Phone Number,University of Texas MD Anderson Cancer Center,1515 Holcombe BlvdUnit 430,"Houston, TX 77030, US","['Melanoma', 'Immunology/Immunobiology/Vaccines']",['Medical Oncology'],The University of Texas MD Anderson Cancer Center,182,5,"[21166498, 24292706, 28754817, 21111964, 27678...",142,"[24, 35, 40, 47, 65, 67, 71, 105, 119, 139, 16...",4,5
Keith Flaherty,Keith Flaherty,MD,(617) 726-1941,Massachusetts General Hospital,55 Fruit StYawkey 9E,"Boston, MA 02114, US","['Melanoma', 'Clinical Trials/Biostatistics/Ep...","['Hematology', 'Medical Oncology']",Massachusetts General Hospital,299,7,"[31358999, 19659612, 26037941, 28728868, 25265...",213,"[15, 20, 48, 60, 72, 87, 101, 113, 126, 128, 1...",3,5
Craig L. Slingluff,Craig L. Slingluff,MD,(434) 924-1730,University of Virginia,PO Box 800709,"Charlottesville, VA 22908-0709, US","['Melanoma', 'Biologic Therapy', 'Clinical Res...",['General Surgery'],University of Virginia Cancer Center,111,25,"[22435430, 23406162, 14734458, 24047116, 20221...",93,"[21, 25, 44, 135, 374, 401, 432, 545, 554, 560...",3,21
Kevin Kim,Kevin Kim,MD,(203) 785-4747,Yale Cancer Center,PO Box 208042333 Cedar Street,"New Haven, CT 06520-8042, US","['Gastrointestinal Cancer', 'Liver Cancer', 'L...",['Interventional Radiology/Diagnostic Radiology'],Yale Cancer Center,115,18,"[24720932, 23800008, 25962795, 22668797, 25148...",92,"[3, 6, 49, 67, 71, 109, 133, 136, 139, 143, 14...",3,13


In [33]:
test_df.loc[onco_name]

name                                                           Jedd D. Wolchok
degree                                                          MD, PhD, FASCO
phone                                                           (646) 888-2395
center_name2                            Memorial Sloan Kettering Cancer Center
address                                                   1275 York AveBox 340
city_state                                              New York, NY 10021, US
speciality                   ['Melanoma', 'Clinical Trials/Biostatistics/Ep...
certificate                                 ['Hematology', 'Medical Oncology']
center_name                             Memorial Sloan-Kettering Cancer Center
article_num                                                                279
clinical_trial_num                                                           7
query_pmid                   [19558193, 26317466, 15800947, 28841387, 19103...
query_pmid_num                                      

### Charles L. Sawyers

In [43]:
onco_name = 'Charles L. Sawyers'
doctor2cancer_dict[onco_name]

['Breast Cancer',
 'Lung Cancer',
 'Leukemia',
 'Prostate Cancer',
 'Colorectal Cancer',
 'Lymphoma',
 'Bone Cancer',
 'Melanoma',
 'Liver Cancer',
 'Kidney Cancer',
 'Pancreatic Cancer',
 'Brain Tumor',
 'Head and Neck Cancer',
 'Acute Myeloid Leukemia (AML)',
 'Sarcoma',
 'Bladder Cancer',
 'Skin Cancer',
 'Gastrointestinal Tumor',
 'Endometrial Cancer',
 'Acute Lymphoblastic Leukemia (ALL)',
 'Cardiac Tumors',
 'Thyroid Cancer',
 'Myeloproliferative Neoplasms',
 'Chronic Myelogenous Leukemia (CML)',
 'Ewing Sarcoma']

In [45]:
## Get rank of similarity scores
# Alternative free-text queries, kept for reference:
#original_text = 'The patient is looking for therapies for breast cancer'
#original_text = 'This tumor is very malignant and aggressive. It tends to metastasize to multiple organs. The patient may not respond to targeted therapy but may become sensitive to immunotherapy after chemotherapy.'
cancer_type='Prostate Cancer'
# Use the 'text' entry stored for the selected oncologist as the query text.
original_text=mskcc_text_df.loc[onco_name,'text']
# Instantiate the model for this cancer type and score the query against it.
onco=Oncomatch_model(cancer_type)
similarity_df=onco.get_similarity_scores(original_text)   ## (8563, 2) -- NOTE(review): shape recorded from an earlier run; confirm for this cancer type

In [46]:
## Gene filtering
# 'NA' disables the gene filter; any other value keeps only the PMIDs that are
# found in the result of Get_info().gene_to_pmid for that gene.
gene_name = 'NA'
if gene_name == 'NA':
    # No gene selected: keep every PMID from the similarity table, in order.
    pmid_list = list(similarity_df.pmid)
else:
    pmid_gene = Get_info().gene_to_pmid(gene_name, gene_df)
    pmid_list = [pmid for pmid in similarity_df.pmid if pmid in pmid_gene]

In [50]:
## clinical trial filtering and get oncologist info
# Passed through unchanged as the second argument of get_onco_info.
clinical_trial = False
test_df = onco.get_onco_info(pmid_list,clinical_trial)   ## (95, 16) -- NOTE(review): shape recorded from a previous run; not re-verified for this query

In [25]:
# Sort by `query_pmid_num`, largest first, and show only the top 10 rows
# instead of rendering the whole frame.
test_df.sort_values(by=['query_pmid_num'],ascending=False).head(10)

Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Gabriel N. Hortobagyi,Gabriel N. Hortobagyi,"MD, FACP, FASCO",(713) 792-4124,University of Texas MD Anderson Cancer Center,Unit 1354PO Box 301439,"Houston, TX 77230-1439, US","['Breast Cancer', 'Cancer Education', 'Drug De...","['Internal Medicine', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,569,1,"[11839684, 21607830, 21632501, 30139837, 20697...",506,"[37, 53, 102, 139, 144, 169, 174, 194, 219, 26...",2
Kelly Hunt,Kelly Hunt,MD,Search for Phone Number,University of Texas MD Anderson Cancer Center,Dept of Surgical Oncology - Unit 444P. O. Box ...,"Houston, TX 77230-1402, US","['Breast Cancer', 'Sarcoma', 'Clinical Trials/...",['Surgical Oncology'],The University of Texas MD Anderson Cancer Center,469,3,"[21695458, 19305161, 17028770, 15378473, 11884...",347,"[60, 62, 74, 160, 170, 258, 290, 311, 331, 335...",3
Fergus Couch,Fergus Couch,PhD,Search for Phone Number,Mayo Clinic,200 1st St SW,"Rochester, MN 55905-0001, US",[],[],Mayo Clinic Cancer Center,344,1,"[12464649, 23809231, 29884841, 21118973, 29492...",289,"[40, 105, 125, 157, 220, 235, 286, 325, 340, 3...",1
Monica Morrow,Monica Morrow,"MD, FASCO",(646) 888-5384,Memorial Sloan Kettering Cancer Center,300 E 66th St,"New York, NY 10065, US",['Breast Cancer'],['Surgical Oncology'],Memorial Sloan-Kettering Cancer Center,346,2,"[24777858, 27851913, 28130619, 26265365, 17510...",267,"[13, 36, 57, 112, 115, 122, 123, 141, 142, 162...",3
Lajos Pusztai,Lajos Pusztai,"MD, PhD",(203) 200-2328,Yale University,Breast CenterSmilow Cancer Hospital,"New Naven, CT , US","['Breast Cancer', 'Clinical Trials/Biostatisti...","['Internal Medicine', 'Medical Oncology']",Yale Cancer Center,274,11,"[16722773, 16293864, 25902916, 16948128, 22392...",240,"[50, 169, 243, 335, 502, 624, 629, 639, 829, 9...",1
Funda Meric-Bernstam,Funda Meric-Bernstam,MD,(713) 794-1226,University of Texas MD Anderson Cancer Center,1400 Holcombe BlvdUnit 455,"Houston, TX 77230, US","['Breast Cancer', 'Laboratory Research', 'Tumo...",['Surgical Oncology'],The University of Texas MD Anderson Cancer Center,337,9,"[16027034, 17028770, 15361028, 17911860, 21632...",230,"[72, 74, 91, 95, 102, 139, 160, 301, 311, 381,...",2
Eric P. Winer,Eric P. Winer,"MD, FASCO",(617) 632-3800,Dana-Farber Cancer Institute,450 Brookline Ave,"Boston, MA 02215, US","['Breast Cancer', 'Clinical Trials/Biostatisti...",['Medical Oncology'],Dana-Farber/Harvard Cancer Center,283,4,"[29109393, 12235222, 22833148, 19434504, 16457...",229,"[19, 48, 119, 153, 231, 236, 237, 282, 296, 29...",2
Edith A. Perez,Edith A. Perez,"MD, FASCO",(904) 953-2272,Mayo Clinic Florida,4500 San Pablo Rd S,"Jacksonville, FL 32224-1865, US","['Breast Cancer', 'Cancer Education', 'Cancer ...","['Hematology', 'Internal Medicine', 'Medical O...",Mayo Clinic Cancer Center,263,26,"[19901117, 19418493, 25605862, 21554044, 14702...",226,"[9, 106, 204, 244, 250, 339, 348, 369, 417, 46...",1
Vicente Valero,Vicente Valero,MD,(713) 792-4124,University of Texas MD Anderson Cancer Center,PO Box 301439,"Houston, TX 77230-1439, US","['Breast Cancer', 'Clinical Trials/Biostatisti...","['Hematology', 'Internal Medicine', 'Medical O...",The University of Texas MD Anderson Cancer Center,239,5,"[29928946, 26017070, 30139837, 16293864, 28860...",212,"[63, 69, 139, 169, 171, 194, 254, 258, 275, 34...",2
Banu Arun,Banu Arun,"MD, FASCO",(713) 792-7090,University of Texas MD Anderson Cancer Center,1515 Holcombe blvd,"Houston, TX 77030, US",['Breast Cancer'],"['Hematology', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,181,7,"[29044548, 18373644, 29733510, 16946209, 26017...",170,"[2, 12, 56, 66, 69, 194, 221, 281, 289, 301, 3...",5


In [23]:
test_df.head()

Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Kevin S. Hughes,Kevin S. Hughes,"MD, FACS",(617) 724-0048,Massachusetts General Hospital,Div of Surgcl Onc55 Fruit St Yawkey Bldg,"Boston, MA 02114, US","['Breast Cancer', 'Genetics/Gene Therapy']",['Surgical Oncology'],Massachusetts General Hospital,84,2,"[24748568, 29410346, 29785770, 17508275, 27160...",65,"[0, 28, 273, 327, 402, 431, 661, 672, 679, 847...",2
Yu Chen,Yu Chen,"MD, PhD",(646) 422-4465,Memorial Sloan Kettering Cancer Center,1275 York Ave Box 20,"New York, NY 10021, US","['Laboratory Research', 'Tumor Biology']",['Medical Oncology'],Memorial Sloan-Kettering Cancer Center,558,50,"[27447970, 29169152, 25259624, 24276478, 26209...",70,"[1, 167, 524, 626, 630, 722, 927, 1119, 1121, ...",1
Banu Arun,Banu Arun,"MD, FASCO",(713) 792-7090,University of Texas MD Anderson Cancer Center,1515 Holcombe blvd,"Houston, TX 77030, US",['Breast Cancer'],"['Hematology', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,181,7,"[29044548, 18373644, 29733510, 16946209, 26017...",170,"[2, 12, 56, 66, 69, 194, 221, 281, 289, 301, 3...",5
Naoto T. Ueno,Naoto T. Ueno,"MD, PhD",(713) 792-8754,University of Texas MD Anderson Cancer Center,1515 Holcombe BlvdUnit 1354,"Houston, TX 77030-4000, US","['Breast Cancer', 'Clinical Research', 'Develo...","['Internal Medicine', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,201,13,"[29044548, 21607830, 30005774, 26017070, 22961...",163,"[2, 53, 68, 69, 211, 322, 331, 378, 410, 448, ...",4
Michael J. Hassett,Michael J. Hassett,"MD, MPH, FASCO",(617) 632-4587,Dana-Farber Cancer Institute,450 Brookline Ave,"Boston, MA 02215, US","['Breast Cancer', 'Health Services Research']","['Hematology', 'Internal Medicine', 'Medical O...",Dana-Farber/Harvard Cancer Center,46,0,"[21681446, 22235542, 22585699, 26868124, 21711...",35,"[3, 234, 391, 411, 414, 477, 557, 616, 668, 74...",1


In [51]:
# Show the full record for the selected oncologist. The frame has no 'pmid'
# column to sort by (its PMID-related columns are query_pmid, query_pmid_num
# and pmid_rank), and the saved output of this cell is this single-row lookup.
test_df.loc[onco_name]

name                                                        Charles L. Sawyers
degree                                                                      MD
phone                                                  Search for Phone Number
center_name2                            Memorial Sloan Kettering Cancer Center
address                                                    1275 York AveBox 20
city_state                                              New York, NY 10065, US
speciality                                                                  []
certificate                  ['Hematology', 'Internal Medicine', 'Medical O...
center_name                             Memorial Sloan-Kettering Cancer Center
article_num                                                                146
clinical_trial_num                                                           0
query_pmid                   [29121144, 22641202, 17560336, 24027196, 19359...
query_pmid_num                                      

In [440]:
# Load data/mskcc_img_dict_df.csv with its 'name' column as the index.
img_df=pd.read_csv(os.path.join(base_dir, "data/mskcc_img_dict_df.csv"),index_col='name')

In [448]:
# Preview the oncologists of one cancer center. Surrounding whitespace is
# stripped from center_name before comparing, so the match no longer depends on
# the stored value carrying a trailing space.
onco_df[onco_df.center_name.str.strip()=='Abramson Cancer Center'].head()

Unnamed: 0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num
0,John H. Glick,"MD, FASCO",(215) 662-6334,University of Pennsylvania-Abramson Cancer Center,"3400 Civic Center Blvd3rd Fl, Ste 3-300S","Philadelphia, PA 19104-5127, US","['Breast Cancer', 'Cancer Prevention']","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,7,0
1,Arthur M. Feldman,MD,(215) 662-9801,University of Pennsylvania-Abramson Cancer Center,Penn Presbyterian Medcl Ctr51 N 39th St MAB St...,"Philadelphia, PA 19104, US","['Breast Cancer', 'Geriatrics Oncology']","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,0,1
2,David M. Mintzer,MD,Search for Phone Number,Abramson Cancer Center at Pennsylvania Hospital,230 W Washington Sq Fl 2,"Philadelphia, PA 19106-3500, US","['Breast Cancer', 'Lung Cancer', 'Palliative C...","['Hematology', 'Hospice and Palliative Medicin...",Abramson Cancer Center,13,0
3,David J. Vaughn,MD,(215) 349-8140,University of Pennsylvania-Abramson Cancer Center,PCAM South 10-1143400 Civic Center Blvd,"Philadelphia, PA 19104, US","['Bladder Cancer', 'Prostate Cancer', 'Testicu...",['Medical Oncology'],Abramson Cancer Center,86,1
4,Charles John Schneider,"MD, FACP",Search for Phone Number,"Hospital of the University of Pennsylvania, Ab...",Pereleman Center for Advanced Medicine3400 Civ...,"Philadelphia, PA 19104, US","['Clinical Research', 'Developmental Therapeut...",['Medical Oncology'],Abramson Cancer Center,0,5


In [450]:
onco_df.center_name.nunique()

48

In [475]:
# Record the interpreter version this notebook was run with.
from platform import python_version
print(python_version())

3.6.2


In [490]:
# Load abstract_citation_annot_df_50930_aws.csv and list its columns.
abstract_df_aws=pd.read_csv(os.path.join(base_dir,'data/abstract_citation_annot_df_50930_aws.csv'))
abstract_df_aws.columns

Index(['article_title', 'journal_title', 'journal_abbre', 'article_date',
       'abstract', 'pub_days', 'citation', 'abstract_clean', 'pmid'],
      dtype='object')

In [491]:
# Keep only the metadata columns; the 'abstract' and 'abstract_clean' text
# columns present in the loaded file are left out.
aws_columns = [
    'article_title',
    'journal_title',
    'journal_abbre',
    'article_date',
    'pub_days',
    'citation',
    'pmid',
]
abstract_df_aws = abstract_df_aws[aws_columns]

In [492]:
# NOTE(review): this writes to the same path the frame was read from, so the
# file on disk is replaced by the current (column-reduced) frame. Keep a backup
# of the original if the dropped columns are still needed.
abstract_df_aws.to_csv(os.path.join(base_dir, 'data/abstract_citation_annot_df_50930_aws.csv'), index=False)

In [493]:
abstract_df_aws.shape

(50930, 7)

In [488]:
# Read the same CSV again. NOTE(review): the preview recorded below still shows
# 'abstract', 'abstract_clean' and an 'Unnamed: 0' column, so this cell appears
# to have been run before the file was rewritten -- confirm the run order.
abstract_df_aws=pd.read_csv(os.path.join(base_dir, 'data/abstract_citation_annot_df_50930_aws.csv'))

In [489]:
abstract_df_aws.head()

Unnamed: 0,article_title,journal_title,journal_abbre,article_date,abstract,pub_days,citation,abstract_clean,pmid
0,Pseudohypoproteinemia and multiple myeloma.,Cleveland Clinic journal of medicine,Cleve Clin J Med,1990-05-01,Paraproteinemia is an important diagnostic fea...,10857.0,1.0,Paraproteinemia is an important diagnostic fea...,2357786
1,Comparison of digital rectal examination and s...,The Journal of urology,J. Urol.,1994-05-01,To compare the efficacy of digital rectal exam...,9396.0,1218.0,To compare the efficacy of digital examination...,7512659
2,Accuracy of digital rectal examination and tra...,The Journal of urology,J. Urol.,1994-11-01,Not all prostate cancers are sonographically h...,9212.0,116.0,Not all cancers are sonographically hypoechoic...,7523707
3,Selection of optimal prostate specific antigen...,The Journal of urology,J. Urol.,1994-12-01,A prospective clinical trial of prostate cance...,9182.0,230.0,A prospective clinical trial of cancer screeni...,7525995
4,Effect of patient age on early detection of pr...,Urology,Urology,1993-10-01,This study was designed to determine the effec...,9608.0,132.0,This study was designed to determine the effec...,7692657


In [472]:
# Pick one center name (position 22 among the unique values) and show it.
unique_centers = onco_df.center_name.unique()
center_name = unique_centers[22]
print(center_name)
# Lower-cased, underscore-joined form of the name.
"_".join(word.lower() for word in center_name.split())

Sidney Kimmel Comprehensive Cancer Center


'sidney_kimmel_comprehensive_cancer_center'

In [433]:
similarity_df.shape

(8563, 2)

In [434]:
## Gene filtering
# `gene_name` is not assigned in this cell; it must already exist in the kernel.
if gene_name == 'NA':
    # No gene selected: keep every PMID from the similarity table, in order.
    pmid_list = list(similarity_df.pmid)
else:
    pmid_gene = Get_info().gene_to_pmid(gene_name, gene_df)
    pmid_list = [pmid for pmid in similarity_df.pmid if pmid in pmid_gene]

In [435]:
len(pmid_gene)

865

In [436]:
## clinical trial filtering and get oncologist info
# Relies on `pmid_list` and `clinical_trial` already being set by earlier cells.
test_df = onco.get_onco_info(pmid_list,clinical_trial)   ## (95, 16)

In [437]:
test_df.shape

(95, 16)

In [438]:
test_df.head()

Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile,clinical_trial_num2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Banu Arun,Banu Arun,"MD, FASCO",(713) 792-7090,University of Texas MD Anderson Cancer Center,1515 Holcombe blvd,"Houston, TX 77030, US",['Breast Cancer'],"['Hematology', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,181,7,"[19174581, 16421416, 24522996, 20609467, 16955...",68,"[5, 13, 17, 41, 72, 73, 83, 85, 88, 99, 115, 1...",3,7
Mark E. Robson,Mark E. Robson,"MD, FASCO",(646) 888-5434,Memorial Sloan Kettering Cancer Center,1275 York AveBreast Medicine Service,"New York, NY 10065-6007, US","['Breast Cancer', 'Cancer Prevention', 'Geneti...","['Hematology', 'Medical Oncology']",Memorial Sloan-Kettering Cancer Center,88,2,"[21598239, 20554149, 20221693, 30689033, 30181...",37,"[0, 7, 19, 50, 69, 75, 105, 123, 138, 153, 202...",3,1
Shridar Ganesan,Shridar Ganesan,"MD, PhD",Search for Phone Number,Rutgers Cancer Institute of New Jersey,,"New Brunswick, NJ 08903, US","['Breast Cancer', 'Laboratory Research', 'Mole...","['Internal Medicine', 'Medical Oncology']",Rutgers Cancer Institute of New Jersey,64,2,"[24478461, 17350581, 21278454, 23650262, 22331...",14,"[6, 12, 22, 52, 76, 78, 80, 145, 241, 316, 338...",2,1
Kenneth Offit,Kenneth Offit,"MD, MPH, FASCO",(646) 888-4067,Memorial Sloan Kettering Cancer Center,1275 York AveInternal Box 192,"New York, NY 10065-6007, US",['Tumor Biology'],"['Hematology', 'Medical Oncology']",Memorial Sloan-Kettering Cancer Center,259,14,"[21598239, 20221693, 18326623, 17916242, 23054...",92,"[0, 19, 21, 27, 87, 92, 99, 100, 102, 105, 117...",2,4
Funda Meric-Bernstam,Funda Meric-Bernstam,MD,(713) 794-1226,University of Texas MD Anderson Cancer Center,1400 Holcombe BlvdUnit 455,"Houston, TX 77230, US","['Breast Cancer', 'Laboratory Research', 'Tumo...",['Surgical Oncology'],The University of Texas MD Anderson Cancer Center,337,9,"[19174581, 29533782, 29093017, 24811890, 16955...",17,"[5, 11, 44, 49, 72, 83, 91, 312, 319, 339, 420...",2,6


In [439]:
test_df.loc[onco_name,'center_name']

'Memorial Sloan-Kettering Cancer Center'

In [427]:
test_df.head()

Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile,clinical_trial_num2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Kenneth Offit,Kenneth Offit,"MD, MPH, FASCO",(646) 888-4067,Memorial Sloan Kettering Cancer Center,1275 York AveInternal Box 192,"New York, NY 10065-6007, US",['Tumor Biology'],"['Hematology', 'Medical Oncology']",Memorial Sloan-Kettering Cancer Center,259,14,"[12464649, 14709740, 18268356, 21118973, 15131...",92,"[3, 8, 12, 13, 15, 17, 26, 48, 49, 52, 55, 65,...",6,4
Claudine Isaacs,Claudine Isaacs,MD,Search for Phone Number,"Lombardi Cancer Center, Georgetown University",3800 Reservoir Rd NW,"Washington, DC 20007-2113, US","['Breast Cancer', 'Clinical Research', 'Geneti...","['Internal Medicine', 'Medical Oncology']",Georgetown Lombardi Comprehensive Cancer Center,185,6,"[12464649, 15095307, 18268356, 21118973, 18195...",91,"[3, 10, 12, 13, 16, 24, 25, 26, 35, 39, 43, 68...",5,5
Mark E. Robson,Mark E. Robson,"MD, FASCO",(646) 888-5434,Memorial Sloan Kettering Cancer Center,1275 York AveBreast Medicine Service,"New York, NY 10065-6007, US","['Breast Cancer', 'Cancer Prevention', 'Geneti...","['Hematology', 'Medical Oncology']",Memorial Sloan-Kettering Cancer Center,88,2,"[28765325, 18268356, 15131025, 14680495, 12655...",37,"[7, 12, 15, 17, 49, 52, 92, 94, 136, 157, 161,...",4,1
Banu Arun,Banu Arun,"MD, FASCO",(713) 792-7090,University of Texas MD Anderson Cancer Center,1515 Holcombe blvd,"Houston, TX 77030, US",['Breast Cancer'],"['Hematology', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,181,7,"[29044548, 29733510, 16946209, 18779615, 21656...",68,"[0, 5, 6, 18, 27, 28, 29, 37, 53, 56, 66, 74, ...",4,7
Larry Norton,Larry Norton,"MD, FASCO",(646) 888-5438,Memorial Sloan Kettering Cancer Center,1275 York Ave,"New York, NY 10065, US","['Breast Cancer', 'Clinical Research']",['Medical Oncology'],Memorial Sloan-Kettering Cancer Center,192,25,"[28765325, 18268356, 14680495, 26011570, 12023...",18,"[7, 12, 17, 79, 92, 123, 126, 180, 247, 257, 3...",3,7


In [428]:
test_df.name.iloc[0]

'Kenneth Offit'

In [415]:
# Reload the saved results table indexed by name. The two list-valued columns
# are stored as text in the CSV, so they are parsed with literal_eval on read.
list_columns = {"query_pmid": literal_eval, "pmid_rank": literal_eval}
test_df = pd.read_csv(
    os.path.join(base_dir, 'temp/temp.csv'),
    index_col='name',
    converters=list_columns,
)
# Keep the first N query PMIDs for this oncologist, where N is that row's
# pmid_rank_10th_percentile value. `onco_name` must already be set.
ranked_pmids = test_df.loc[onco_name, 'query_pmid']
top_n = test_df.loc[onco_name, 'pmid_rank_10th_percentile']
pmid_list = ranked_pmids[:top_n]

In [416]:
pmid_list

[12464649, 14709740, 18268356, 21118973, 15131025, 14680495]

In [410]:
onco_name = 'Kenneth Offit'
# Keep the first N query PMIDs for this oncologist, where N is that row's
# pmid_rank_10th_percentile value.
ranked_pmids = test_df.loc[onco_name, 'query_pmid']
top_n = test_df.loc[onco_name, 'pmid_rank_10th_percentile']
pmid_list = ranked_pmids[:top_n]

In [412]:
# Look up the catalog rows (the catalog is indexed by pmid) for the selected PMIDs.
pmid_df=abstract_catalog.loc[np.array(pmid_list)]
pmid_df.head()

Unnamed: 0_level_0,article_title,journal_title,journal_abbre,article_date,abstract,article_type,pub_days,citation,abstract_clean,Breast Cancer,...,Endometrial Cancer,Acute Lymphoblastic Leukemia (ALL),Chronic Lymphocytic Leukemia (CLL),Cardiac Tumors,Cervical Cancer,Esophageal Cancer,Thyroid Cancer,Rectal Cancer,text_clean_seq,text_length
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12464649,Oral contraceptives and the risk of breast can...,Journal of the National Cancer Institute,J. Natl. Cancer Inst.,2002-12-01,Oral contraceptive use has been associated wit...,Journal Article,6260.0,289.0,Oral contraceptive use has been associated wit...,1,...,0,0,0,0,0,0,0,0,"[518, 6780, 119, 71, 85, 41, 5, 35, 344, 4, 3,...",1836.0
14709740,Frequency of BRCA1 and BRCA2 mutations in unse...,Journal of the National Cancer Institute,J. Natl. Cancer Inst.,2004-01-01,Mutations in BRCA1 and BRCA2 that predispose t...,Journal Article,5864.0,51.0,Mutations in BRCA1 and BRCA2 that predispose t...,1,...,0,0,0,0,0,0,0,0,"[138, 4, 766, 2, 1167, 17, 6043, 6, 2, 12, 32,...",1001.0
18268356,Risk-reducing salpingo-oophorectomy for the pr...,Journal of clinical oncology : official journa...,J. Clin. Oncol.,2008-02-11,Risk-reducing salpingo-oophorectomy (RRSO) has...,Journal Article,4362.0,401.0,Risk-reducing salpingo-oophorectomy RRSO has b...,1,...,0,0,0,0,0,0,0,0,"[43, 1818, 5690, 3470, 7376, 71, 85, 1792, 445...",1667.0
21118973,Common breast cancer susceptibility alleles an...,Cancer research,Cancer Res.,2010-11-30,The known breast cancer susceptibility polymor...,Journal Article,3339.0,,The known cancer susceptibility polymorphisms ...,1,...,0,0,0,0,0,0,0,0,"[3, 440, 12, 1432, 1203, 4, 5273, 22426, 19887...",1691.0
15131025,BRCA mutations and risk of prostate cancer in ...,Clinical cancer research : an official journal...,Clin. Cancer Res.,2004-05-01,The Breast Cancer Linkage Consortium and other...,Comparative Study,5743.0,111.0,The Cancer Linkage Consortium and other family...,1,...,0,0,0,0,0,0,0,0,"[3, 12, 4820, 2404, 2, 127, 607, 90, 54147, 47...",1429.0


In [388]:
onco_name='Kenneth Offit'
# Fetch this oncologist's clinical-trial records from the model object
# (the preview below shows NCTId, BriefTitle, Phase, Condition, cancer_type).
onco_clinical_df = onco.get_clinical_data(onco_name)

In [389]:
onco_clinical_df.head()

Unnamed: 0,NCTId,name,BriefTitle,Phase,Condition,cancer_type
684,NCT00579514,Kenneth Offit,Germline Alterations of Tumor Susceptibility G...,Not Applicable,"Breast Cancer, Bladder Cancer, Kidney Cancer, ...",Breast Cancer
685,NCT00579488,Kenneth Offit,Clinical Significance of Germline BRCA Mutations,,"Breast Cancer, Ovarian Cancer",Breast Cancer
686,NCT00590109,Kenneth Offit,Germline BRCA1 and BRCA2 Mutations in Jewish W...,,Breast Cancer,Breast Cancer


In [390]:
onco_clinical_df.shape

(3, 6)

In [391]:
onco_df.head()

Unnamed: 0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num
0,John H. Glick,"MD, FASCO",(215) 662-6334,University of Pennsylvania-Abramson Cancer Center,"3400 Civic Center Blvd3rd Fl, Ste 3-300S","Philadelphia, PA 19104-5127, US","['Breast Cancer', 'Cancer Prevention']","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,7,0
1,Arthur M. Feldman,MD,(215) 662-9801,University of Pennsylvania-Abramson Cancer Center,Penn Presbyterian Medcl Ctr51 N 39th St MAB St...,"Philadelphia, PA 19104, US","['Breast Cancer', 'Geriatrics Oncology']","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,0,1
2,David M. Mintzer,MD,Search for Phone Number,Abramson Cancer Center at Pennsylvania Hospital,230 W Washington Sq Fl 2,"Philadelphia, PA 19106-3500, US","['Breast Cancer', 'Lung Cancer', 'Palliative C...","['Hematology', 'Hospice and Palliative Medicin...",Abramson Cancer Center,13,0
3,David J. Vaughn,MD,(215) 349-8140,University of Pennsylvania-Abramson Cancer Center,PCAM South 10-1143400 Civic Center Blvd,"Philadelphia, PA 19104, US","['Bladder Cancer', 'Prostate Cancer', 'Testicu...",['Medical Oncology'],Abramson Cancer Center,86,1
4,Charles John Schneider,"MD, FACP",Search for Phone Number,"Hospital of the University of Pennsylvania, Ab...",Pereleman Center for Advanced Medicine3400 Civ...,"Philadelphia, PA 19104, US","['Clinical Research', 'Developmental Therapeut...",['Medical Oncology'],Abramson Cancer Center,0,5


In [279]:
print(test_df.shape)
test_df.head()

(181, 15)


Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Banu Arun,Banu Arun,"MD, FASCO",(713) 792-7090,University of Texas MD Anderson Cancer Center,1515 Holcombe blvd,"Houston, TX 77030, US",['Breast Cancer'],"['Hematology', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,181,7,"[29044548, 29733510, 16946209, 18779615, 21656...",68,"[0, 5, 6, 18, 27, 28, 29, 37, 53, 56, 66, 74, ...",4
Naoto T. Ueno,Naoto T. Ueno,"MD, PhD",(713) 792-8754,University of Texas MD Anderson Cancer Center,1515 Holcombe BlvdUnit 1354,"Houston, TX 77030-4000, US","['Breast Cancer', 'Clinical Research', 'Develo...","['Internal Medicine', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,201,13,"[29044548, 27135926]",2,"[0, 555]",1
Saundra S. Buys,Saundra S. Buys,MD,Search for Phone Number,Huntsman Cancer Institute - University of Utah...,2000 Circle of Hope Dr,"Salt Lake City, UT 84112-5550, US","['Breast Cancer', 'Genetics/Gene Therapy']","['Hematology', 'Medical Oncology']",Huntsman Cancer Institute,125,4,"[18704680, 17021353, 21118973, 19188678, 11792...",52,"[1, 4, 13, 38, 41, 89, 95, 113, 117, 127, 134,...",3
Charis Eng,Charis Eng,"MD, PhD, FACP",(216) 444-3440,Cleveland Clinic Foundation,Cleveland Clin Genomic Med Inst9500 Euclid Ave...,"Cleveland, OH 44195, US","['Breast Cancer', 'Endocrine Tumors', 'Genetic...","['Internal Medicine', 'Medical Oncology', 'Oth...",Cleveland Clinic,300,1,"[23740344, 24458845, 29684080, 16563180, 16032...",20,"[2, 20, 140, 166, 196, 201, 208, 217, 236, 241...",1
Claudine Isaacs,Claudine Isaacs,MD,Search for Phone Number,"Lombardi Cancer Center, Georgetown University",3800 Reservoir Rd NW,"Washington, DC 20007-2113, US","['Breast Cancer', 'Clinical Research', 'Geneti...","['Internal Medicine', 'Medical Oncology']",Georgetown Lombardi Comprehensive Cancer Center,185,6,"[12464649, 15095307, 18268356, 21118973, 18195...",91,"[3, 10, 12, 13, 16, 24, 25, 26, 35, 39, 43, 68...",5


In [393]:
# Reload the saved results without index_col, so 'name' stays an ordinary
# column and the frame gets a default integer index.
test_df=pd.read_csv(os.path.join(base_dir,'temp/temp.csv'))

In [394]:
test_df.head()

Unnamed: 0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile,clinical_trial_num2
0,Kenneth Offit,"MD, MPH, FASCO",(646) 888-4067,Memorial Sloan Kettering Cancer Center,1275 York AveInternal Box 192,"New York, NY 10065-6007, US",['Tumor Biology'],"['Hematology', 'Medical Oncology']",Memorial Sloan-Kettering Cancer Center,259,14,"[12464649, 14709740, 18268356, 21118973, 15131...",92,"[3, 8, 12, 13, 15, 17, 26, 48, 49, 52, 55, 65,...",6,4
1,Claudine Isaacs,MD,Search for Phone Number,"Lombardi Cancer Center, Georgetown University",3800 Reservoir Rd NW,"Washington, DC 20007-2113, US","['Breast Cancer', 'Clinical Research', 'Geneti...","['Internal Medicine', 'Medical Oncology']",Georgetown Lombardi Comprehensive Cancer Center,185,6,"[12464649, 15095307, 18268356, 21118973, 18195...",91,"[3, 10, 12, 13, 16, 24, 25, 26, 35, 39, 43, 68...",5,5
2,Mark E. Robson,"MD, FASCO",(646) 888-5434,Memorial Sloan Kettering Cancer Center,1275 York AveBreast Medicine Service,"New York, NY 10065-6007, US","['Breast Cancer', 'Cancer Prevention', 'Geneti...","['Hematology', 'Medical Oncology']",Memorial Sloan-Kettering Cancer Center,88,2,"[28765325, 18268356, 15131025, 14680495, 12655...",37,"[7, 12, 15, 17, 49, 52, 92, 94, 136, 157, 161,...",4,1
3,Banu Arun,"MD, FASCO",(713) 792-7090,University of Texas MD Anderson Cancer Center,1515 Holcombe blvd,"Houston, TX 77030, US",['Breast Cancer'],"['Hematology', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,181,7,"[29044548, 29733510, 16946209, 18779615, 21656...",68,"[0, 5, 6, 18, 27, 28, 29, 37, 53, 56, 66, 74, ...",4,7
4,Larry Norton,"MD, FASCO",(646) 888-5438,Memorial Sloan Kettering Cancer Center,1275 York Ave,"New York, NY 10065, US","['Breast Cancer', 'Clinical Research']",['Medical Oncology'],Memorial Sloan-Kettering Cancer Center,192,25,"[28765325, 18268356, 14680495, 26011570, 12023...",18,"[7, 12, 17, 79, 92, 123, 126, 180, 247, 257, 3...",3,7


In [219]:
test_df.shape

(95, 16)

In [221]:
test_df.head()

Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile,clinical_trial_Breast Cancer_num
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Kenneth Offit,Kenneth Offit,"MD, MPH, FASCO",(646) 888-4067,Memorial Sloan Kettering Cancer Center,1275 York AveInternal Box 192,"New York, NY 10065-6007, US",['Tumor Biology'],"['Hematology', 'Medical Oncology']",Memorial Sloan-Kettering Cancer Center,259,14,"[12464649, 14709740, 18268356, 21118973, 15131...",92,"[3, 8, 12, 13, 15, 17, 26, 48, 49, 52, 55, 65,...",6,4
Claudine Isaacs,Claudine Isaacs,MD,Search for Phone Number,"Lombardi Cancer Center, Georgetown University",3800 Reservoir Rd NW,"Washington, DC 20007-2113, US","['Breast Cancer', 'Clinical Research', 'Geneti...","['Internal Medicine', 'Medical Oncology']",Georgetown Lombardi Comprehensive Cancer Center,185,6,"[12464649, 15095307, 18268356, 21118973, 18195...",91,"[3, 10, 12, 13, 16, 24, 25, 26, 35, 39, 43, 68...",5,5
Mark E. Robson,Mark E. Robson,"MD, FASCO",(646) 888-5434,Memorial Sloan Kettering Cancer Center,1275 York AveBreast Medicine Service,"New York, NY 10065-6007, US","['Breast Cancer', 'Cancer Prevention', 'Geneti...","['Hematology', 'Medical Oncology']",Memorial Sloan-Kettering Cancer Center,88,2,"[28765325, 18268356, 15131025, 14680495, 12655...",37,"[7, 12, 15, 17, 49, 52, 92, 94, 136, 157, 161,...",4,1
Banu Arun,Banu Arun,"MD, FASCO",(713) 792-7090,University of Texas MD Anderson Cancer Center,1515 Holcombe blvd,"Houston, TX 77030, US",['Breast Cancer'],"['Hematology', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,181,7,"[29044548, 29733510, 16946209, 18779615, 21656...",68,"[0, 5, 6, 18, 27, 28, 29, 37, 53, 56, 66, 74, ...",4,7
Larry Norton,Larry Norton,"MD, FASCO",(646) 888-5438,Memorial Sloan Kettering Cancer Center,1275 York Ave,"New York, NY 10065, US","['Breast Cancer', 'Clinical Research']",['Medical Oncology'],Memorial Sloan-Kettering Cancer Center,192,25,"[28765325, 18268356, 14680495, 26011570, 12023...",18,"[7, 12, 17, 79, 92, 123, 126, 180, 247, 257, 3...",3,7


In [197]:
# Build the oncologist table for breast cancer with clinical_trial=True.
test_df = get_onco_info('Breast Cancer', breast_pmid,clinical_trial=True)
# A bare `test_df.shape` in the middle of a cell is evaluated and discarded;
# print it so the shape is actually shown together with the preview.
print(test_df.shape)
test_df.head()

Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,pmid_rank,pmid_rank_10th_percentile,clinical_trial_Breast Cancer_num
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Kelly Hunt,Kelly Hunt,MD,Search for Phone Number,University of Texas MD Anderson Cancer Center,Dept of Surgical Oncology - Unit 444P. O. Box ...,"Houston, TX 77230-1402, US","['Breast Cancer', 'Sarcoma', 'Clinical Trials/...",['Surgical Oncology'],The University of Texas MD Anderson Cancer Center,469,3,"[11773149, 11800340, 11884322, 11923128, 11923...",347,"[0, 5, 30, 47, 49, 72, 73, 76, 109, 120, 122, ...",8,3
Gabriel N. Hortobagyi,Gabriel N. Hortobagyi,"MD, FACP, FASCO",(713) 792-4124,University of Texas MD Anderson Cancer Center,Unit 1354PO Box 301439,"Houston, TX 77230-1439, US","['Breast Cancer', 'Cancer Education', 'Drug De...","['Internal Medicine', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,569,1,"[11773149, 11839684, 11844821, 11852999, 11856...",506,"[0, 13, 14, 18, 20, 41, 43, 76, 82, 94, 96, 10...",8,1
Larry Norton,Larry Norton,"MD, FASCO",(646) 888-5438,Memorial Sloan Kettering Cancer Center,1275 York Ave,"New York, NY 10065, US","['Breast Cancer', 'Clinical Research']",['Medical Oncology'],Memorial Sloan-Kettering Cancer Center,192,25,"[11821455, 11821456, 11870168, 11901151, 11981...",158,"[10, 11, 24, 37, 66, 85, 165, 313, 323, 335, 3...",5,7
Vicente Valero,Vicente Valero,MD,(713) 792-4124,University of Texas MD Anderson Cancer Center,PO Box 301439,"Houston, TX 77230-1439, US","['Breast Cancer', 'Clinical Trials/Biostatisti...","['Hematology', 'Internal Medicine', 'Medical O...",The University of Texas MD Anderson Cancer Center,239,5,"[11852999, 11914909, 11919237, 12006521, 12015...",212,"[18, 41, 43, 76, 82, 94, 96, 120, 129, 130, 14...",4,5
Nuhad K. Ibrahim,Nuhad K. Ibrahim,MD,(713) 792-2817,University of Texas MD Anderson Cancer Center,1515 Holcombe Blvd # 1354,"Houston, TX 77030-4000","['Breast Cancer', 'Clinical Trials/Biostatisti...","['Internal Medicine', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,45,2,"[11852999, 12006521, 12429622, 12429628, 12620...",41,"[18, 76, 213, 214, 305, 371, 737, 866, 1014, 1...",2,2


In [134]:
# Load the pickled mapping that is keyed by cancer type (see the membership
# test below). The context manager closes the file handle as soon as the
# pickle has been read instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(os.path.join(base_dir, "data/cancerclinical2doctor_dict.pkl"), "rb") as pkl_file:
    cancerclinical2doctor_dict = pickle.load(pkl_file)
clinical_trial=True
# NOTE(review): doctor_clin is only (re)assigned when cancer_type has an entry;
# otherwise any value left over from an earlier run stays in the kernel.
if clinical_trial and cancer_type in cancerclinical2doctor_dict:
    doctor_clin = cancerclinical2doctor_dict[cancer_type]

In [139]:
# Select the rows of test_df labelled by the entries of doctor_clin and order
# them by the rank_pmid column.
# NOTE(review): the most recent get_onco_info output in this notebook exposes
# pmid_rank / pmid_rank_10th_percentile rather than rank_pmid — confirm the
# sort column still exists before re-running this cell.
test_df2 = test_df.loc[np.array(doctor_clin)].sort_values(by=['rank_pmid'])
# Read the doctor -> cancer type lookup through a context manager so the file
# handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(os.path.join(base_dir, "data/doctor2clinical_dict2.pkl"), "rb") as pkl_file:
    doctor2clinical_dict2 = pickle.load(pkl_file)
# Add one column holding, for each doctor name, the lookup value for cancer_type.
test_df2['clinical_trial_{}_num'.format(cancer_type)]=test_df2.name.apply(lambda x: doctor2clinical_dict2[x][cancer_type])
test_df2.head()

Unnamed: 0_level_0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num,clinical_trial_num,query_pmid,query_pmid_num,rank_pmid,clinical_trial_Breast Cancer_num
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Gabriel N. Hortobagyi,Gabriel N. Hortobagyi,"MD, FACP, FASCO",(713) 792-4124,University of Texas MD Anderson Cancer Center,Unit 1354PO Box 301439,"Houston, TX 77230-1439, US","['Breast Cancer', 'Cancer Education', 'Drug De...","['Internal Medicine', 'Medical Oncology']",The University of Texas MD Anderson Cancer Center,569,1,"[11773149, 11839684, 11844821, 11852999, 11856...",506,0,1
Kelly Hunt,Kelly Hunt,MD,Search for Phone Number,University of Texas MD Anderson Cancer Center,Dept of Surgical Oncology - Unit 444P. O. Box ...,"Houston, TX 77230-1402, US","['Breast Cancer', 'Sarcoma', 'Clinical Trials/...",['Surgical Oncology'],The University of Texas MD Anderson Cancer Center,469,3,"[11773149, 11800340, 11884322, 11923128, 11923...",347,1,3
Eddy J. Chen,Eddy J. Chen,MD,(978) 287-3436,Massachusetts General Hospital Cancer Center,55 Fruit StMgh Cancer Center,"Boston, MA 02114-2621, US","['Bladder Cancer', 'Kidney Cancer', 'Medical O...","['Hematology', 'Internal Medicine', 'Medical O...",Massachusetts General Hospital,3,3,[11782367],1,2,2
Claudine Isaacs,Claudine Isaacs,MD,Search for Phone Number,"Lombardi Cancer Center, Georgetown University",3800 Reservoir Rd NW,"Washington, DC 20007-2113, US","['Breast Cancer', 'Clinical Research', 'Geneti...","['Internal Medicine', 'Medical Oncology']",Georgetown Lombardi Comprehensive Cancer Center,185,6,"[11786581, 11881908, 12023993, 12376518, 12464...",161,3,5
Saundra S. Buys,Saundra S. Buys,MD,Search for Phone Number,Huntsman Cancer Institute - University of Utah...,2000 Circle of Hope Dr,"Salt Lake City, UT 84112-5550, US","['Breast Cancer', 'Genetics/Gene Therapy']","['Hematology', 'Medical Oncology']",Huntsman Cancer Institute,125,4,"[11792833, 15217505, 16600944, 16626501, 16896...",96,5,4


In [126]:
# (rows, columns) of test_df2.
# NOTE(review): this cell's execution count is lower than that of the cell that
# builds test_df2, so the recorded output may be stale — re-run to refresh.
test_df2.shape

(280, 13)

In [136]:
# Load the nested doctor -> cancer type lookup. The context manager closes the
# file handle immediately after the pickle has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(os.path.join(base_dir, "data/doctor2clinical_dict2.pkl"), "rb") as pkl_file:
    doctor2clinical_dict2 = pickle.load(pkl_file)

In [137]:
# Spot-check one entry of the nested lookup: first key is the doctor name,
# second key is the cancer type.
doctor2clinical_dict2['Gabriel N. Hortobagyi']['Breast Cancer']

Counter({'Breast Cancer': 1})

In [None]:
def preprocess(text):
    """Tokenize ``text`` and return its lemmatized, filtered tokens.

    The text is split with ``gensim.utils.simple_preprocess``. Each token is
    lemmatized with ``WordNetLemmatizer`` and lower-cased; results that are
    at most one character long or that appear in ``stop_words`` are dropped.

    NOTE(review): ``stop_words`` is read from the enclosing (notebook) scope
    and is not defined in the visible cells — make sure it exists before
    calling this function.

    Parameters
    ----------
    text : str
        Raw text to tokenize (presumably an abstract or description — confirm
        against the callers).

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    result = []
    for token in gensim.utils.simple_preprocess(text):
        temp = lemmatizer.lemmatize(token).lower()
        if len(temp) > 1 and temp not in stop_words:
            result.append(temp)
    return result

In [67]:
# Gene symbol used for the PMID lookup below (matched against the HGNC column).
gene_name = 'BCR'

In [73]:
def get_pmid_for_gene(gene, data=None):
    """Return the unique PMIDs of the rows whose ``HGNC`` value equals ``gene``.

    Parameters
    ----------
    gene : str
        Gene symbol to look up in the ``HGNC`` column.
    data : pandas.DataFrame, optional
        Frame with ``HGNC`` and ``pmid`` columns. When omitted, the
        notebook-level ``gene_df`` is used; it is resolved at call time so a
        reloaded ``gene_df`` is picked up without redefining this function.

    Returns
    -------
    numpy.ndarray
        Unique ``pmid`` values in order of first appearance; empty when no
        row matches. Missing values are not filtered out.
    """
    if data is None:
        data = gene_df
    # A boolean mask compares against the raw value, so gene symbols containing
    # quotes or other characters cannot break (or alter) a query string.
    matches = data.loc[data['HGNC'] == gene, 'pmid']
    return matches.unique()

In [77]:
# Unique PMIDs of the gene_df rows whose HGNC value equals gene_name.
pmid_gene = get_pmid_for_gene(gene_name,data=gene_df)

In [78]:
# Echo the PMID array returned for gene_name.
pmid_gene

array([10079468, 11781252, 11790564, 11821445, 11857085, 11877262,
       11877311, 11914627, 11920509, 11932905, 11976731, 11979552,
       11986204, 11986206, 11986238, 11993784, 12036931, 12042704,
       12086872, 12086882, 12091333, 12114413, 12114417, 12114418,
       12124177, 12149456, 12161751, 12173333, 12176876, 12176916,
       12191565, 12196213, 12200353, 12204532, 12209598, 12209733,
       12214291, 12231544, 12351420, 12358901, 12359762, 12365015,
       12374452, 12393385, 12393600, 12394172, 12410573, 12411300,
       12414617, 12447845, 12528773, 12538464, 12560227, 12563615,
       12569603, 12576334, 12599232, 12613514, 12614767, 12617866,
       12618518, 12627512, 12631595, 12637317, 12637470, 12637609,
       12681367, 12691141, 12712475, 12712476, 12714820, 12727828,
       12730115, 12735513, 12755554, 12763937, 12767088, 12775739,
       12783368, 12784336, 12791647, 12817431, 12879469, 12893773,
       12902478, 12908554, 12921945, 12935973, 12942553, 12973

In [79]:
# Preview similarity_df (built outside this section of the notebook); the
# rendered output shows it indexed by pmid with an lda_similarity column.
similarity_df.head()

Unnamed: 0_level_0,lda_similarity
pmid,Unnamed: 1_level_1
27836010,0.999982
24412296,0.999965
19800463,0.999942
28535000,0.99994
25605296,0.99994


In [80]:
# Keep the entries of similarity_df's index that also occur in pmid_gene,
# preserving the index order. A set gives constant-time membership checks
# instead of scanning the NumPy array once per index entry.
pmid_gene_lookup = set(pmid_gene)
pmid_list = [pmid for pmid in similarity_df.index if pmid in pmid_gene_lookup]

In [82]:
# Number of similarity_df index entries that are also in pmid_gene.
len(pmid_list)

12

### Validating LDA models using text descriptions

In [3]:
# Read the per-name text table from the data directory, indexed by the
# 'name' column, then preview its first rows.
mskcc_text_df = pd.read_csv(
    os.path.join(base_dir, 'data/mskcc_text_dict_df.csv'),
    index_col='name',
)
mskcc_text_df.head()

Unnamed: 0_level_0,text
name,Unnamed: 1_level_1
Kenneth H. Yu,I am a medical oncologist with expertise in th...
Robert Sidlow,I am a board-certified general internist and p...
Lisa Marie Ruppert,I am a physician of Physical Medicine and Reha...
Yukio Sonoda,I am a gynecologic oncologist who performs abo...
Marsha Reyngold,I am a board-certified radiation oncologist wh...
