Attempt topic modeling on the words in all abstracts and articles for best doctors only to categorize their specialties. The column "Articlewords" contains all titles amd abstracts of papers for the doctors.

In [24]:
#relevant imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [25]:
#combine best docs in Boston and NY to one dataframe and do topic modeling on resultant text
df = pd.read_pickle('BestNY.pkl')
df_NY = pd.read_pickle('bestbos.pkl')
df = pd.concat([df,df_NY])
df = df.ArticleWords
del df_NY
df.head()

fullname
Sylvia Adams         NFS1 undergoes positive selection in lung tumo...
Michael Grossbard    Survival of Asian Females With Advanced Lung C...
Francisco Esteva     A phase I trial of ganetespib in combination w...
Cynthia Leichman     Phase II Study of Olaparib (AZD-2281) After St...
Daniel Cho           Phase Ib Study of Safety and Pharmacokinetics ...
Name: ArticleWords, dtype: object

In [26]:
#pre-process words 
#make lowercase
df = df.apply(lambda x: " ".join(x.lower() for x in x.split()))
df.head(3)

fullname
Sylvia Adams         nfs1 undergoes positive selection in lung tumo...
Michael Grossbard    survival of asian females with advanced lung c...
Francisco Esteva     a phase i trial of ganetespib in combination w...
Name: ArticleWords, dtype: object

In [27]:
#remove punctuation
df = df.str.replace('[^\w\s]','')
df.head()

fullname
Sylvia Adams         nfs1 undergoes positive selection in lung tumo...
Michael Grossbard    survival of asian females with advanced lung c...
Francisco Esteva     a phase i trial of ganetespib in combination w...
Cynthia Leichman     phase ii study of olaparib azd2281 after stand...
Daniel Cho           phase ib study of safety and pharmacokinetics ...
Name: ArticleWords, dtype: object

In [29]:
#remove stopwords
stop = stopwords.words('english')
df = df.apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head()

fullname
Sylvia Adams         nfs1 undergoes positive selection lung tumours...
Michael Grossbard    survival asian females advanced lung cancer er...
Francisco Esteva     phase trial ganetespib combination paclitaxel ...
Cynthia Leichman     phase ii study olaparib azd2281 standard syste...
Daniel Cho           phase ib study safety pharmacokinetics pi3k in...
Name: ArticleWords, dtype: object

In [43]:
#most common words, tenth word is breast, should keep, not removing any, important for TF-IDF
freq = pd.Series(' '.join(df).split()).value_counts()[:8]
freq

breast          4021
tumor           3867
cells           3703
risk            3590
disease         3320
chemotherapy    3244
associated      3231
phase           3198
dtype: int64

In [44]:
#remove most common words
freq = list(freq.index)
df = df.apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df.head()

fullname
Sylvia Adams         nfs1 undergoes positive selection lung tumours...
Michael Grossbard    asian females advanced lung era tyrosine kinas...
Francisco Esteva     trial ganetespib combination paclitaxel trastu...
Cynthia Leichman     ii olaparib azd2281 standard systemic therapie...
Daniel Cho           ib safety pharmacokinetics pi3k inhibitor sar2...
Name: ArticleWords, dtype: object

In [45]:
#remove rare words: all words repeated less than 10 times? 
freq = pd.Series(' '.join(df).split()).value_counts()[-29000:]
freq.head()

mirvetuximab       10
generator          10
originate          10
conformational     10
vulnerabilities    10
dtype: int64

In [46]:
freq.tail()

ipeptidedisplaying       1
gemcitabineresistance    1
therapyresistance        1
apxl                     1
damagedna                1
dtype: int64

In [47]:
#this takes a WHILE to run, be careful!
freq = list(freq.index)
df = df.apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [48]:
df.head()

fullname
Sylvia Adams         positive selection lung tumours protects envir...
Michael Grossbard    asian females advanced lung era tyrosine kinas...
Francisco Esteva     trial ganetespib combination paclitaxel trastu...
Cynthia Leichman     ii olaparib standard systemic therapies dissem...
Daniel Cho           ib safety pharmacokinetics pi3k inhibitor huma...
Name: ArticleWords, dtype: object

In [49]:
len(df)

99

In [50]:
# NMF is able to use tf-idf, LDA does not. try both
no_features = 1000

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()


In [51]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [52]:
no_topics = 15

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10

In [53]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [54]:
#check Non-negative Matrix Factorization topics
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
inhibitor factor gene carcinoma activity result growth combination human protein
Topic 1:
lymphoma hodgkin hl dlbcl rituximab bcell mcl nhl relapsed nonhodgkin
Topic 2:
prostate castrationresistant androgen men ar deprivation crpc psa adt prednisone
Topic 3:
ovarian serous epithelial endometrial eoc uterine recurrent gynecologic carboplatin paclitaxel
Topic 4:
colorectal pancreatic rectal colon crc neuroendocrine oxaliplatin irinotecan 5fluorouracil hepatic
Topic 5:
leukemia acute aml transplantation myeloid hct allogeneic gvhd hematopoietic donor
Topic 6:
melanoma ipilimumab immune vaccine ctla4 immunotherapy antigen antibody braf nivolumab
Topic 7:
myeloma mm bortezomib lenalidomide dexamethasone relapsed proteasome bone marrow thalidomide
Topic 8:
urothelial bladder germ carcinoma cisplatin lymph pdl1 firstline node testicular
Topic 9:
sarcoma gist osteosarcoma soft stromal imatinib kit tissue gastrointestinal pdl1
Topic 10:
brca1 mutation brca2 carrier ovarian woman suscep

In [55]:
#check LDA topics
display_topics(lda, tf_feature_names, no_top_words)


Topic 0:
leukemia prostate acute inhibitor lymphoma year gene mutation factor drug
Topic 1:
transplantation acute leukemia stem donor blood allogeneic year ci hematopoietic
Topic 2:
woman adjuvant mutation age ovarian young year factor carcinoma recurrence
Topic 3:
prostate gene mutation factor inhibitor analysis carcinoma ovarian group result
Topic 4:
lymphoma inhibitor prostate activity factor overall strategy hodgkin rate refractory
Topic 5:
prostate leukemia acute men bone androgen aml myeloid mutation year
Topic 6:
mm myeloma inhibitor mutation activity lymphoma colorectal bone novel growth
Topic 7:
lung mutation egfr colorectal inhibitor carcinoma factor adenocarcinoma growth resistance
Topic 8:
colorectal mutation association woman 95 brca1 ci carrier gene brca2
Topic 9:
prostate mutation castrationresistant factor analysis androgen gene result men 95
Topic 10:
lymphoma myeloma mm inhibitor relapsed bortezomib activity hodgkin combination novel
Topic 11:
gene sarcoma mutation nf

### We can see that NMF outperforms LDA, likely because it uses TF-IDF scoring.

In [56]:
#manually label each speciality
topic_list = ["General Neoplasms","Lymphoma",
              "Prostrate Cancer","Ovarian Cancer","Pancreatic & Colorectal Cancer"
              ,"Leukemia","Melanoma","Myeloma & Marrow Cancer",
             "Bladder Cancer","Osteosarcoma","Breast Cancer","Early Stage Breast Cancer",
              "Head & Neck Tumors","Lung Cancer", "Kidney Cancer"]

In [57]:
#doc_topic_mat = model.transform(word_embedding)
#model.transform = nmf.transform
word_embedding = tf_vectorizer.fit_transform(documents)

#check the matrix that assigns each doctor a rank for the specialty
doc_topic_mat = nmf.transform(word_embedding)



In [58]:
#see what this looks like.
doc_topic_mat.shape

(99, 15)

In [59]:
#SAVE top 3 topics/specialties. It is expected that all docs will be in "general" category. 
indices_top_3_topics = np.argpartition(doc_topic_mat, -3, axis=1)[:, -3:]
indices_top_3_topics

array([[11,  6,  0],
       [ 4,  1,  0],
       [13,  0, 11],
       [11,  4,  0],
       [ 6,  0, 14],
       [ 4, 11,  0],
       [13,  0, 12],
       [ 1,  5,  0],
       [10,  8,  0],
       [ 1, 13,  0],
       [ 5,  2,  0],
       [14,  2,  0],
       [ 1,  4,  0],
       [ 5, 10,  0],
       [ 3,  0,  1],
       [ 1,  0,  7],
       [ 5,  0,  7],
       [ 1,  0, 12],
       [13, 11,  0],
       [ 4,  0, 11],
       [ 0, 12,  5],
       [ 5,  6, 14],
       [ 7,  1,  0],
       [ 4,  0, 10],
       [14,  8,  0],
       [ 2, 13,  0],
       [10, 13,  0],
       [10, 13,  0],
       [ 6,  9,  0],
       [ 3,  0, 11],
       [10,  3,  0],
       [10, 13,  0],
       [13,  4,  0],
       [14, 12,  0],
       [ 4,  3,  0],
       [10,  5,  0],
       [14,  4,  0],
       [ 6,  9,  0],
       [13,  0,  6],
       [11, 13,  0],
       [ 9,  3,  0],
       [10, 13,  0],
       [ 8, 14,  0],
       [ 9,  0,  6],
       [12,  8,  0],
       [11,  4,  0],
       [12,  2,  0],
       [ 1,  

### These topics can now be used to suggest doctor speciality. 