In [1]:
% run pubmedplay.py

In [2]:
% matplotlib inline

In [4]:
term = raw_input('Enter the term of interest:')

Enter the term of interest:disorder


In [7]:
# default name
if term == "":
    term = "disorder"

#### Extract associated articles:

In [8]:
results = search(term,500)
id_list = results['IdList']
papers = fetch_details(id_list)

#### Extract paper abstracts:

In [9]:
listOfAbstracts = []
for paper in papers:
    if 'Abstract' in paper['MedlineCitation']['Article'].keys():
        listOfAbstracts.append(mergeAbstract(paper['MedlineCitation']['Article']['Abstract']['AbstractText']))

#### Extract disorders and concepts from Cognitive Atlas:

In [10]:
from cognitiveatlas.api import get_disorder
from cognitiveatlas.api import get_concept

In [11]:
disorders_json = get_disorder()

http://cognitiveatlas.org/api/v-alpha/disorder?
Result Includes:<pandas:data frame><json:dict><txt:str><url:str>


In [12]:
disorders = [disorder['name'] for disorder in disorders_json.json]

In [13]:
concepts_json = get_concept()

http://cognitiveatlas.org/api/v-alpha/concept?
Result Includes:<pandas:data frame><json:dict><txt:str><url:str>


In [14]:
concepts = [concept['name'] for concept in concepts_json.json]

In [15]:
all_terms =  list(set(concepts+disorders))

#### Create TF-IDF matrix:

In [16]:
vect = TfidfVectorizer(ngram_range = (1,2),max_df = 0.99,vocabulary = all_terms)
tfidf = vect.fit_transform(listOfAbstracts)

In [17]:
# Displaying TF-IDF
# extracting the tf-idf for the first document
row = pd.DataFrame({'tfidf':np.array(tfidf[0,].todense()).flatten()})
row.index = vect.get_feature_names()
row = row.sort(['tfidf'],ascending=[0])

In [18]:
print('Abstract:')
print(listOfAbstracts[0])

Abstract:
Darier disease is an autosomal dominant skin disorder caused by mutations in the ATPase, Ca++ transporting, cardiac muscle, slow twitch 2 (ATP2A2) gene and previously reported to cosegregate with bipolar disorder and schizophrenia in occasional pedigrees. It is, however, unknown whether these associations exist also in the general population, and the objective of this study was to examine this question.We compared a national sample of individuals with Darier disease and their first-degree relatives with matched unexposed individuals from the general population and their first-degree relatives, respectively. To examine risks for bipolar disorder and schizophrenia, risk ratios and 95% confidence intervals (CIs) were estimated using conditional logistic regressions.Individuals with Darier disease had a 4.3 times higher risk of being diagnosed with bipolar disorder (95% CI: 2.6-7.3) and a 2.3 times higher risk of being diagnosed with schizophrenia (95% CI: 1.1-5.2) than matched i

Tf-idf for first this abstract:

In [19]:
print(row.tfidf[row.tfidf>0.1])

bipolar disorder    0.597067
schizophrenia       0.566177
risk                0.541873
attention           0.125239
association         0.116805
Name: tfidf, dtype: float64


#### Non-negative Matrix Factorization:

In [20]:
num_topics = 2
num_top_words = 10
nmf = decomposition.NMF(n_components=num_topics, random_state=1)
doctopic = nmf.fit_transform(tfidf)
topic_words = []
vocab = np.array(vect.get_feature_names())

In [21]:
for topic in nmf.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])

In [22]:
from IPython.display import display

In [23]:
colnames = ['Topic '+str(i) for i in np.arange(num_topics)+1]

In [24]:
display(pd.DataFrame(topic_words,index = colnames).transpose())

Unnamed: 0,Topic 1,Topic 2
0,bipolar disorder,sleep
1,risk,movement
2,mood,risk
3,schizophrenia,dementia
4,association,association
5,anxiety,dream
6,attention,loss
7,psychosis,sleep disorder
8,mood disorder,anxiety
9,stress,fatigue


#### Graph Visualization:

In [25]:
import pyLDAvis

In [None]:
import networkx as nx

In [None]:
# create association graph:
A = tfidf.T*tfidf

In [None]:
a = nx.Graph(A)
pos = nx.circular_layout(a)
#nx.draw_circular(a)
#nx.draw(a,pos = pos)
nx.draw(a,pos = pos)