In [1]:
import sys
sys.path.insert(1, '../src')
# Star import stays first so the explicit imports below take precedence over
# any same-named objects exported by `util`.
from util import *

import warnings
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from icecream import ic
from IPython.display import clear_output
from pandarallel import pandarallel
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from wordcloud import WordCloud

In [2]:
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", None)
pd.set_option("precision", 3)
%pylab inline
%config InlineBackend.figure_formats = ['retina']
pandarallel.initialize()

Populating the interactive namespace from numpy and matplotlib
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## **1. Read clean data**

In [3]:
# Load the preprocessed corpus bundle. `read_from_pickle` is not defined in
# this notebook — presumably it comes from `from util import *`.
# NOTE(review): if the helper wraps pickle.load, only open trusted files.
fn = '../data/data_preprocessed.pickle'
data = read_from_pickle(fn)
# Last expression: display the entries available in the bundle.
data.keys()

Read data from "../data/data_preprocessed.pickle"


dict_keys(['df_words_tkn', 'dict_ngrams', 'df_original'])

## **2. Remove unnecessary words**
These words and phrases are either the highest-level topics or otherwise uninformative, and keeping them substantially changes the modeling results.

In [4]:
# Unpack the three entries of the preprocessed bundle.
df_original = data['df_original']
df_words_tkn = data['df_words_tkn']
dict_ngrams = data['dict_ngrams']


def _discard_ngrams(ngrams, unwanted):
    """Remove every phrase in `unwanted` from `ngrams`, in place.

    A phrase that is not present is printed instead of raising, so the
    cell's output lists the phrases that could not be removed.  Only the
    "not found" errors are caught (ValueError from list.remove, KeyError
    from set.remove); any other error propagates instead of being hidden.
    """
    for phrase in unwanted:
        try:
            ngrams.remove(phrase)
        except (ValueError, KeyError):
            print(phrase)


# NOTE: removal mutates `dict_ngrams` in place, so re-running this cell
# without reloading the data prints every phrase (already removed).
remove_lst = ['mild traumatic brain', 'patient severe traumatic',
              'severe traumatic brain', 
              'injury traumatic brain', 'brain injury patient',
              'brain injury traumatic', 'traumatic brain injury',
              'central nervous system', 'patient traumatic brain'
             ]
_discard_ngrams(dict_ngrams['trigrams'], remove_lst)

remove_lst = ['brain injury', 'traumatic brain', 'head injury',
              'mild traumatic', 'long term', 'severe traumatic',
              'post traumatic', 'patient traumatic', 'present study',
              'head trauma', 'injury traumatic', 'brain damage','brain barrier',
              'aim study', 'blood brain', 'scale score', 'systematic review',
              'quality life'
             ]
_discard_ngrams(dict_ngrams['bigrams'], remove_lst)
    
# Words/phrases handed to `merge_ngrams` as its `exception_dict` argument.
# Despite the name this is a list, not a dict; the name is kept because later
# cells pass it by this name.
# Fix: a comma was missing after 'single', which made Python concatenate it
# with the next literal into the single entry 'singleparticipant'.
exception_dict = ['day', 'month', 'hour', 'le', 'case', 'time',
                  'level', 'effect', 'data', 'change', 'analysis',
                  'test', 'result', 'brain injury rat', 'outcome', 'finding', 'condition',
                  'cost','muscle', 'artery','area', 'normal', 'role', 
                  'period', 'function', 'potential', 'region', 'use',
                  'type', 'min', 'different', 'approach', 'method',
                  'increase', 'decrease', 'reduce', 'low', 'site',
                  'max', 'mean', 'higher', 'lower', 'measure',
                  'total', 'activity','response', 'research',
                  'non', 'current', 'specific', 'week', 'new', 'old',
                  'task', 'work', 'evidence','management', 'common',
                  'health', 'number','analysis', 'study','bbb',
                  'aki', 'pre', 'bank', 'national', 'center', 'admission',
                  'dc', 'pc', 'il', 'administration', 'addition', 'value',
                  'early', 'greater', 'major', 'overall', 'related', 'single',
                  'participant', 'individual', 'key', 'self', 'ratio', 'kg',
                  'rate', 'cns', 'trauma', 'article'
                 ]

## **3. Topic modeling using NMF**  

Steps:
- Merge n-grams
- Vectorize the documents using TF-IDF
    - Set min_df and max_df to remove the least and most frequent terms
- Train an NMF model ([Ref for parameters](https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py))
    - Use 5 topics for now, but the best number of topics for clustering still needs to be identified in the future




In [5]:
# Fit one TF-IDF vectorizer and one NMF model per publication year, keep the
# document-topic matrices, vectorizers and models keyed by year, and pickle
# all three dictionaries.
years = range(1991,2022)
n_topics = 5  # loop-invariant, so set once; also read by the similarity cells
doc_topics_dict = defaultdict(list)
vectorizer_dict = defaultdict(list)
model_dict = defaultdict(list)
for y in years:
    # Consider n-grams so that the tokenizer does not process specific phrases
    doc_text = (df_words_tkn[y]
                .parallel_apply(merge_ngrams,
                                dict_ngrams=dict_ngrams,
                                exception_dict=exception_dict)
                .apply(','.join))

    t_vectorizer = TfidfVectorizer(stop_words = 'english',
                                   tokenizer=dummy, # dummy tokenizer to maintain phrases
                                   analyzer='word',
                                   min_df=0.05,
                                   max_df=0.95,
                                   )
    doc_word = t_vectorizer.fit_transform(doc_text)

    # NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favour of
    # `alpha_W` / `alpha_H`; kept unchanged to match the environment this
    # notebook was executed in.
    t_nmf_model = NMF(n_topics, random_state=15,
                      beta_loss='kullback-leibler',
                      solver='mu', max_iter=2000,
                      alpha=0.1, l1_ratio=0.5,
                      init='nndsvd'
                     )
    doc_topics_dict[y] = t_nmf_model.fit_transform(doc_word)
    vectorizer_dict[y] = t_vectorizer
    model_dict[y] = t_nmf_model

    # Report progress only after year `y` has actually been fitted.
    if y % 4 == 0:
        print('fitted models for years from {0} to {1}'
              .format(years[0], y))

print('finished')
fn = '../data/nmf_models_all_year.pickle'
results_dict = {'doc_topics_dict': doc_topics_dict,
                'vectorizer_dict': vectorizer_dict,
                'model_dict': model_dict,
               }
save_as_pickle(fn, results_dict)

fitted models for years from 1991 to 1992
fitted models for years from 1991 to 1996
fitted models for years from 1991 to 2000
fitted models for years from 1991 to 2004
fitted models for years from 1991 to 2008
fitted models for years from 1991 to 2012
fitted models for years from 1991 to 2016
fitted models for years from 1991 to 2020
finished
Saved data to "../data/nmf_models_all_year.pickle"


## 4. Results and visualizations
Refer to [../figures](https://github.com/weizhao-BME/metis-project4/tree/main/figures) for all the figures generated from this section

### 4.1 Bargraph showing the weight of words for each topic  

In [6]:
# Save one bar-graph figure per year showing the top words of every topic.
for y in years:
    t_nmf_model, t_vectorizer = model_dict[y], vectorizer_dict[y]
    doc_topics = doc_topics_dict[y]
    # Assign each document to its highest-weight topic, then count the
    # documents per topic.
    doc_cluster = doc_topics.argmax(axis = 1)
    no_papers = pd.Series(doc_cluster).value_counts()

    feature_names = t_vectorizer.get_feature_names()
    plot_top_words(t_nmf_model, feature_names, 10, y, no_papers,
                   figsize=(30, 20))

    fn = '../figures/{0}_all_topics_bargraph.png'.format(y)
    plt.savefig(fn, dpi=300, bbox_inches='tight', facecolor="white")
    # Close the figure so 31 figures do not accumulate in memory.
    plt.close()

### 4.2 Extract all topics, with 10 words each, for every year from 1991 to 2021

In [7]:
# Collect the topic words of every fitted year; the result is indexed by year
# in the similarity analysis. `store_topic_words` is not defined in this
# notebook — presumably it comes from `from util import *`, so the number of
# words per topic is determined there (TODO confirm it matches the heading).
topic_words_dict = store_topic_words(model_dict, vectorizer_dict)

### 4.3 Word cloud figure for each topic and each year

In [8]:
# Save one word-cloud PNG per (year, topic), built from the highest-weighted
# terms of each NMF component.
N_CLOUD_WORDS = 7  # number of top terms fed to each word cloud
for y in years:
    words = vectorizer_dict[y].get_feature_names()
    # Column indices of the N_CLOUD_WORDS largest weights of every topic,
    # ordered from largest to smallest.
    t = model_dict[y].components_.argsort(axis=1)[:, -1:-(N_CLOUD_WORDS + 1):-1]
    topic_words = [[words[e] for e in l] for l in t]
    if y%4 == 0:
        print(y)
    for n in range(0,len(topic_words)):
        # NOTE(review): generate() re-tokenizes the joined text, so merged
        # multi-word phrases may be split and every term gets equal weight;
        # generate_from_frequencies with the component weights may be closer
        # to the intent — confirm before changing the figures.
        wc = WordCloud(background_color="white",
                       random_state=15,
                       width=400,
                       height=300,
                      ).generate(','.join(topic_words[n]))
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.imshow(wc, interpolation='bilinear')
        ax.axis("off")
        fn = '../figures/{0}_topics_{1}.png'.format(y, n)
        fig.savefig(fn, dpi=300, bbox_inches='tight', facecolor="white")
        # Close this specific figure; 31 years x 5 topics would otherwise pile up.
        plt.close(fig)



1992
1996
2000
2004
2008
2012
2016
2020


## **5. Semantic similarity of topic words**
To investigate the development of these identified topics, a semantic similarity analysis was performed. Using all the available documents, a word2vec model was trained. 

In [9]:
# Build token lists for the whole corpus (all documents, not split by year)
# as training input for word2vec. `word_tokenize`, `del_abbreviation` and
# `merge_ngrams` are not defined in this notebook — presumably they come from
# `from util import *`.
# NOTE: despite its name, `df` here is a Series of token lists (see the
# output below), not a DataFrame.
df = (df_original['title_and_abstract_lemma']
      .parallel_apply(word_tokenize)
      .parallel_apply(del_abbreviation)
      .parallel_apply(merge_ngrams,
                      dict_ngrams=dict_ngrams,
                      exception_dict=exception_dict))
df.head()

0    [nondisclosure, collegiate, student, athlete, ...
1    [extracellular, vesicle, concentration, glial,...
2    [artificial, intelligence, report, fictional, ...
3    [exo70, intracellular, redistribution, mild tr...
4    [diffusion, tensor, neuropsychological, perfor...
Name: title_and_abstract_lemma, dtype: object

In [10]:
# Train a word2vec model on the tokenized corpus.
# Materialize the corpus once and reuse it for both the vocabulary scan and
# training instead of converting the Series twice.
sentences = df.to_list()

# NOTE(review): `size=` and `init_sims` are the gensim 3.x spellings (gensim 4
# renamed `size` to `vector_size` and deprecated `init_sims`); kept unchanged
# to match the environment this notebook was executed in.
# NOTE: with workers > 1 training is not exactly reproducible despite `seed`.
w2v_model = Word2Vec(min_count=20,
                     window=3,
                     size=500,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=12,
                     seed=15)

w2v_model.build_vocab(sentences=sentences, progress_per=10000)
w2v_model.train(sentences=sentences,
                total_examples=w2v_model.corpus_count,
                epochs=50, report_delay=1)
# Replace the raw vectors with their L2-normalized versions in place.
w2v_model.init_sims(replace=True)

In [11]:
# Persist the trained word2vec model with the `save_as_pickle` helper (not
# defined in this notebook — presumably from `from util import *`).
fn = '../data/w2v_model.pickle'
save_as_pickle(fn, w2v_model)

Saved data to "../data/w2v_model.pickle"


### Cosine similarity between each pair of topics (5x5=25 pairs) of previous and following years

In [12]:
# comparison_mat[i, j, k] holds the word2vec similarity between topic i of the
# year before years[k] and topic j of years[k].
comparison_mat = np.zeros([n_topics, n_topics, len(years)])
for idx_year, y in enumerate(years):
    if idx_year == 0:
        # The first year has no predecessor. Skip it explicitly rather than
        # looking up a year that was never fitted (which raises KeyError on a
        # plain dict, or silently inserts an empty entry on a defaultdict).
        # Slice 0 therefore stays all zeros.
        continue
    t1, t2 = topic_words_dict[years[idx_year - 1]], topic_words_dict[y]
    for idx_1, word_lst_1 in enumerate(t1):
        for idx_2, word_lst_2 in enumerate(t2):
            comparison_mat[idx_1, idx_2, idx_year] = (w2v_model
                                                      .wv
                                                      .n_similarity(word_lst_1,
                                                                    word_lst_2))

### For each topic of previous years, identify its corresponding most similar topic of following years

In [13]:
# Follow each starting topic through the years: at step i, move from the
# topic matched at step i-1 to the topic with the largest similarity in
# slice i of `comparison_mat`, and record that similarity.
n_steps = comparison_mat.shape[2]
idx_max = np.zeros([n_topics, len(years)], dtype=int)
idx_max[:, 0] = np.arange(0, n_topics)  # step 0: every topic maps to itself
max_score = np.zeros([n_topics, len(years)])
for n in range(0, n_topics):
    for i in range(1, n_steps):
        # Similarities from the previously matched topic to every topic at step i
        sims = comparison_mat[idx_max[n, i-1], :, i]
        idx_max[n, i] = sims.argmax()
        max_score[n, i] = sims.max().round(2)

### Semantic cosine similarity scores for each topic year by year

In [14]:
# For every tracked topic, save a bar chart of its year-to-year similarity
# score, one bar per pair of consecutive years.

# Labels such as '1991-1992', one per pair of consecutive years.
x_labels = [str(prev) + '-' + str(nxt)
            for prev, nxt in zip(years[:-1], years[1:])]
y_ticks = np.arange(0, 1.1, 0.2).round(1)

fontdict={'fontsize': 15}
for n in range(0, n_topics):
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.set_title('Topic {}'.format(n), fontdict={'fontsize': 18})
    sns.barplot(x=list(years[1:]),
                y=max_score[n,1:],
                color=[0.2, 0.4, 0.6],
                edgecolor=(0,0,0),
                ax=ax,
               )

    ax.set_xticklabels(x_labels,
                       rotation=90,
                       fontdict={'fontsize': 13},
                      )
    # Pin the tick positions before labelling them; otherwise the labels are
    # attached to whatever ticks autoscaling happened to pick.
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticks,
                       fontdict=fontdict,
                      )
    ax.set_ylim([0,1])
    # Derived from the number of bars (30.1 for the 30 year pairs of 1991-2021).
    ax.set_xlim([-0.8, len(x_labels) + 0.1])

    # Widen every bar to a full unit so neighbouring bars touch.
    for bar in ax.patches:
        bar.set_width(1)
    ax.set_xlabel('Years', fontdict=fontdict)
    ax.set_ylabel('Semantic Cosine Similarity',
                  fontdict=fontdict)

    fn = '../figures/topic_{}_development.svg'.format(n)
    fig.savefig(fn, dpi=300, bbox_inches='tight', facecolor="white")
    plt.close(fig)

