# 3. Text preprocessing

## Setup

In [1]:
%run __init__.py

In [2]:
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()



In [3]:
RANDOM_SEED = 42

## Common pipeline

## COVID-19
bla bla bla

### Loading the dataframe

In [None]:
CORD_DATASET_DIR = os.path.join(DATA_DIR, 'cord19')
CORD19_FILE_PATH = os.path.join(CORD_DATASET_DIR, 'cord19_dataframe.pkl')

cord19_df = pd.read_pickle(CORD19_FILE_PATH)

## Agriculture

### Loading the dataframe

In [4]:
AGRICULTURE_DATASET_DIR = os.path.join(DATA_DIR, 'agriculture')
PMC_FILE_PATH = os.path.join(AGRICULTURE_DATASET_DIR, 'pmc_dataframe.pkl')

pmc_df = pd.read_pickle(PMC_FILE_PATH)

### Preprocessing text

In [5]:
publications = pmc_df['text_cleaned'].values

In [6]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from sklearn.base import TransformerMixin, BaseEstimator

nlp = en_core_web_sm.load()
nlp.Defaults.stop_words |= {"et","al", "introduction", "Fig", "fig", "figure"}

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, n_jobs=1):
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *args, **kwargs):
        return [self._preprocess_text(text) for text in X]

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        doc = nlp(text)
        return [t.lemma_ for t in doc if len(t.text) > 2 and
                not t.is_stop and t.text not in string.punctuation
                and t.is_alpha and not t.is_digit]

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]
    
    def _remove_digits(self, doc):
        return [t for t in doc if t.is_alpha and not t.is_digit]

    def _lemmatize(self, doc):
        return [t.lemma_ for t in doc if len(t.text) > 2]
    

In [7]:
preprocessed_texts = TextPreprocessor().fit_transform(publications)

### Common functions
TODO: move to herc_common

In [8]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()



### LDA

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(doc):
    return doc


tf_vectorizer = CountVectorizer(preprocessor=dummy, tokenizer=dummy, ngram_range=(1, 1))
dtm_tf = tf_vectorizer.fit_transform(preprocessed_texts)

In [10]:
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim


def base_scoring_function(vectorizer, texts, model, X, top_n=20):
    return metric_coherence_gensim(measure='c_v', top_n=top_n, 
                                   topic_word_distrib=model.components_, 
                                   dtm=X, 
                                   vocab=np.array([x for x in vectorizer.vocabulary_.keys()]), 
                                   texts=texts, return_mean=True)

In [None]:
from functools import partial

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV


lda_scoring_func = lambda clf, X: base_scoring_function(vectorizer=tf_vectorizer,
                                        texts=preprocessed_texts, model=clf, X=X)

MIN_TOPICS = 7
MAX_TOPICS = 32

search_params = {
    'n_components': range(MIN_TOPICS, MAX_TOPICS, 2),
    'learning_decay': [.5, .7, .9]
}

search = RandomizedSearchCV(LatentDirichletAllocation(random_state=RANDOM_SEED), 
                            param_distributions=search_params, n_iter=15,
                            scoring=lda_scoring_func)
search.fit(dtm_tf)

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [18]:
list(range(7, 32, 2))

[7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]

In [13]:
best_lda_model = search.best_estimator_

print(f"Best pipeline parameters: {search.best_params_}")
print(f"Best Topic coherence: {search.best_score_}")

Best pipeline parameters: {'n_components': 1, 'learning_decay': 0.7}
Best Topic coherence: 0.8109366098700599


Visualization. Do this with the best model obtained before.

In [14]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(best_lda_model, tf_feature_names, 10)

Topic #0: plant stress high gene soil increase level study root rice



In [26]:
pyLDAvis.sklearn.prepare(best_lda_model, dtm_tf, tf_vectorizer, mds='tsne')

In [32]:
from sklearn.pipeline import Pipeline

lda_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                         ('vectorizer', CountVectorizer(preprocessor=dummy, tokenizer=dummy)),
                         ('model', LatentDirichletAllocation(**best_lda_model.get_params()))])

lda_pipeline.fit_transform(publications)

### Latent Semantic Analysis (LSA)

In [19]:
from sklearn.decomposition import TruncatedSVD

lsa_scoring_func = lda_scoring_func

lsa_search_params = {
    'n_components': [5, 10, 15, 20, 25, 30],
    'algorithm': ["randomized"]
}

lsa_search = RandomizedSearchCV(TruncatedSVD(random_state=RANDOM_SEED),
                                param_distributions=lsa_search_params, n_iter=15,
                                scoring=lsa_scoring_func)
lsa_search.fit(dtm_tf)

RandomizedSearchCV(estimator=TruncatedSVD(random_state=42), n_iter=1,
                   param_distributions={'algorithm': ['randomized'],
                                        'n_components': [5, 10, 15, 20, 25,
                                                         30]},
                   scoring=<function <lambda> at 0x7fb8573b8048>)

In [20]:
best_lsa_model = lsa_search.best_estimator_

print(f"LSA model")
print("-" * 10)
print(f"Best pipeline parameters: {lsa_search.best_params_}")
print(f"Best Topic coherence: {lsa_search.best_score_}")

LSA model
----------
Best pipeline parameters: {'n_components': 10, 'algorithm': 'randomized'}
Best Topic coherence: 0.752091402194526


In [21]:
best_lsa_model.components_

array([[ 1.11101933e-05,  5.66621733e-05,  8.31557596e-05, ...,
         2.37255096e-05,  2.37255096e-05,  2.37255096e-05],
       [ 5.90909490e-05, -4.66386579e-05, -4.66532114e-05, ...,
        -1.51990253e-05, -1.51990253e-05, -1.51990253e-05],
       [-2.42802194e-07, -8.32372179e-05, -1.27723294e-04, ...,
         6.53735330e-05,  6.53735330e-05,  6.53735330e-05],
       ...,
       [ 4.69763549e-06, -1.81172133e-04,  1.53590122e-04, ...,
         2.34028904e-06,  2.34028904e-06,  2.34028904e-06],
       [-1.86321678e-06, -1.70027495e-04,  1.53102131e-04, ...,
        -7.09106254e-06, -7.09106254e-06, -7.09106254e-06],
       [ 7.31959703e-06, -1.52168814e-04, -2.76786763e-04, ...,
         2.13589841e-08,  2.13589841e-08,  2.13589841e-08]])

In [22]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(best_lsa_model, tf_feature_names, 10)

Topic #0: plant stress gene high increase expression level root protein show
Topic #1: system soil crop food rice increase high base level agricultural
Topic #2: decoction orally take Wild Herb medicinal system Grinding food area
Topic #3: food system plant production virus value chain regime infect agricultural
Topic #4: stress take heat orally decoction Wild food system gene Herb
Topic #5: MED stress NUT treatment application report heat Karst informant temperature
Topic #6: root MED NUT report treatment application gene Karst expression informant
Topic #7: root growth non food regime plant system accession fungal shoot
Topic #8: virus stress specie native infect CMV food heat infection introduce
Topic #9: root virus sensor node specie power energy application adaptation agricultural



Visualization

In [None]:
tf_feature_names = lsa_pipeline['vectorizer'].get_feature_names()
print_top_words(lsa_pipeline['model'], tf_feature_names, 10)

In [16]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

lsa_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                 ('vectorizer', TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)),
                 ('model', TruncatedSVD(**lsa_search.get_params()))])
lsa_pipeline.fit_transform(publications)

### NMF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = CountVectorizer(preprocessor=dummy, tokenizer=dummy, ngram_range=(1, 1))
dtm_tfidf = tfidf_vectorizer.fit_transform(preprocessed_texts)

In [36]:
from sklearn.decomposition import NMF

nmf_scoring_func = lambda clf, X: base_scoring_function(vectorizer=tfidf_vectorizer,
                                                        texts=preprocessed_texts, model=clf, X=X)

nmf_search_params = {
    'n_components': [5, 10, 15, 20, 25, 30]
}

nmf_search = RandomizedSearchCV(NMF(random_state=RANDOM_SEED),
                                param_distributions=nmf_search_params, n_iter=15,
                                scoring=nmf_scoring_func)
nmf_search.fit(dtm_tfidf)



RandomizedSearchCV(estimator=NMF(max_iter=1, random_state=42), n_iter=1,
                   param_distributions={'n_components': [15]},
                   scoring=<function <lambda> at 0x7fb88f4f1400>)

In [37]:
best_nmf_model = nmf_search.best_estimator_

print(f"NMF model")
print("-" * 10)
print(f"Best pipeline parameters: {nmf_search.best_params_}")
print(f"Best Topic coherence: {nmf_search.best_score_}")

NMF model
----------
Best pipeline parameters: {'n_components': 15}
Best Topic coherence: 0.7410074169407151


In [38]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(best_nmf_model, tfidf_feature_names, 10)

Topic #0: plant stress expression increase gene level high show result root
Topic #1: soil crop rice increase system high yield practice study residue
Topic #2: take orally decoction Wild plant Herb medicinal Grinding area treat
Topic #3: system food production regime chain value type vegetable innovation agricultural
Topic #4: stress heat gene tolerance expression drought protein response level induce
Topic #5: plant MED treatment NUT report application Karst informant use oral
Topic #6: root MED shoot NUT expression report soil plant PRE precursor
Topic #7: gene expression protein transgenic show transcript construct control regulate resistance
Topic #8: virus plant specie native CMV infection infect introduce BYMV inoculate
Topic #9: root energy agricultural sensor node adaptation datum model power information
Topic #10: node sensor power base application agricultural protocol wireless energy network
Topic #11: growth cold accession Arabidopsis regime acclimation Yukon Shandong temp

In [58]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

nmf_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                         ('vectorizer', TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)),
                         ('model', NMF(**best_nmf_model.get_params()))])

nmf_pipeline.fit_transform(publications)

### Corex

In [43]:
import scipy.sparse as ss

from corextopic import corextopic as ct
from corextopic import vis_topic as vt


corex_vectorizer = CountVectorizer(preprocessor=dummy, tokenizer=dummy, binary=True)
binary_tf = corex_vectorizer.fit_transform(preprocessed_texts)
sparse_binary_tf = ss.csr_matrix(binary_tf)

sparse_binary_tf.shape # n_docs x m_words

(126, 28798)

In [None]:
corex_words = corex_vectorizer.get_feature_names()

corex_model = ct.Corex(n_hidden=50, words=corex_words, max_iter=200, verbose=False, seed=RANDOM_STATE)
corex_model.fit(sparse_binary_tf, words=corex_words)



In [None]:
corex_topics = corex_model.get_topics()
for n, topic in enumerate(corex_topics):
    topic_words,_ = zip(*topic)
    print('{}: '.format(n) + ','.join(topic_words))

Visualizing best number of topics

In [None]:
plt.figure(figsize=(10,5))
plt.bar(range(corex_model.tcs.shape[0]), corex_model.tcs, color='#4e79a7', width=0.5)
plt.xlabel('Topic', fontsize=16)
plt.ylabel('Total Correlation (nats)', fontsize=16);

Building the final model