# 3. Text preprocessing

## Setup

In [1]:
%run __init__.py

INFO:root:Starting logger


In [2]:
import pandas as pd



## Common pipeline

## COVID-19
bla bla bla

### Loading the dataframe

In [None]:
CORD_DATASET_DIR = os.path.join(DATA_DIR, 'cord19')
CORD19_FILE_PATH = os.path.join(CORD_DATASET_DIR, 'cord19_dataframe.pkl')

cord19_df = pd.read_pickle(CORD19_FILE_PATH)

## Agriculture

### Loading the dataframe

In [3]:
AGRICULTURE_DATASET_DIR = os.path.join(DATA_DIR, 'agriculture')
PMC_FILE_PATH = os.path.join(AGRICULTURE_DATASET_DIR, 'pmc_dataframe.pkl')

pmc_df = pd.read_pickle(PMC_FILE_PATH)

### Preprocessing text

In [4]:
publications = pmc_df['text_cleaned'].values

In [5]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from sklearn.base import TransformerMixin, BaseEstimator

nlp = en_core_web_sm.load()
nlp.Defaults.stop_words |= {"et","al", "introduction", "Fig", "fig", "figure"}

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, n_jobs=1):
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *args, **kwargs):
        return [self._preprocess_text(text) for text in X]

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        doc = nlp(text)
        return [t.lemma_ for t in doc if len(t.text) > 2 and
                not t.is_stop and t.text not in string.punctuation
                and t.is_alpha and not t.is_digit]

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]
    
    def _remove_digits(self, doc):
        return [t for t in doc if t.is_alpha and not t.is_digit]

    def _lemmatize(self, doc):
        return [t.lemma_ for t in doc if len(t.text) > 2]
    

### LDA

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def dummy(doc):
    return doc

lda_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                     ('vectorizer', CountVectorizer(preprocessor=dummy, tokenizer=dummy)),
                     ('model', LatentDirichletAllocation())])


In [7]:
from sklearn.model_selection import RandomizedSearchCV

search_params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'model__n_components': [10, 15, 20, 25, 30],
    'model__learning_decay': [.5, .7, .9]
}

search = RandomizedSearchCV(lda_pipeline, param_distributions=search_params, n_iter=15)
search.fit(publications)

KeyboardInterrupt: 

In [71]:
best_lda_model = search.best_estimator_

print(f"Best pipeline parameters: {search.best_params_}")
print(f"Best Log Likelihood Score: {search.best_score_}")

Best pipeline parameters: {'vectorizer__ngram_range': (1, 1), 'model__n_components': 10, 'model__learning_decay': 0.7}
Best Log Likelihood Score: -847826.416759485


In [14]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()



Visualization. Do this with the best model obtained before.

In [28]:
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()

In [25]:
# {'vectorizer__ngram_range': (1, 1), 'model__n_components': 10, 'model__learning_decay': 0.7}

preprocessed_text = TextPreprocessor().fit_transform(publications)
tf_vectorizer = CountVectorizer(preprocessor=dummy, tokenizer=dummy, ngram_range=(1, 1))
dtm_tf = tf_vectorizer.fit_transform(preprocessed_text)
lda_tf = LatentDirichletAllocation(n_components=10, learning_decay=0.7).fit(dtm_tf)

In [26]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda_tf, tf_feature_names, 10)

Topic #0: habitat plant growth grassland cold bird accession Arabidopsis regime high
Topic #1: plant virus infect infection non sample infected specie leave light
Topic #2: plant protein expression gene root level response acid stress induce
Topic #3: system food crop plant practice take farmer orally decoction farm
Topic #4: soil rice study site high value water sample season emission
Topic #5: plant population value datum volatile high result base flower metabolite
Topic #6: soil crop root study plant rodent day yield Salmonella stage
Topic #7: plant gene stress transgenic expression line control show protein salt
Topic #8: plant soil rice increase root treatment stress high concentration content
Topic #9: plant application crop MED base sensor treatment report system node



In [29]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')

INFO:numexpr.utils:Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


### Latent Semantic Analysis (LSA)

In [16]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

lsa_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                 ('vectorizer', TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)),
                 ('model', TruncatedSVD(n_components=10))])

In [21]:
tf_feature_names = lsa_pipeline['vectorizer'].get_feature_names()
print_top_words(lsa_pipeline['model'], tf_feature_names, 10)

Topic #0: plant stress gene rice soil expression transgenic root increase protein
Topic #1: rice soil crop season emission residue system farm farmer winter
Topic #2: food farmer farm system household farming livestock crop climate diversity
Topic #3: stress heat salt tolerance drought NaCl food ROS salinity ALDH
Topic #4: transgenic gene rice expression emission crop line season protein drought
Topic #5: medicinal salt heat yield plant stress informant NaCl hypertension season
Topic #6: transgenic salt rice sensor food NaCl iot emission node system
Topic #7: emission sensor rice node season iot temperature heat defense Las
Topic #8: medicinal gene mutant informant hypertension wild decoction hydrophilin protein RNAi
Topic #9: endophyte yield sensor node grain root iot bean soil irrigation



In [30]:
lsa_search_params = {
    'vectorizer__min_df': [1, 2, 3],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3)],
    'model__n_components': [5, 10, 15, 20, 25, 30],
    'model__algorithm': ["arpack", "randomized"]
}

lsa_search = RandomizedSearchCV(lsa_pipeline, param_distributions=lsa_search_params, n_iter=2)
lsa_search.fit(publications)

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator Pipeline(steps=[('preprocessing', TextPreprocessor()),
                ('vectorizer',
                 TfidfVectorizer(preprocessor=<function dummy at 0x7f01195fd7b8>,
                                 tokenizer=<function dummy at 0x7f01195fd7b8>)),
                ('model', TruncatedSVD(n_components=10))]) does not.

Visualization

In [None]:
tf_feature_names = lsa_pipeline['vectorizer'].get_feature_names()
print_top_words(lsa_pipeline['model'], tf_feature_names, 10)

### NMF

In [19]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

nmf_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                     ('vectorizer', TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)),
                     ('model', NMF(n_components=10))])

In [None]:
nmf_pipeline.fit_transform(publications)

In [23]:
tf_feature_names = nmf_pipeline['vectorizer'].get_feature_names()
print_top_words(nmf_pipeline['model'], tf_feature_names, 10)

Topic #0: plant endophyte defense herbivore metabolite volatile GSL insect touch expression
Topic #1: rice soil emission season crop residue paddy straw winter treatment
Topic #2: food farmer farm system household CSA climate farming crop livestock
Topic #3: stress plant salt heat tolerance NaCl drought gene expression treatment
Topic #4: transgenic plant gene expression line protein sequence transcript overexpressor RNAi
Topic #5: metal toxicity soil plant root concentration heavy acid uptake increase
Topic #6: sensor iot node IoT wireless network communication Smart power Campus
Topic #7: plant infect CMV Las virus infected infection NahG citri symptomatic
Topic #8: medicinal plant informant hypertension decoction Herb traditional specie orally ailment
Topic #9: crop yield soil legume grain bird bean habitat residue wheat



In [None]:
nmf_pip