# 3. Text preprocessing

## Setup

In [1]:
%run __init__.py

INFO:root:Starting logger


In [2]:
import pandas as pd



## Common pipeline

## COVID-19
bla bla bla

### Loading the dataframe

In [None]:
CORD_DATASET_DIR = os.path.join(DATA_DIR, 'cord19')
CORD19_FILE_PATH = os.path.join(CORD_DATASET_DIR, 'cord19_dataframe.pkl')

cord19_df = pd.read_pickle(CORD19_FILE_PATH)

## Agriculture

### Loading the dataframe

In [3]:
AGRICULTURE_DATASET_DIR = os.path.join(DATA_DIR, 'agriculture')
PMC_FILE_PATH = os.path.join(AGRICULTURE_DATASET_DIR, 'pmc_dataframe.pkl')

pmc_df = pd.read_pickle(PMC_FILE_PATH)

### Preprocessing text

In [4]:
publications = pmc_df['text_cleaned'].values

In [49]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from sklearn.base import TransformerMixin, BaseEstimator

nlp = en_core_web_sm.load()
nlp.Defaults.stop_words |= {"et","al", "introduction", "Fig"}

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, n_jobs=1):
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *args, **kwargs):
        return [self._preprocess_text(text) for text in X]

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        doc = nlp(text)
        return [t.lemma_ for t in doc if len(t.text) > 2 and
                not t.is_stop and t.text not in string.punctuation
                and t.is_alpha and not t.is_digit]

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]
    
    def _remove_digits(self, doc):
        return [t for t in doc if t.is_alpha and not t.is_digit]

    def _lemmatize(self, doc):
        return [t.lemma_ for t in doc if len(t.text) > 2]
    

### LDA

In [65]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def dummy(doc):
    return doc

lda_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                     ('vectorizer', CountVectorizer(preprocessor=dummy, tokenizer=dummy)),
                     ('model', LatentDirichletAllocation())])


In [None]:
from sklearn.model_selection import RandomizedSearchCV

search_params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'model__n_components': [10, 15, 20, 25, 30],
    'model__learning_decay': [.5, .7, .9]
}

search = RandomizedSearchCV(lda_pipeline, param_distributions=search_params, n_iter=15)
search.fit(publications)

In [None]:
best_lda_model = search.best_estimator_

print(f"Best pipeline parameters: {search.best_params_}")
print(f"Best Log Likelihood Score: {search.best_score_}")

In [52]:
tf_feature_names = pipeline['vectorizer'].get_feature_names()
print_top_words(pipeline['model'], tf_feature_names, 10)

Topic #0: plant transgenic gene line base control datum show sensor node
Topic #1: plant soil root carbon cell different phytolith sample non high
Topic #2: plant yield high temperature water crop grain leaf stress result
Topic #3: system crop food farmer farm level practice change study increase
Topic #4: plant infect protein infection infected Las non root virus shoot
Topic #5: plant stress increase level high gene growth expression treatment condition
Topic #6: rice soil root crop Fig treatment high water increase sample
Topic #7: plant gene protein expression stress figure show mutant response Fig
Topic #8: habitat farming mussel mission grassland agent target cost value bird
Topic #9: plant medicinal specie take informant Herb decoction virus orally report



In [9]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()



Visualization. Do this with the best model obtained before.

In [53]:
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()

In [62]:
preprocessed_text = TextPreprocessor().fit_transform(publications)
tf_vectorizer = CountVectorizer(preprocessor=dummy, tokenizer=dummy, ngram_range=(1, 3))
dtm_tf = tf_vectorizer.fit_transform(preprocessed_text)
lda_tf = LatentDirichletAllocation().fit(dtm_tf)

In [63]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')

### Latent Semantic Analysis (LSA)

In [None]:
from sklearn.decomposition import TruncatedSVD

lsa_pipeline = ([('preprocessing', TextPreprocessor()),
                 ('vectorizer', TFIDFVectorizer()),
                 ('model', TruncatedSVD())])

### NMF

In [None]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TFIDFVectorizer

pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                     ('vectorize', TFIDFVectorizer()),
                     ('model', NMF())])