# 3. Text preprocessing

## Setup

In [1]:
%run __init__.py

INFO:root:Starting logger


In [2]:
import pandas as pd



## Common pipeline

## COVID-19
_TODO: describe the COVID-19 (CORD-19) dataset and how it is used in this notebook._

### Loading the dataframe

In [None]:
# Build the path to the pickled CORD-19 dataframe and load it.
# `os` and DATA_DIR are not defined in this cell — presumably they are
# provided by `%run __init__.py` in the Setup cell; TODO confirm.
CORD_DATASET_DIR = os.path.join(DATA_DIR, 'cord19')
CORD19_FILE_PATH = os.path.join(CORD_DATASET_DIR, 'cord19_dataframe.pkl')

# NOTE(review): read_pickle unpickles arbitrary Python objects, so only load
# files that come from a trusted source.
cord19_df = pd.read_pickle(CORD19_FILE_PATH)

## Agriculture

### Loading the dataframe

In [3]:
# Build the path to the pickled agriculture (PMC) dataframe and load it.
# `os` and DATA_DIR are not defined in this cell — presumably they are
# provided by `%run __init__.py` in the Setup cell; TODO confirm.
AGRICULTURE_DATASET_DIR = os.path.join(DATA_DIR, 'agriculture')
PMC_FILE_PATH = os.path.join(AGRICULTURE_DATASET_DIR, 'pmc_dataframe.pkl')

# NOTE(review): read_pickle unpickles arbitrary Python objects, so only load
# files that come from a trusted source.
pmc_df = pd.read_pickle(PMC_FILE_PATH)

### Preprocessing text

In [4]:
# Raw array of the 'text_cleaned' column; this is the corpus fed to the
# pipelines below (presumably one cleaned document per row — TODO confirm).
publications = pmc_df['text_cleaned'].values

In [14]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from sklearn.base import TransformerMixin, BaseEstimator

# Load the spaCy English model once at module level; TextPreprocessor reads
# this global `nlp` object when parsing texts.
nlp = en_core_web_sm.load()

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns raw texts into lists of lemmas.

    Every text is parsed with the module-level spaCy pipeline ``nlp``.
    Punctuation tokens and stop words are discarded and the lemma of each
    remaining token is returned, so the output of ``transform`` is a list
    of token lists (one per input text).

    Parameters
    ----------
    n_jobs : int, default=1
        Number of processes used by spaCy while parsing in ``transform``.
        With the default of 1 parsing happens in the current process. Any
        other value is forwarded to ``nlp.pipe(..., n_process=n_jobs)``,
        which needs a spaCy version that supports ``n_process``.
    """

    # Built once so the per-character punctuation test is a set lookup.
    _PUNCT_CHARS = frozenset(string.punctuation)

    def __init__(self, n_jobs=1):
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Do nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X, *args, **kwargs):
        """Return one list of lemmas per text in ``X``.

        Texts are streamed through ``nlp.pipe`` so spaCy can batch them
        instead of being called once per document. Extra positional and
        keyword arguments are accepted for signature compatibility and
        ignored.
        """
        if self.n_jobs == 1:
            docs = nlp.pipe(X)
        else:
            # Only pass n_process when parallelism was actually requested so
            # the default path does not depend on that keyword existing.
            docs = nlp.pipe(X, n_process=self.n_jobs)
        return [self._process_doc(doc) for doc in docs]

    def _preprocess_part(self, part):
        """Apply ``_preprocess_text`` to every element of ``part``.

        ``part`` must expose ``.apply`` (presumably a pandas Series chunk —
        TODO confirm). Not called by ``transform``.
        """
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Parse a single text with ``nlp`` and return its list of lemmas."""
        return self._process_doc(nlp(text))

    def _process_doc(self, doc):
        """Drop punctuation and stop words from ``doc``, then lemmatize."""
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _remove_punct(self, doc):
        """Return the tokens of ``doc`` that are not punctuation.

        A token is dropped when spaCy flags it as punctuation or when every
        character of its text is in ``string.punctuation``. The former
        ``t.text not in string.punctuation`` was a substring test, so
        multi-character tokens such as "..." or "--" were kept.
        """
        punct_chars = self._PUNCT_CHARS
        return [
            t for t in doc
            if not (t.is_punct or all(ch in punct_chars for ch in t.text))
        ]

    def _remove_stop_words(self, doc):
        """Return the tokens of ``doc`` that spaCy does not flag as stop words."""
        return [t for t in doc if not t.is_stop]

    def _remove_digits(self, doc):
        """Return the tokens of ``doc`` that are not all digits.

        Not part of the default preprocessing chain.
        """
        return [t for t in doc if not t.is_digit]

    def _lemmatize(self, doc):
        """Return the lemma string of every token in ``doc``."""
        return [t.lemma_ for t in doc]
    

### LDA

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def dummy(doc):
    """Return ``doc`` unchanged.

    Passed to the vectorizer as both ``preprocessor`` and ``tokenizer`` so it
    takes the token lists it is given as-is instead of processing raw strings.
    """
    return doc

# LDA topic-model pipeline: text preprocessing -> term counts -> LDA.
# The vectorizer gets the identity function as preprocessor and tokenizer
# because the 'preprocessing' step already returns a list of tokens per text.
pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                     ('vectorizer', CountVectorizer(preprocessor=dummy, tokenizer=dummy)),
                     ('model', LatentDirichletAllocation())])


In [None]:
# Fit every step of the pipeline on the corpus; the returned value (the
# transformed output of the final 'model' step) is not stored.
pipeline.fit_transform(publications)

In [8]:
# Run only the preprocessing step of the pipeline.
# NOTE(review): `p` is not assigned in any visible cell — presumably a subset
# of `publications` from a cell that was later removed; confirm before re-running.
processed_text = pipeline['preprocessing'].fit_transform(p)

126

In [None]:
# Show the 25 highest-weighted terms of every topic of the fitted model.
# print_top_words is defined in a later cell of this notebook, so that cell
# has to be executed before this one.
vectorizer = pipeline['vectorizer']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); prefer the new accessor and fall back to
# the old one so the cell runs on either side of that change.
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_feature_names = vectorizer.get_feature_names_out()
else:
    tf_feature_names = vectorizer.get_feature_names()
print_top_words(pipeline['model'], tf_feature_names, 25)

In [9]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of each topic, one line per topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight rows; each row must provide ``argsort()``.
        feature_names: Indexable mapping a column index to its term.
        n_top_words: Number of terms to show per topic.

    Prints ``Topic #<idx>: <terms separated by spaces>`` for every topic,
    followed by one empty line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it so the heaviest terms come first,
        # then keep only the leading n_top_words indices.
        best_first = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in best_first]
        print("Topic #{:d}: ".format(topic_idx) + " ".join(top_terms))
    print()



### Latent Semantic Analysis (LSA)

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# LSA pipeline: text preprocessing -> TF-IDF weights -> truncated SVD.
# Fixes over the previous version of this cell:
#   * the steps are wrapped in Pipeline(...) (they used to be a bare list),
#   * the vectorizer class is spelled TfidfVectorizer and imported here,
#   * the vectorizer gets the identity function `dummy` as preprocessor and
#     tokenizer, because the 'preprocessing' step already yields token lists
#     (same setup as the CountVectorizer in the LDA pipeline).
# Pipeline and dummy come from the LDA cell, which must be run first.
lsa_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                         ('vectorizer', TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)),
                         ('model', TruncatedSVD())])

### NMF

In [None]:
from sklearn.decomposition import NMF
# The class is spelled TfidfVectorizer; importing `TFIDFVectorizer` raises
# ImportError.
from sklearn.feature_extraction.text import TfidfVectorizer

# NMF pipeline: text preprocessing -> TF-IDF weights -> NMF.
# The vectorizer gets the identity function `dummy` as preprocessor and
# tokenizer because the 'preprocessing' step already yields token lists
# (same setup as the CountVectorizer in the LDA pipeline). Pipeline and dummy
# come from the LDA cell, which must be run first.
# NOTE(review): this rebinds `pipeline`, replacing the LDA pipeline built
# earlier, and the step is named 'vectorize' here but 'vectorizer' in the
# other pipelines — confirm both are intended.
pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                     ('vectorize', TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)),
                     ('model', NMF())])