In [None]:
# November 2022
# Text classifier using logistic regression
# Violeta Berdejo-Espinola

In [None]:
# linting
# !nbqa pylint 1.pre_process_main_text.ipynb

# background theme 
# !jt -t monokai -cellw 90% #grade3

In [1]:
# from IPython.display import display, HTML
# display(HTML("<style>.container { width:90% !important; }</style>"))

import numpy as np
import pandas as pd
import os
import re
import string

pd.options.display.max_columns = 65
# os.getcwd()

In [2]:
# Load the labelled dataset (columns used below: title_spa, abstract_spa, label).
df = pd.read_csv('../datasets/py_outputs/pos_neg.csv', encoding='utf-8')

from sklearn.preprocessing import LabelEncoder

# Encode the class column as integers. `fit_transform` learns the classes from the
# column itself, so a separate `fit([0, 1])` beforehand would simply be overwritten.
labeler = LabelEncoder()
df["label"] = labeler.fit_transform(df["label"])

# Drop the literal heading "Resumen" wherever it occurs in the abstract text.
df['abstract_spa'] = df['abstract_spa'].str.replace(r'Resumen', '', regex=True)

In [4]:
from collections import Counter
counter= Counter()



In [9]:
# x data

# Label-based slice: keeps every column from "title_spa" through "abstract_spa" (inclusive).
corpus_df = df.loc[:,"title_spa":"abstract_spa"]
# One inner list of column values per row of corpus_df.
corpus_list = corpus_df.values.tolist()

print(f'length of article list is {len(corpus_list)}')

length of article list is 5019


In [10]:
# Sanity checks on the loaded frame: row count, duplicated titles,
# class balance and number of missing labels.
print(len(df))
print(df.title_spa.duplicated().any())
print(df["label"].value_counts())
print(df["label"].isna().sum())
# NOTE(review): the next two expressions are not the last statement of the cell,
# so the notebook renders nothing for them; only `corpus_list[:2]` is displayed.
df.iloc[0:62]
df.head(63)
corpus_list[:2]

5019
False
0    4957
1      62
Name: label, dtype: int64
0


[['Manejo forestal comunitario en el sur de méxico: ¿es una práctica sustentable para el mantenimiento de los ensambles de escarabajos?',
  'Evaluamos los cambios en la diversidad, estructura y composición de especies de escarabajos copronecrófagos (Coleoptera: Scarabaeidae: Scarabaeinae) en un paisaje bajo manejo forestal comunitario en el sur de México. Se dispusieron trampas de caída cebadas con excretas de cerdo y calamar en descomposición en sitios con diferente tratamiento forestal. En total, se registraron 3,608 individuos y 21 especies de Scarabaeinae. Registramos un mayor número de especies en el área sin intervención y cambios significativos en la composición de especies entre sitios. Dos escarabajos generalistas (Ontherus mexicanus y Onthophagus cyanellus) representaron 51% de la abundancia total. Nuestros resultados indican que el efecto relativo del manejo forestal sobre el ensamble de escarabajos es proporcional a la intensidad de corte. El impacto negativo sobre las comu

# feature engineering

# removing special characters, punctuation, and numbers

In [None]:
%%time

# function to enact regex substitution on a list of strings

def sub_all(regex, corpus_list, replacement=" "):
    """Run ``regex.sub`` over every string of a two-level nested list.

    Parameters
    ----------
    regex : compiled pattern
        Object exposing ``sub(replacement, string)``.
    corpus_list : list of list of str
        Outer list of rows, each row a list of strings.
    replacement : str, default " "
        Text substituted for every match.

    Returns
    -------
    list of list of str
        New nested list with the same shape as ``corpus_list``.
    """
    cleaned_rows = []
    for row in corpus_list:
        cleaned_rows.append([regex.sub(replacement, text) for text in row])
    return cleaned_rows

# defining regular expressions as objects to find unwanted text and symbols in corpus

# parenthesised citation containing a four-digit year, e.g. "(smith et al. 2010)"
re_citation = re.compile(r"\(.[^())]*\d{4}[^())]*\)")
# parenthesised table/figure pointer, e.g. "(fig. 2a)"
re_tabfig = re.compile(r"\(\s?\w{1,7}[.]?\s?\d{1}\w?\s?\)")
# a run of digits plus up to two trailing word characters, e.g. "25" or "3rd"
re_digit_char = re.compile(r"\d+\w{,2}")
# stand-alone words of one or two characters
re_one_two_letter = re.compile(r"\b\w{1,2}\b")
re_new_line = re.compile(r"\n{1,}")
re_tab = re.compile(r"\t{1,}")
# simple opening/closing tags such as "<i>" or "</sub>"
re_html = re.compile(r"</?\w+>")
# anything between "<" and the next ">"
re_alt_html = re.compile(r"<.*?>")
# two or more consecutive whitespace characters
re_spacing = re.compile(r"\s{2,}")
# "fig"/"figs"/"figura(s)" and "cuadro(s)" as whole words only: without the word
# boundaries the patterns also cut letters out of longer words
# (e.g. "configuración" -> "con uración", "figura" -> " ura").
re_fig = re.compile(r"\bfig(?:s|uras?)?\b")
re_table = re.compile(r"\bcuadros?\b")
# every character listed here is mapped to a single space by `translator`
punctuation_text = string.punctuation + "¿±♂♀’”°´“×–…" + "\xad" + "\xa0"
translator = str.maketrans(punctuation_text, " " * len(punctuation_text))

# function to process text

def text_processing(corpus_list):
    """Normalise a nested list of raw text fields.

    Parameters
    ----------
    corpus_list : list of list
        One inner list per document. Fields that are not ``str`` (for
        example missing values) are replaced by an empty string.

    Returns
    -------
    list of list of str
        Same shape as the input. Each field is lower-cased, stripped of
        simple tags, citations, table/figure pointers, digits, one- and
        two-character words and punctuation, with whitespace collapsed.
    """
    output = [
        [col.lower() if isinstance(col, str) else "" for col in row] for row in corpus_list
    ]
    # Tags have to be removed while "<", ">" and "/" are still present: the
    # punctuation translation further down turns those characters into spaces,
    # after which a tag such as "<sub>" would survive as the stray word "sub".
    output = sub_all(re_html, output)
    output = sub_all(re_citation, output)
    output = sub_all(re_tabfig, output)
    output = sub_all(re_fig, output)
    output = sub_all(re_table, output)
    output = sub_all(re_digit_char, output)
    output = sub_all(re_one_two_letter, output)
    output = [[col.translate(translator) for col in row] for row in output]
    output = sub_all(re_new_line, output)
    output = sub_all(re_tab, output)
    # NOTE(review): at this point no "<" or ">" is left, so re_alt_html cannot
    # match. It is kept in its original position on purpose: running "<.*?>"
    # before the translation could delete text between comparison operators
    # (e.g. "p < 0.05 ... > 3"). Confirm whether it is still wanted.
    output = sub_all(re_alt_html, output)
    # trim each field, then collapse any remaining run of whitespace
    output = sub_all(re_spacing, [[word.strip() for word in row] for row in output])

    return output

corpus_clean1 = text_processing(corpus_list)

In [None]:
corpus_clean1[:2]

# lemmatization

In [None]:
%%time

import spacy

nlp = spacy.load('es_core_news_md', disable=['parser', 'ner']) # pre-trained spacy Spanish language object #!python3 -m spacy download es_core_news_md 

def lemmatizer(text):
    """Lemmatise every document with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : iterable of iterable of str
        One entry per document; the strings of an entry are joined with a
        single space before being parsed.

    Returns
    -------
    list of str
        One string per document, made of the space-joined token lemmas.
    """
    joined_docs = (" ".join(fields) for fields in text)
    # nlp.pipe processes the texts in batches and yields the Docs in input
    # order, avoiding the per-call overhead of invoking nlp() once per document.
    return [" ".join(token.lemma_ for token in doc) for doc in nlp.pipe(joined_docs)]

corpus_clean2 = lemmatizer(corpus_clean1)

In [None]:
corpus_clean2[:2]

# removing stopwords 

In [None]:
%%time

from stop_words import get_stop_words

def remove_stopwords(text, stopwords=None):
    """Drop stop words from every document and report two sanity checks.

    Parameters
    ----------
    text : iterable of str
        Documents whose tokens are separated by whitespace.
    stopwords : iterable of str, optional
        Words to remove. Defaults to ``get_stop_words('spanish')``.

    Returns
    -------
    list of str
        One string per input document with the stop words removed. A token
        is dropped when, after stripping non-word characters, it is in the
        stop-word list.

    Notes
    -----
    Prints whether any stop word or any digit-only token is still present
    among the remaining tokens. Numbers are only reported, not removed.
    """
    if stopwords is None:
        stopwords = get_stop_words('spanish')
    # Build the lookup once; a set makes each membership test constant-time.
    stopword_set = set(stopwords)

    corpus_clean = [
        " ".join(
            word
            for word in sentence.split()
            if re.sub(r'\W+', '', word) not in stopword_set
        )
        for sentence in text
    ]

    # The checks look at individual tokens: comparing stop words or integers
    # against whole document strings can never match.
    remaining_tokens = {token for sentence in corpus_clean for token in sentence.split()}

    if remaining_tokens & stopword_set:
        print ('stopwords not excluded from vocabulary')
    else:
        print ('stopwords excluded from vocabulary')
    if any(token.isdigit() for token in remaining_tokens):
        print ('\nnumbers not excluded from vocabulary')
    else:
        print ('\nnumbers excluded from vocabulary')

    return corpus_clean

corpus_clean3 = remove_stopwords(corpus_clean2)

In [None]:
corpus_clean3[:2]

In [None]:
# character length of each example before and after text preprocessing

# "before": total characters across the raw fields of each row of corpus_list.
# Non-string fields (e.g. missing values) have no length and are skipped.
each_example_len_1 = [
    sum(len(field) for field in each_example if isinstance(field, str))
    for each_example in corpus_list
]

# "after": characters in the fully cleaned document string.
each_example_len_2 = [len(each_example) for each_example in corpus_clean3]

lens = pd.DataFrame({"len_before_processing":each_example_len_1,
                    "len_after_processing":each_example_len_2})
lens

In [None]:
# save data to disk - serialise python object to bytes

import pickle

# Serialise the cleaned corpus with the highest pickle protocol available.
with open('corpus_clean.pickle', mode='wb') as pickle_file:
    pickle.dump(corpus_clean3, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

# keyword-based classifier

Here, we build a keyword-based classifier: we identify the most frequent words in the positive instances, under the assumption that these words are predictors of the positive class. We then remove the negative instances whose text contains those words.

In [None]:
# find frequent words in positive instances

# Split the cleaned corpus by class using the encoded label column instead of
# fixed row positions, so the split does not depend on the row order or on the
# number of rows in the file (label 1 = positive, label 0 = negative).
labels = df["label"].tolist()
assert len(labels) == len(corpus_clean3), "labels and cleaned corpus differ in length"

pos = [doc for doc, label in zip(corpus_clean3, labels) if label == 1]
neg = [doc for doc, label in zip(corpus_clean3, labels) if label == 0]
# untouched copy of the negatives, kept before any filtering
neg_complete = neg.copy()

In [None]:
neg[:3]

In [None]:
# function to count frequency of words

from nltk import ngrams

n = 2
def count_word(text):
    """Count how often each n-gram occurs across a collection of documents.

    Every document in ``text`` is split on whitespace and passed to
    ``ngrams`` together with the module-level size ``n``.

    Returns a dict mapping each gram to its number of occurrences; keys
    appear in the order in which the grams were first seen.
    """
    wordcount = {}
    for each_example in text:
        tokens = each_example.split()
        for gram in ngrams(tokens, n):
            wordcount[gram] = wordcount.get(gram, 0) + 1

    return wordcount

# count word frequency and find indices of instances

word_count = count_word(pos)

# most frequent grams first; keep the 50 at the top of the ranking
word_count_sorted = sorted(
    word_count.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
common_gram_tuple = word_count_sorted[:50]
common_gram = [pair[0] for pair in common_gram_tuple]

# turn each gram back into a single space-separated string
top_grams = [" ".join(gram) for gram in common_gram]

print(top_grams)
len(top_grams)

In [None]:
# remove grams that are verbs, places, etc

grams_remove = [
    'poder ser', 'haber ser', 'ser mayor', 'objetivo ser',
    'costa rico', 'robinson crusoe', 'sur méxico',
    'encontrar diferencia', 'diferencia significativo',
    'ser determinar', 'resultado mostrar', 'ser significativamente',
    'ser menor', 'san josé', 'haber pasar', 'presentar mayor',
]

# positions (ascending) of the grams that must be dropped
indices = [position for position, item in enumerate(top_grams) if item in grams_remove]

# delete from the back so the remaining positions stay valid
for indx in reversed(indices):
    del top_grams[indx]

print(indices)
print(len(top_grams))
top_grams

In [None]:
# function to find indices of instances that use frequent words

def find_indices(list1, list2, min_word_count=2):
    """Return the positions of the items of ``list1`` that contain enough grams.

    An item qualifies when at least ``min_word_count`` entries of ``list2``
    occur in it according to the ``in`` operator (substring test for strings).
    Positions are returned in ascending order.
    """
    return [
        position
        for position, document in enumerate(list1)
        if sum(gram in document for gram in list2) >= min_word_count
    ]

# instances of each class that contain at least two of the frequent grams
indices_pos = find_indices(pos, top_grams)
indices_neg = find_indices(neg, top_grams)

# sizes, in order: all negatives, flagged positives, flagged negatives
for size in (len(neg_complete), len(indices_pos), len(indices_neg)):
    print(size)

In [None]:
# plot wordcloud of vocabulary in each corpus

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
# pip install Pillow --> this library is needed to read in image as the mask for the word cloud
from PIL import Image 

# one long string per class, as expected by WordCloud.generate
pos_words = " ".join(pos)
neg_words = " ".join(neg)

# NOTE(review): STOPWORDS is the list bundled with the wordcloud package -
# presumably English; confirm it is the intended list for this corpus.
stopwords = set(STOPWORDS) # excludes stopwords in plot

def plot_wordcloud(text):
    """Draw a word cloud for ``text`` on a new 8x8 matplotlib figure.

    Uses the module-level ``stopwords`` set and shows the figure with the
    axes hidden. Nothing is returned.
    """
    cloud_settings = {
        "width": 800,
        "height": 800,
        "background_color": 'white',
        "stopwords": stopwords,
        "min_font_size": 10,
    }
    wordcloud = WordCloud(**cloud_settings).generate(text)

    # render the generated image without axes or padding
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

# wordcloud.to_file("name.png")

In [None]:
plot_wordcloud(pos_words)

In [None]:
plot_wordcloud(neg_words)

# removing instances from negative corpus that contain frequent words

In [None]:
# Build both lists from the untouched copy `neg_complete` rather than popping
# from `neg` in place: popping shrinks `neg` on every execution, so re-running
# the cell would remove further, unrelated documents.
positions_to_drop = {indx for indx in indices_neg if indx < len(neg_complete)}

# same order as the original pop loop: highest position first
removed_examples = [neg_complete[indx] for indx in sorted(positions_to_drop, reverse=True)]
neg = [doc for indx, doc in enumerate(neg_complete) if indx not in positions_to_drop]

print(len(removed_examples))

print(len(neg_complete))
print(len(neg))

By the end of the pre-processing section we have:

- one corpus of positives with length 62
- two corpora of negatives:
    - neg_complete with length 4957
    - neg with length 3848

In [None]:
import mpu

# write the three corpora to disk, one pickle file each
for file_name, documents in (
    ('neg_short.pickle', neg),
    ('neg_complete.pickle', neg_complete),
    ('pos.pickle', pos),
):
    mpu.io.write(file_name, documents)

----------------------------------------------------------------------------------------------------------