In [None]:
# November 2022
# Feature engineering
# Violeta Berdejo-Espinola

In [None]:
%pip install numpy pandas spacy mpu spacy stop-words

In [None]:
import pandas as pd
import re
import string

In [None]:
# load the labelled corpus from a UTF-8 encoded CSV
df = pd.read_csv('../data/pos_neg.csv', encoding='utf-8')

# sanity checks (report only, nothing is dropped):
# True if any value of title_spa occurs more than once
print(f'duplicates: {df.title_spa.duplicated().any()}')
# number of rows whose label is missing
print(f'nas: {df["label"].isna().sum()}')

In [None]:
# x data

# one list per row, one entry per column; .loc label slices include both end
# columns, so the short corpus spans title_spa..abstract_spa and the long one
# title_spa..main_text_spa (which columns lie in between depends on the CSV's
# column order — TODO confirm)
corpus_list = df.loc[:,"title_spa":"abstract_spa"].values.tolist()
corpus_list_long = df.loc[:,"title_spa":"main_text_spa"].values.tolist()

# class balance: number of rows per distinct label value
print(f'instances per\n{df["label"].value_counts()}')

# feature engineering

# removing special characters, punctuation, and numbers

In [None]:
# function to apply a regex substitution to every string in a list of lists of strings

def sub_all(regex, corpus_list, replacement=" "):
    """Replace every match of `regex` in each cell of a nested list.

    Parameters
    ----------
    regex : compiled pattern whose ``sub`` method is applied to each cell
    corpus_list : list of rows, each row being a list of strings
    replacement : text substituted for each match (a single space by default)

    Returns
    -------
    A new list of lists with the same shape as `corpus_list`; the input is
    not modified.
    """
    substituted = []
    for row in corpus_list:
        substituted.append([regex.sub(replacement, cell) for cell in row])
    return substituted

# defining regular expressions as objects to find unwanted text and symbols in corpus

# parenthesised span that contains four consecutive digits, e.g. "(smith et al. 2020)"
re_citation = re.compile(r"\(.[^())]*\d{4}[^())]*\)")
# parenthesised short reference: up to 7 word characters, optional ".", one digit
# and an optional trailing word character, e.g. "(fig. 2a)"
re_tabfig = re.compile(r"\(\s?\w{1,7}[.]?\s?\d{1}\w?\s?\)")
# a run of digits plus up to two following word characters, e.g. "15cm"
re_digit_char = re.compile(r"\d+\w{,2}")
# whole words that are only one or two word characters long
re_one_two_letter = re.compile(r"\b\w{1,2}\b")
# one or more consecutive newlines
re_new_line = re.compile(r"\n{1,}")
# one or more consecutive tabs
re_tab = re.compile(r"\t{1,}")
# attribute-less opening/closing tags such as "<i>" or "</sub>"
re_html = re.compile(r"</?\w+>")
# shortest span from "<" to the next ">" (does not cross newlines)
re_alt_html = re.compile(r"<.*?>")
# two or more consecutive whitespace characters
re_spacing = re.compile(r"\s{2,}")
# the literal "fig"; there is no word boundary, so it also matches inside longer words
re_fig = re.compile(r"(fig)")
# the literal "cuadro"; likewise matches inside longer words
re_table = re.compile(r"(cuadro)")
# ASCII punctuation plus extra symbols, the soft hyphen (\xad) and the non-breaking space (\xa0)
punctuation_text = string.punctuation + "¿±♂♀’”°´“×–…" + "\xad" + "\xa0"
# translation table mapping each of those characters to a single space
translator = str.maketrans(punctuation_text, " " * len(punctuation_text))

# function to process text and output 'clean corpus'

def text_processing(corpus_list):
    """Build the 'clean corpus' from raw rows of text columns.

    Parameters
    ----------
    corpus_list : list of rows, each row a list of cells; cells that are not
        strings (e.g. NaN for a missing field) are turned into "".

    Returns
    -------
    A list of lists of the same shape holding lower-cased text with markup,
    parenthesised citations and figure/table references, digits, one- and
    two-character words and punctuation removed, and whitespace collapsed.
    """
    output = [
        [col.lower() if isinstance(col, str) else "" for col in row]
        for row in corpus_list
    ]

    # Markup has to be stripped before anything else: the punctuation
    # translation below turns "<", ">" and "/" into spaces, after which the
    # tag patterns can no longer match and tag names (e.g. "sub") would stay
    # in the corpus as ordinary words.
    # NOTE: re_alt_html also removes text between a literal "<" and a later
    # ">" on the same line, exactly as text_processing_raw does.
    for pattern in (re_html, re_alt_html):
        output = sub_all(pattern, output)

    # These patterns rely on parentheses/dots still being present, so they
    # run before punctuation is translated away.
    for pattern in (
        re_citation,
        re_tabfig,
        re_fig,
        re_table,
        re_digit_char,
        re_one_two_letter,
    ):
        output = sub_all(pattern, output)

    # every punctuation/special character becomes a single space
    output = [[col.translate(translator) for col in row] for row in output]

    for pattern in (re_new_line, re_tab):
        output = sub_all(pattern, output)

    # trim each cell, then collapse the whitespace runs left by the substitutions
    output = sub_all(re_spacing, [[col.strip() for col in row] for row in output])

    return output

# function to process text and output 'raw corpus'

def text_processing_raw(text):
    """Build the 'raw corpus': the input rows with only markup removed.

    `text` is a list of rows, each a list of strings. Simple tags are replaced
    first, then any remaining "<...>" span; every match becomes a single space.
    Returns a new list of lists of the same shape.
    """
    without_simple_tags = sub_all(re_html, text)
    return sub_all(re_alt_html, without_simple_tags)

# run

# fully cleaned versions of the short and the long corpus (in that order)
corpus_clean, corpus_clean_long = (
    text_processing(columns) for columns in (corpus_list, corpus_list_long)
)

# markup-free but otherwise untouched versions of the same two corpora
corpus_clean_raw, corpus_clean_raw_long = (
    text_processing_raw(columns) for columns in (corpus_list, corpus_list_long)
)

# lemmatization

In [None]:
import spacy

MODEL = 'es_core_news_md'
# pipeline components switched off when loading the model
DISABLED_COMPONENTS = ['parser', 'ner']

# Download the model only when it is missing: spacy.load raises OSError if the
# package cannot be found. Downloading unconditionally re-runs the installer on
# every execution and makes the notebook fail without network access.
try:
    nlp = spacy.load(MODEL, disable=DISABLED_COMPONENTS)
except OSError:
    spacy.cli.download(MODEL)
    nlp = spacy.load(MODEL, disable=DISABLED_COMPONENTS)

def lemmatizer(text):
    """Lemmatise each example and flatten it to one string.

    `text` is a list of rows, each a list of strings. The cells of a row are
    joined with single spaces, passed through the `nlp` pipeline, and the
    lemma of every token is joined with single spaces again.

    Returns a list with one lemmatised string per row.
    """
    def lemmatize_row(columns):
        doc = nlp(" ".join(columns))
        return " ".join(token.lemma_ for token in doc)

    return [lemmatize_row(row) for row in text]

# Rebinds both names from lists of cleaned column strings to lists with one
# lemmatised string per example. Not idempotent: running this again without
# re-running the cleaning cell would feed plain strings to lemmatizer, which
# would then join their individual characters with spaces.
corpus_clean, corpus_clean_long = (
    lemmatizer(corpus_clean),
    lemmatizer(corpus_clean_long),
)

# removing stopwords 

In [None]:
from stop_words import get_stop_words

def remove_stopwords(text):
    """Drop Spanish stop words from every document and report two sanity checks.

    Parameters
    ----------
    text : list of strings, one whitespace-tokenisable document per entry.

    Returns
    -------
    A list of strings of the same length in which every word whose form with
    non-word characters stripped is a Spanish stop word has been removed.

    Side effects
    ------------
    Prints whether any stop word, and whether any all-digit token, is still
    present in the vocabulary of the filtered documents.
    """
    # Fetch the list once and use a set: the lookup runs for every word.
    stopwords = frozenset(get_stop_words('spanish'))
    non_word = re.compile(r'\W+')

    filtered = [
        " ".join(
            word for word in sentence.split()
            if non_word.sub('', word) not in stopwords
        )
        for sentence in text
    ]

    # The checks must look at tokens; comparing stop words or integers with
    # whole document strings can never match.
    vocabulary = {word for sentence in filtered for word in sentence.split()}

    if vocabulary & stopwords:
        print ('stopwords not excluded from vocabulary')
    else:
        print ('stopwords excluded from vocabulary')
    if any(word.isdigit() for word in vocabulary):
        print ('\nnumbers not excluded from vocabulary')
    else:
        print ('\nnumbers excluded from vocabulary')

    return filtered

# strip stop words from the short corpus first, then from the long one
# (each call prints its own sanity-check messages)
corpus_clean, corpus_clean_long = map(
    remove_stopwords, (corpus_clean, corpus_clean_long)
)

In [None]:
# character length of each example before and after text preprocessing

# Characters per example before processing: the lengths of all columns of the
# row added together. Cells that are not strings (e.g. NaN for a missing
# field) count as zero; calling len() on them would raise TypeError.
each_example_len_1 = [
    sum(len(col) for col in each_example if isinstance(col, str))
    for each_example in corpus_list
]

# Characters per example after processing: each example is a single string.
each_example_len_2 = [len(each_example) for each_example in corpus_clean]

lens = pd.DataFrame({"len_before_processing": each_example_len_1,
                     "len_after_processing": each_example_len_2})

# NOTE(review): the file name mentions "word_length" and "longcorpus", but the
# values are character counts of the short corpus (corpus_list / corpus_clean)
# — confirm which was intended before renaming anything.
lens.to_csv('../results/preprocessing/2.diff_word_length_after_feat_eng_longcorpus.csv')

# only the last expression of a cell is rendered, so the preview goes here
lens.head()

In [None]:
# creating lists of pos and neg instances

# NOTE(review): the split is purely positional — the first 62 cleaned examples
# go to `pos`, examples 62..5019 to `neg`. This presumably relies on the CSV
# listing all positive rows first and holding 5020 rows in total; verify
# against df["label"], since any example past index 5019 is silently left out.
pos = corpus_clean[0:62]
neg = corpus_clean[62:5020]

In [None]:
# save data to disk - serialise python object to bytes

import mpu

# (destination, object) pairs, written in this order
_pickle_targets = (
    ('../data/neg.pickle', neg),
    ('../data/pos.pickle', pos),
    ("../data/corpus_clean.pickle", corpus_clean),
    ("../data/corpus_clean_long.pickle", corpus_clean_long),
    ("../data/corpus_raw_long.pickle", corpus_clean_raw_long),
    ("../data/corpus_raw.pickle", corpus_clean_raw),
)

for _pickle_path, _payload in _pickle_targets:
    mpu.io.write(_pickle_path, _payload)

----------------------------------------------------------------------------------------------------------