In [None]:
from collections import defaultdict
import json
import os
from string import punctuation

from gensim.summarization import keywords, summarize
import matplotlib.pyplot as plt
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud
from src.preprocessing import CorpusPreprocess
from src import PROJECT_ROOT

In [None]:
def get_top_n_ngrams(matrix, vocab, ngram, n=20):
    """
    Get the top n-grams of a given order from a document-term matrix.

    Args:
        matrix: 2-D document-term matrix (one column per term) supporting
            column selection with a list of indices and ``sum(axis=0)``.
        vocab: Mapping term -> column index in ``matrix``. It has to be the
            vocabulary that belongs to ``matrix``, otherwise the column
            indices point at the wrong terms.
        ngram: Number of whitespace-separated tokens a term must have to be kept.
        n: Maximum number of n-grams to return. Values <= 0 return an empty result.

    Returns:
        A ``defaultdict(int)`` mapping n-gram -> column total, filled in
        descending order of total. It is empty when ``n <= 0`` or when no
        term in ``vocab`` has ``ngram`` tokens.
    """
    top_ngrams = defaultdict(int)
    # `counts.argsort()[-0:]` would select *every* column, so treat n <= 0 explicitly
    if n <= 0:
        return top_ngrams
    # Keep only the terms with exactly `ngram` tokens, remembering their columns;
    # terms[i] and columns[i] stay aligned
    terms, columns = [], []
    for term, column in vocab.items():
        if len(term.split()) == ngram:
            terms.append(term)
            columns.append(column)
    if not terms:
        return top_ngrams
    # Column totals restricted to the selected n-grams: counts[i] belongs to terms[i]
    counts = np.asarray(matrix[:, columns].sum(axis=0)).flatten()
    # argsort is ascending: take the last n positions and walk them backwards
    for i in reversed(counts.argsort()[-n:]):
        top_ngrams[terms[i]] = counts[i]
    return top_ngrams

# Reading the data

In [None]:
# Reading data
INPUT_PATH = os.path.join(PROJECT_ROOT, "tasks", "extract_text", "output")
# Be explicit about the encoding so the accented Spanish text does not depend
# on the platform's default (JSON interchange files are UTF-8)
with open(os.path.join(INPUT_PATH, "pdf_files.json"), encoding="utf-8") as json_file:
    data = json.load(json_file)

# One row per entry of the JSON object: the key is used as the file name and
# each value provides the "Country" and "Text" fields
df = pd.DataFrame(
    {
        "filename": list(data.keys()),
        "country": [doc["Country"] for doc in data.values()],
        "text": [doc["Text"] for doc in data.values()],
    }
)

In [None]:
# Word count per document: number of pieces obtained by splitting the text on
# single spaces (so an empty text yields a count of 1)
df['word_count'] = [len(str(text).split(" ")) for text in df['text']]
df.count()

In [None]:
# Documents with a word count of 1 are treated as having no text: list them, then drop them
no_text = df['word_count'] == 1
rmv = df.index[no_text].tolist()
print(df.loc[rmv, 'filename'])
df = df.drop(index=rmv).reset_index(drop=True)
df.count()

In [None]:
# Exclude the documents whose text was badly read
bad_docs = ["CreditoGanadero_Mexico", "Ley Especial Cafe_ElSalvador", "Sembrando Vida Report"]
is_bad = df['filename'].isin(bad_docs)
df = df.loc[~is_bad].reset_index(drop=True)
df.count()

In [None]:
# Preview the first rows of the filtered corpus
df.head()

In [None]:
# Non-null count per column after both filtering steps
df.count()

# Preprocessing the data

#### Experiment: using a stanza pipeline. It turned out that lemmatization is not necessary for now.

In [None]:
# import stanza 

# nlp = stanza.Pipeline(lang='es', processors='tokenize,mwt,pos,lemma')
# lemmatize_pipeline = stanza.Pipeline(lang='es', processors='tokenize, lemma')

# def lemmatize_text(text):
#     lemmatized_text = lemmatize_pipeline(text)
#     return " ".join([word.lemma for sentence in lemmatized_text.sentences for word in sentence.words])

# df["pre_pretext"] = df["pre_pretext"].apply(lambda x: lemmatize_text(x))

#### Combine the common stopwords with words that we know are frequent, such as month and weekday names

In [None]:
# Start from NLTK's Spanish stop words and add corpus-specific noise:
# legal boilerplate plus month and weekday names.
spa_stopwords = set(stopwords.words('spanish'))
# Accented words are listed with and without the accent so they are filtered
# whether stop-word removal runs before or after accent stripping (the
# preprocessor is configured with strip_accents=True). Both spellings of
# September ("setiembre" / "septiembre") are included.
extra_stopwords = {
    "ley", "artículo", "articulo", "ser", "así", "asi", "según", "segun", "nº", "diario",
    "enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto",
    "setiembre", "septiembre", "octubre", "noviembre", "diciembre",
    "lunes", "martes", "miércoles", "miercoles", "jueves", "viernes",
    "sábado", "sabado", "domingo",
}
spa_stopwords = spa_stopwords.union(extra_stopwords)

In [None]:
# Document-level cleaning configuration. CorpusPreprocess is a project class
# (src.preprocessing), so the exact meaning of each option is defined there.
prep = CorpusPreprocess(
    language='spanish', 
    stop_words=spa_stopwords,
    lowercase=True,
    strip_accents=True,
    strip_numbers=True,
    punctuation_list=punctuation,
    strip_urls=True,
    # Stemming is left disabled:
#     stemmer=SnowballStemmer('spanish'), 
    # presumably document-frequency cut-offs like scikit-learn's max_df/min_df — TODO confirm
    max_df=0.9, 
    min_df=2
)

In [None]:
# Fit the preprocessor on the raw texts and store its output next to them.
# tokenize=False presumably returns one cleaned string per document (the
# vectorizers further down are fed this column directly) — TODO confirm
df['prep_text'] = prep.fit_transform(df['text'], tokenize=False)

In [None]:
# Compare raw and preprocessed text for the first documents
df.head()

# Word count for each document

In [None]:
# Box plot of the per-document word counts
df['word_count'].plot.box()
plt.show()

In [None]:
# Summary statistics of the per-document word counts
df['word_count'].describe()

Should we weight each document by its length? Otherwise long documents dominate the raw counts, and the top keywords will not represent every document equally.

# Bag-of-Words

In [None]:
# Count Vectorizer: raw counts of 1- to 7-grams, vocabulary capped at 20,000 terms
cv = CountVectorizer(max_features=20000, ngram_range=(1,7))
bow_X = cv.fit_transform(df['prep_text'])

In [None]:
# 20 unigrams with the highest total count across the corpus
top_unigrams = get_top_n_ngrams(bow_X, cv.vocabulary_, ngram=1, n=20)

plt.bar(list(top_unigrams.keys()), list(top_unigrams.values()))
plt.title('Top 20 unigrams')
plt.ylabel('freq')
plt.xticks(rotation=90)
plt.show()

In [None]:
# 20 bigrams with the highest total count across the corpus
top_bigrams = get_top_n_ngrams(bow_X, cv.vocabulary_, ngram=2, n=20)

plt.bar(list(top_bigrams.keys()), list(top_bigrams.values()))
plt.title('Top 20 bigrams')
plt.ylabel('freq')
plt.xticks(rotation=90)
plt.show()

In [None]:
# 20 trigrams with the highest total count across the corpus
top_trigrams = get_top_n_ngrams(bow_X, cv.vocabulary_, ngram=3, n=20)

plt.bar(list(top_trigrams.keys()), list(top_trigrams.values()))
plt.title('Top 20 trigrams')
plt.ylabel('freq')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Inspect the exact values behind the trigram plot
top_trigrams

## What if we want to normalize by word counts?

In [None]:
# Divide each document's row by the sum of that row, so every document
# contributes relative frequencies instead of raw counts.
# NOTE(review): a document with no term in the vocabulary has a zero row sum
# and would produce NaN/inf here — confirm none exist.
# NOTE(review): dividing a sparse matrix by a dense column presumably returns
# a dense matrix — check memory use on larger corpora.
bow_X_norm = bow_X / bow_X.sum(axis=1)

In [None]:
# 20 unigrams with the highest total of per-document normalized counts
top_unigrams = get_top_n_ngrams(bow_X_norm, cv.vocabulary_, ngram=1, n=20)

plt.bar(list(top_unigrams.keys()), list(top_unigrams.values()))
plt.title('Top 20 unigrams')
plt.ylabel('freq')
plt.xticks(rotation=90)
plt.show()

In [None]:
# 20 bigrams with the highest total of per-document normalized counts
top_bigrams = get_top_n_ngrams(bow_X_norm, cv.vocabulary_, ngram=2, n=20)

plt.bar(list(top_bigrams.keys()), list(top_bigrams.values()))
plt.title('Top 20 bigrams')
plt.ylabel('freq')
plt.xticks(rotation=90)
plt.show()

In [None]:
# 20 trigrams with the highest total of per-document normalized counts
top_trigrams = get_top_n_ngrams(bow_X_norm, cv.vocabulary_, ngram=3, n=20)

plt.bar(list(top_trigrams.keys()), list(top_trigrams.values()))
plt.title('Top 20 trigrams')
plt.ylabel('freq')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Inspect the exact values behind the normalized trigram plot
top_trigrams

# TF-IDF

In [None]:
# TF-IDF Vectorizer: 1- to 3-grams, vocabulary capped at 20,000 terms
tv = TfidfVectorizer(max_features=20000, ngram_range=(1,3))
tfidf_X = tv.fit_transform(df['prep_text'])

In [None]:
# 20 unigrams with the highest summed TF-IDF weight across the corpus
top_unigrams = get_top_n_ngrams(tfidf_X, tv.vocabulary_, ngram=1, n=20)

plt.bar(list(top_unigrams.keys()), list(top_unigrams.values()))
plt.title('Top 20 unigrams')
plt.ylabel('freq')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Get top bi-grams by summed TF-IDF weight.
# The vocabulary must come from `tv`, the vectorizer that produced `tfidf_X`:
# `cv` was fitted with a different n-gram range, so its column indices do not
# correspond to the columns of this matrix.
top_bigrams = get_top_n_ngrams(tfidf_X, tv.vocabulary_, 2, 20)

plt.bar(top_bigrams.keys(), top_bigrams.values())
plt.xticks(rotation=90)
plt.ylabel('freq')
plt.title('Top 20 bigrams')
plt.show()

In [None]:
# Get top tri-grams by summed TF-IDF weight.
# The vocabulary must come from `tv`, the vectorizer that produced `tfidf_X`:
# `cv` was fitted with a different n-gram range, so its column indices do not
# correspond to the columns of this matrix.
top_trigrams = get_top_n_ngrams(tfidf_X, tv.vocabulary_, 3, 20)

plt.bar(top_trigrams.keys(), top_trigrams.values())
plt.xticks(rotation=90)
plt.ylabel('freq')
plt.title('Top 20 trigrams')
plt.show()

# Can we see keywords for a single document?

In [None]:
# Show the beginning of one document, then its top 10 unigrams under each
# representation (row `doc_ix` of both matrices belongs to the same document)
doc_ix = 40
print(df.loc[doc_ix, "text"][:1000],"...")

for label, doc_term, term_index in (
    ("bow", bow_X, cv.vocabulary_),
    ("tfidf", tfidf_X, tv.vocabulary_),
):
    print(f'\nGet top uni-grams {label}:')
    for k, v in get_top_n_ngrams(doc_term[doc_ix], term_index, 1, 10).items():
        print(f"\"{k}\" count: {round(v,3)}")

# Word cloud

## BOW

In [None]:
# Order the terms by column index so they line up with the column totals
sorted_vocab = dict(sorted(cv.vocabulary_.items(), key=lambda pair: pair[1]))
frequencies = np.asarray(bow_X.sum(axis=0)).ravel()
word_freq = dict(zip(sorted_vocab, frequencies))

# Word sizes follow the raw corpus-wide counts; fixed seed keeps the layout stable
wordcloud = WordCloud(
    background_color='white', max_words=100, max_font_size=50, random_state=42
).generate_from_frequencies(word_freq)

fig = plt.figure(figsize=(13, 13))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# To export the figure: fig.savefig("word1.png", dpi=900)

## BOW normalized

In [None]:
# Order the terms by column index so they line up with the column totals
sorted_vocab = dict(sorted(cv.vocabulary_.items(), key=lambda pair: pair[1]))
frequencies = np.asarray(bow_X_norm.sum(axis=0)).ravel()
word_freq = dict(zip(sorted_vocab, frequencies))

# Word sizes follow the summed per-document normalized counts
wordcloud = WordCloud(
    background_color='white', max_words=100, max_font_size=50, random_state=42
).generate_from_frequencies(word_freq)

fig = plt.figure(figsize=(13, 13))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# To export the figure: fig.savefig("word1.png", dpi=900)

## TF-IDF

In [None]:
# Order the terms by column index so they line up with the column totals
sorted_vocab = dict(sorted(tv.vocabulary_.items(), key=lambda pair: pair[1]))
frequencies = np.asarray(tfidf_X.sum(axis=0)).ravel()
word_freq = dict(zip(sorted_vocab, frequencies))

# Word sizes follow the summed TF-IDF weights
wordcloud = WordCloud(
    background_color='white', max_words=100, max_font_size=50, random_state=42
).generate_from_frequencies(word_freq)

fig = plt.figure(figsize=(13, 13))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# To export the figure: fig.savefig("word1.png", dpi=900)

# Keyword extraction algorithms

## Preprocessing (keep sentence structure)

In [None]:
# Split every raw document into sentences; explode() then gives one row per
# sentence, each keeping the index label of its source document
sentences = df['text'].map(lambda doc: sent_tokenize(doc, language='spanish')).explode()
sentences

In [None]:
# Distribution of the number of whitespace-separated tokens per sentence
sentences.str.split().map(len).describe()

In [None]:
# Sentence-level cleaning: same idea as the document-level preprocessor, but
# fitted on individual sentences. Note that this rebinds the name `prep`.
# NOTE(review): the earlier configuration passes `punctuation_list=punctuation`
# and `strip_urls=True`; here the keyword is `strip_punctuation=` and
# `strip_urls` is absent — confirm against CorpusPreprocess which one is intended.
prep = CorpusPreprocess(
    language='spanish', 
    stop_words=spa_stopwords,
    lowercase=True,
    strip_accents=True,
    strip_numbers=True,
    strip_punctuation=punctuation,
    # Stemming is left disabled:
#   stemmer=SnowballStemmer('spanish'), 
    max_df=0.9, 
    min_df=2
)

# Re-attach the original index so each cleaned sentence keeps the label of the
# document it came from
sentences_prep = pd.Series(prep.fit_transform(sentences, tokenize=False), index=sentences.index)

In [None]:
# Inspect the preprocessed sentences (index label = source document)
sentences_prep

# Rake and TextRank

In [None]:
# Run the three extractors on the sentences of one document. The loop stops
# after the first document (see the `break`), so this is a single-document demo.
# NOTE(review): RAKE and TextRank are fed the preprocessed sentences; if `prep`
# already removed stop words and punctuation, phrase/sentence boundaries may be
# lost — consider comparing against the raw `sentences`.
# NOTE(review): `gensim.summarization` is reportedly unavailable in gensim >= 4.0
# — confirm the pinned gensim version.
for ix in sentences_prep.index.unique():
    # `.loc[[ix]]` always yields a Series, even for a document with a single
    # sentence; plain `sentences_prep[ix]` would return a bare string in that
    # case and the joins below would then run over its characters.
    doc_sentences = sentences_prep.loc[[ix]].tolist()

    # RAKE
    rake = Rake(language="spanish")
    rake.extract_keywords_from_sentences(doc_sentences)
    rake_out = rake.get_ranked_phrases()
    print("\nRAKE OUTPUT:\n> ", "\n> ".join(rake_out[:10]))

    # TextRankV1 (keyword extraction)
    textrankv1_out = keywords(" ".join(doc_sentences), split=True)
    print("\nTEXTRANKV1 OUTPUT:\n> ", "\n> ".join(textrankv1_out[:10]))

    # TextRankV2 (extractive summary; sentences are re-joined with ". ")
    textrankv2_out = summarize(". ".join(doc_sentences), split=True)
    print("\nTEXTRANKV2 OUTPUT:\n> ", "\n> ".join(textrankv2_out[:10]))
    break

# Look into
pke, a Python keyphrase extraction toolkit: https://boudinfl.github.io/pke/build/html/index.html