In [1]:
import numpy as np
import pandas as pd
import glob
import re
import unicodedata

# NLTK
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag

# Joblib
from joblib import Parallel, delayed

# Gensim
from gensim import corpora
from gensim import models

# Spacy
import spacy
from spacy_spanish_lemmatizer import SpacyCustomLemmatizer

# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Carga de documentos

In [22]:
# Corpus locations, relative to the notebook's working directory.
# NOTE(review): path_health, path_politics and path_sports are not referenced
# in the visible cells (only path_documents and path_stopwords are) — confirm
# they are still needed.
path_health = "../documents/health"
path_politics = "../documents/politics"
path_sports = "../documents/sports"
path_documents = "../documents"
path_stopwords = "../documents/stopwords.txt"

In [3]:
def load_document(path):
    """Read one UTF-8 document and derive its class from its parent folder.

    Parameters
    ----------
    path : str
        Path to a text file laid out as ``<...>/<class>/<file>.txt``.

    Returns
    -------
    tuple[str, str]
        ``(text, class_name)`` where ``class_name`` is the name of the
        directory that directly contains the file.
    """
    # Imported locally so the function does not depend on a top-level import.
    import os

    # The context manager closes the handle even if reading fails.
    with open(path, encoding='utf-8') as handle:
        text = handle.read()

    # os.path understands the platform's separators, so the label is found
    # whether the path uses "/" or (on Windows) "\\".
    label = os.path.basename(os.path.dirname(os.path.normpath(path)))
    return text, label

In [4]:
# Load every "<class>/<file>.txt" under path_documents in parallel.
# glob returns matches in arbitrary, filesystem-dependent order; sorting the
# paths keeps the row order — and therefore the positional train/test split
# done later with .iloc — reproducible across runs and machines.
documents = Parallel(n_jobs = -1)(
    delayed(load_document)(path)
    for path in sorted(glob.glob(path_documents+"/*/*.txt"))
)
documents = pd.DataFrame(documents, columns=["text", "class"])
# Use pandas' dedicated string dtype for the raw text column.
documents['text'] = documents['text'].astype('string')

In [5]:
# Preview the first rows to check that the text/class columns were loaded.
documents.head()

Unnamed: 0,text,class
0,Aceptémoslo de una vez: perder peso de manera ...,health
1,"Sin tiempo para hacer recuento de daños, irrum...",health
2,Mucha gente intenta mostrar en las redes socia...,health
3,Una faceta clave en la frenética lucha global ...,health
4,La curva de contagios de coronavirus se mantie...,health


## Preprocesado

In [29]:
# Characters deleted outright: punctuation, quotes, brackets, the "⁰" and "•"
# symbols, and any run of digits. Written as a raw string so the regex escapes
# are not interpreted (and warned about) as Python string escapes.
# Capture groups are not needed because the patterns are only used with .sub().
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]⁰•]|\d+""")
# Sequences replaced by a single space: a doubled HTML line break, "-" and "/".
REPLACE_WITH_SPACE = re.compile(r"<br\s*/><br\s*/>|[-/]")
NO_SPACE = ""
SPACE = " "
    
# spaCy pipeline loaded through the "es" name; `lemmatize` runs text through it.
nlp = spacy.load("es")
# NOTE(review): this lemmatizer instance is never attached to `nlp` nor called
# in the visible cells — confirm whether it should be added to the pipeline.
lemmatizer = SpacyCustomLemmatizer()

def load_stopwords(path):
    """Read a stop-word list stored with one entry per line.

    Parameters
    ----------
    path : str
        Location of a UTF-8 text file.

    Returns
    -------
    list[str]
        The lines of the file, in order, with surrounding whitespace stripped.
    """
    # Read from the `path` argument (not a module-level constant) and let the
    # context manager close the handle.
    with open(path, encoding = "utf-8") as handle:
        return [line.strip() for line in handle]

# Stored as a set so delete_stop_words can do fast membership tests.
STOP_WORDS = set(load_stopwords(path_stopwords))

def delete_stop_words(doc):
    """Tokenize ``doc`` and drop uninformative tokens.

    A token is kept when it is longer than two characters and is not listed
    in STOP_WORDS. Returns the surviving tokens in their original order.
    """
    return [
        word
        for word in wordpunct_tokenize(doc)
        if len(word) > 2 and word not in STOP_WORDS
    ]

def preprocess_document(document):
    """Turn a raw document into a list of cleaned tokens.

    Steps: lower-case the text, delete the characters matched by
    REPLACE_NO_SPACE, turn the matches of REPLACE_WITH_SPACE into spaces, and
    finally tokenize while discarding stop words and very short tokens.
    """
    lowered = document.lower()
    without_punctuation = REPLACE_NO_SPACE.sub(NO_SPACE, lowered)
    normalized = REPLACE_WITH_SPACE.sub(SPACE, without_punctuation)
    # Proper-noun removal (delete_proper_nouns) is currently disabled here.
    return delete_stop_words(normalized)

def lemmatize(tokens):
    """Return the lemma of every token spaCy finds in the joined ``tokens``.

    The tokens are joined with single spaces and re-parsed by the module-level
    ``nlp`` pipeline, so the output length follows spaCy's own tokenization.
    """
    parsed = nlp(" ".join(tokens))
    lemmas = []
    for piece in parsed:
        lemmas.append(piece.lemma_)
    return lemmas


# TODO: REVISAR ESTO

def delete_proper_nouns(tokens):
    """Drop tokens tagged as proper nouns and lower-case the rest.

    Tokens are POS-tagged with NLTK's ``pos_tag``; every token that
    ``findtags`` reports for the 'NNP' prefix is discarded.

    NOTE(review): ``pos_tag`` is called without a language argument — confirm
    the tagger in use is suitable for the Spanish corpus.
    """
    tagged = pos_tag(tokens)
    proper_nouns = findtags('NNP', tagged)
    kept = []
    for token in tokens:
        if token in proper_nouns:
            continue
        kept.append(token.lower())
    return kept
    
def findtags(tag_prefix, tagged_text):
    """
    Find tokens matching the specified tag_prefix

    Parameters
    ----------
    tag_prefix : str
        Prefix a POS tag must start with (e.g. 'NNP' also covers 'NNPS').
    tagged_text : iterable of (word, tag) pairs
        Tagged tokens, e.g. the output of a POS tagger.

    Returns
    -------
    list
        The distinct words whose tag starts with ``tag_prefix``, in order of
        first appearance. Empty when nothing matches.
    """
    # A dict is used as an ordered set: it de-duplicates words while keeping
    # first-seen order, and needs no external frequency-distribution helper.
    matches = {}
    for word, tag in tagged_text:
        if tag.startswith(tag_prefix):
            matches.setdefault(word, None)
    return list(matches)

In [30]:
# Clean and tokenize every document; each row ends up with a list of tokens.
documents["preprocesado"] = documents["text"].apply(preprocess_document)
# Lemmatization (lemmatize) is not applied at this stage:
# documents["lematizado"] = documents["preprocesado"].apply(lambda x: lemmatize(x))
documents.head()

Unnamed: 0,text,class,preprocesado
0,Aceptémoslo de una vez: perder peso de manera ...,health,"[aceptémoslo, vez, perder, peso, manera, rápid..."
1,"Sin tiempo para hacer recuento de daños, irrum...",health,"[tiempo, hacer, recuento, daños, irrumpe, segu..."
2,Mucha gente intenta mostrar en las redes socia...,health,"[mucha, gente, intenta, mostrar, redes, social..."
3,Una faceta clave en la frenética lucha global ...,health,"[faceta, clave, frenética, lucha, global, pfiz..."
4,La curva de contagios de coronavirus se mantie...,health,"[curva, contagios, coronavirus, mantiene, espa..."


In [31]:
# Training
# Training: the first 15 documents of each class.
# .copy() makes each subset an independent frame, so later cells can add
# columns to it without pandas raising SettingWithCopyWarning or the change
# depending on whether the slice is a view of `documents`.
train_health = documents[documents["class"] == "health"].iloc[:15].copy()
train_politics = documents[documents["class"] == "politics"].iloc[:15].copy()
train_sports = documents[documents["class"] == "sports"].iloc[:15].copy()

train_data = pd.concat([train_health, train_politics, train_sports])
print(f"Training data ==> {len(train_data)} documents")

Training data ==> 45 documents


In [32]:
# Testing
# Testing: every document of each class after the first 15 (the first 15 are
# the training portion).
test_health = documents.loc[documents["class"].eq("health")].iloc[15:]
test_politics = documents.loc[documents["class"].eq("politics")].iloc[15:]
test_sports = documents.loc[documents["class"].eq("sports")].iloc[15:]

test_data = pd.concat([test_health, test_politics, test_sports])
print(f"Testing data ==> {len(test_data)} documents")

Testing data ==> 105 documents


# Extracción del glosario

## Extracción propia

In [33]:
# NOTE(review): same value as path_stopwords and not referenced in the visible
# cells — confirm whether this duplicate constant is still needed.
stopwords_dir = "../documents/stopwords.txt"

In [34]:
def get_k_tfidf_keywords(df, k):
    """Return the ``k`` terms with the highest TF-IDF weight found in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "preprocesado" column holding one token list per row.
    k : int
        Number of keywords to return.

    Returns
    -------
    list
        Up to ``k`` terms, best first. A term is ranked by the highest TF-IDF
        weight it reaches in any single document.
    """
    token_lists = df["preprocesado"].values
    dictionary = corpora.Dictionary(token_lists)
    bow = [dictionary.doc2bow(doc) for doc in token_lists]
    tfidf = models.TfidfModel(bow)

    # Keep the maximum weight per term. Simply overwriting would retain the
    # weight from whichever document happens to come last, making the ranking
    # depend on document order.
    best_weight = {}
    for doc in tfidf[bow]:
        for term_id, weight in doc:
            term = dictionary.get(term_id)
            if term not in best_weight or weight > best_weight[term]:
                best_weight[term] = weight

    ranked_terms = sorted(best_weight, key=best_weight.get, reverse=True)
    return ranked_terms[:k]

In [35]:
# Top-30 TF-IDF keywords for each class, computed on its own training subset.
keywords_tfidf_health, keywords_tfidf_politics, keywords_tfidf_sports = (
    get_k_tfidf_keywords(subset, 30)
    for subset in (train_health, train_politics, train_sports)
)

In [54]:
def remove_duplicates(d1, d2, d3):
    """Remove from three keyword lists every key present in more than one.

    The lists are modified in place and also returned, so callers may either
    rely on the mutation or rebind the results.

    Parameters
    ----------
    d1, d2, d3 : list
        Keyword lists (hashable items).

    Returns
    -------
    tuple[list, list, list]
        The same three list objects, with all shared keys removed. Every
        occurrence of a shared key is dropped, even if a list repeats it.

    Side effects
    ------------
    For each shared key, prints one line per list that did not contain it.
    """
    s1, s2, s3 = set(d1), set(d2), set(d3)
    shared = (s1 & s2) | (s1 & s3) | (s2 & s3)

    # Report absences with explicit membership checks rather than catching
    # the exception raised by list.remove.
    for key in shared:
        for label, members in (("D1", s1), ("D2", s2), ("D3", s3)):
            if key not in members:
                print(f"{label} no tiene {key}")

    # Slice assignment filters each list in place, preserving object identity.
    for keywords in (d1, d2, d3):
        keywords[:] = [item for item in keywords if item not in shared]

    return d1, d2, d3

In [55]:
# Keep only the keywords that are exclusive to a single class.
(
    keywords_tfidf_health,
    keywords_tfidf_politics,
    keywords_tfidf_sports,
) = remove_duplicates(
    keywords_tfidf_health,
    keywords_tfidf_politics,
    keywords_tfidf_sports,
)

## Python RAKE

In [58]:
import RAKE

In [87]:
def sort_tuple(tup):
    """Sort ``tup`` in place by each item's second element and return it.

    The order is ascending, so the largest second elements end up last.
    """
    def second_field(item):
        return item[1]

    tup.sort(key=second_field)
    return tup

# Build the python-rake extractor from the stop-word file.
rake = RAKE.Rake(path_stopwords)
# Re-join the cleaned tokens of each training health document into one string.
# NOTE(review): assigning a column to a frame obtained by slicing `documents`
# can trigger pandas' SettingWithCopyWarning — confirm train_health is an
# independent copy.
train_health["joined"] = train_health["preprocesado"].apply(lambda x: " ".join(x))
train_health['joined'] = train_health.joined.astype(str)
# One corpus string for the whole class; `data` is reused by the Gensim cell.
data = " ".join(train_health["joined"].values)
print(type(data))
# sort_tuple orders ascending by the second element, so the last 10 entries are
# the highest-ranked ones (assumes rake.run returns (phrase, score) pairs —
# TODO confirm).
# NOTE(review): the recorded output is an empty list; presumably the text has
# already lost the stop words/punctuation RAKE uses as phrase delimiters —
# confirm whether the raw text should be used instead.
sort_tuple(rake.run(data))[-10:]

<class 'str'>


[]

## RAKE Nltk

In [96]:
from rake_nltk import Rake

# rake_nltk extractor fed with the project's stop-word list.
# max_length=1 presumably restricts candidates to single words — confirm
# against the rake_nltk documentation.
r = Rake(list(STOP_WORDS), max_length=1)

# Each joined training document is passed as one "sentence". This relies on
# the "joined" column created on train_health in the python-RAKE cell.
r.extract_keywords_from_sentences(train_health["joined"].values)
# NOTE(review): the recorded output is an empty list — confirm the input and
# parameters produce any candidate phrases.
r.get_ranked_phrases()[0:10]

[]

## Gensim

In [104]:
# NOTE(review): the gensim.summarization module was removed in Gensim 4.0 —
# confirm the pinned Gensim version still provides it.
from gensim.summarization import keywords
# Print up to 30 scored keywords extracted from `data` (the joined training
# health corpus built in the python-RAKE cell).
# NOTE(review): pos_filter lists English Penn-Treebank-style tags — confirm
# they filter as intended on Spanish text.
print(keywords(data, scores=True, words=30, pos_filter=('NNP', 'JJ', "NNPS", "VB")))

[('puede', 0.19594940686009282), ('puedes', 0.19594940686009282), ('dia', 0.17008555228542271), ('dias', 0.17008555228542271), ('enfermedad', 0.16558211421752667), ('enfermedades', 0.16558211421752667), ('solo', 0.152695702636688), ('solos', 0.152695702636688), ('personas', 0.14003570409239718), ('persona', 0.14003570409239718), ('caso', 0.13349165653363246), ('casos', 0.13349165653363246), ('ser', 0.12216997285059364), ('estudio', 0.11202040604382715), ('estudios', 0.11202040604382715), ('forma', 0.11181865814141997), ('formas', 0.11181865814141997), ('comidas', 0.10754844009392951), ('comida', 0.10754844009392951), ('salud', 0.09408628758123108), ('saludable', 0.09408628758123108), ('saludables', 0.09408628758123108), ('asi', 0.09401564774487135), ('hora', 0.09375266330875939), ('horas', 0.09375266330875939), ('pueden', 0.09040105628916605), ('tiempo', 0.09018828300330411), ('tiempos', 0.09018828300330411), ('menos', 0.08643507842851535), ('ano', 0.08482748300054685), ('anos', 0.0848

## Kmeans

In [56]:
def get_k_kmeans_keywords(data, k):
    """Cluster the documents into three groups and return each group's top terms.

    The token lists in ``data["preprocesado"]`` are joined into strings,
    vectorized with TF-IDF and clustered with KMeans (3 clusters, fixed
    random_state). For every cluster, the ``k`` vocabulary terms with the
    largest centroid weight are returned. ``data`` itself is not modified.

    Returns
    -------
    tuple[list, list, list]
        Top terms of cluster 0, cluster 1 and cluster 2, in that order.

    NOTE(review): callers unpack the result as (politics, health, sports), but
    nothing here ties a cluster index to a topic — confirm that mapping holds
    for the data and random_state in use.
    NOTE(review): ``algorithm="full"`` and ``get_feature_names()`` may not be
    accepted by recent scikit-learn releases — confirm the pinned version.
    """
    corpus = data["preprocesado"].apply(" ".join).values

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    kmeans = KMeans(
        n_clusters=3,
        init='k-means++',
        max_iter=1000,
        n_init=1,
        random_state=5,
        algorithm="full",
    )
    kmeans.fit(tfidf_matrix)

    vocabulary = vectorizer.get_feature_names()
    # Column indices of each centroid, sorted from heaviest to lightest term.
    ranked_columns = kmeans.cluster_centers_.argsort()[:, ::-1]

    def top_terms(cluster):
        return [vocabulary[column] for column in ranked_columns[cluster, :k]]

    return top_terms(0), top_terms(1), top_terms(2)

In [57]:
# Top-30 terms per cluster, then drop any term shared between clusters.
# The list sizes are printed before and after the de-duplication.
(
    keywords_kmeans_politics,
    keywords_kmeans_health,
    keywords_kmeans_sports,
) = get_k_kmeans_keywords(train_data, 30)
print(*map(len, (keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports)))
(
    keywords_kmeans_politics,
    keywords_kmeans_health,
    keywords_kmeans_sports,
) = remove_duplicates(
    keywords_kmeans_politics,
    keywords_kmeans_health,
    keywords_kmeans_sports,
)
print(*map(len, (keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports)))

30 30 30
D3 no tiene sánchez
D1 no tiene madrid
D1 no tiene puede
D1 no tiene ser
D3 no tiene gobierno
D3 no tiene españa
27 24 27
