In [1]:
import numpy as np
import pandas as pd
import glob
import re
import unicodedata

# NLTK
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from nltk.util import ngrams

# Joblib
from joblib import Parallel, delayed

# Gensim
from gensim import corpora
from gensim import models
from gensim.summarization import keywords
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Spacy
import spacy
from spacy_spanish_lemmatizer import SpacyCustomLemmatizer

# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Carga de documentos

In [2]:
path_health = "../documents/health"
path_politics = "../documents/politics"
path_sports = "../documents/sports"
path_documents = "../documents"
path_stopwords = "../documents/stopwords.txt"

In [3]:
def load_document(path):
    return open(path,encoding='utf-8').read(), path.split("\\")[-2]

In [4]:
documents = Parallel(n_jobs = -1)(delayed(load_document)(path) for path in glob.glob(path_documents+"/*/*.txt"))
documents = pd.DataFrame(documents, columns=["text", "class"])
documents['text'] = documents['text'].astype('string')

In [5]:
documents.head()

Unnamed: 0,text,class
0,Aceptémoslo de una vez: perder peso de manera ...,health
1,"Sin tiempo para hacer recuento de daños, irrum...",health
2,Mucha gente intenta mostrar en las redes socia...,health
3,Una faceta clave en la frenética lucha global ...,health
4,La curva de contagios de coronavirus se mantie...,health


## Preprocesado

In [6]:
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)|(\⁰)|(\•)|(\\')")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "
    
nlp = spacy.load("es")
lemmatizer = SpacyCustomLemmatizer()

def load_stopwords(path):
    return [line.strip() for line in open(path_stopwords, encoding = "utf-8").readlines()]

STOP_WORDS = set(load_stopwords(path_stopwords))

def delete_stop_words(doc):
    tokens = wordpunct_tokenize(doc)
    clean = [token for token in tokens if token not in STOP_WORDS and len(token) > 2]
    return clean

def preprocess_document(document):
    document = REPLACE_NO_SPACE.sub(NO_SPACE, document.lower())
    document = REPLACE_WITH_SPACE.sub(SPACE, document)
    # tokens = wordpunct_tokenize(document)
    # tokens = delete_proper_nouns(tokens)
    return document

def lemmatize(tokens):
    tokens = nlp(" ".join(tokens))
    return [token.lemma_ for token in tokens]


# TODO: REVISAR ESTO

def delete_proper_nouns(tokens):
    # Tag the tokens with their type - ie are they nouns or not
    lTokens = pos_tag(tokens)
    # find all the proper nouns and print them out
    lTagDict = findtags('NNP', lTokens)
    return [token.lower() for token in tokens if token not in lTagDict]
    
def findtags(tag_prefix, tagged_text):
    """
    Find tokens matching the specified tag_prefix
    """
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                  if tag.startswith(tag_prefix))
    print(cfd.conditions())
    return [list(cfd[tag].keys()) for tag in cfd.conditions()][0]

In [7]:
documents["preprocesado"] = documents["text"].apply(lambda x: preprocess_document(x))
documents["tokens"] = documents["preprocesado"].apply(lambda x: delete_stop_words(x))
# documents["lematizado"] = documents["preprocesado"].apply(lambda x: lemmatize(x))
documents.head()

Unnamed: 0,text,class,preprocesado,tokens
0,Aceptémoslo de una vez: perder peso de manera ...,health,aceptémoslo de una vez perder peso de manera r...,"[aceptémoslo, perder, peso, rápida, indolora, ..."
1,"Sin tiempo para hacer recuento de daños, irrum...",health,sin tiempo para hacer recuento de daños irrump...,"[recuento, daños, irrumpe, ola, virus, golpear..."
2,Mucha gente intenta mostrar en las redes socia...,health,mucha gente intenta mostrar en las redes socia...,"[gente, mostrar, redes, sociales, versión, fot..."
3,Una faceta clave en la frenética lucha global ...,health,una faceta clave en la frenética lucha global ...,"[faceta, clave, frenética, lucha, global, pfiz..."
4,La curva de contagios de coronavirus se mantie...,health,la curva de contagios de coronavirus se mantie...,"[curva, contagios, coronavirus, mantiene, espa..."


In [8]:
# Training
train_health = documents[documents["class"] == "health"].iloc[:15]
train_politics = documents[documents["class"] == "politics"].iloc[:15]
train_sports = documents[documents["class"] == "sports"].iloc[:15]

train_data = pd.concat([train_health, train_politics, train_sports])
print(f"Training data ==> {len(train_data)} documents")

Training data ==> 45 documents


In [9]:
# Testing
test_health = documents[documents["class"] == "health"].iloc[15:]
test_politics = documents[documents["class"] == "politics"].iloc[15:]
test_sports = documents[documents["class"] == "sports"].iloc[15:]

test_data = pd.concat([test_health, test_politics, test_sports])
print(f"Testing data ==> {len(test_data)} documents")

Testing data ==> 105 documents


### Bigramas

In [10]:
def get_bigrams(documents, threshold):
    token_ = [doc.split(" ") for doc in documents]
    bigram = Phrases(token_, min_count=1, threshold=50, delimiter=b' ')
    bigram_phraser = Phraser(bigram)
    bigram_token = []
    for sent in token_:
        for bigram in bigram_phraser[sent]:
            if len(bigram.split(" ")) > 1: # comprobamos que realmente es un bigrama
                bigram_token.append(bigram) 
    return bigram_token
           
def check_bigram(x, bigrams):
    if x.find("jamón serrano") != -1 or x.find("jamón") != -1:
        print(x)
    return [bigram for bigram in bigrams if x.find(bigram) != -1]

In [11]:
bigrams_sports = get_bigrams(train_sports["preprocesado"].values, 10)
bigrams_health = get_bigrams(train_health["preprocesado"].values, 10)
bigrams_politics = get_bigrams(train_politics["preprocesado"].values, 10)
bigrams = get_bigrams(train_data["preprocesado"].values, 50)


train_sports["bigrams"] = train_sports["preprocesado"].apply(lambda x: check_bigram(x, bigrams_sports))
train_health["bigrams"] = train_health["preprocesado"].apply(lambda x: check_bigram(x, bigrams_health))
train_politics["bigrams"] = train_politics["preprocesado"].apply(lambda x: check_bigram(x, bigrams_politics))

train_sports["tokens + bigrams"] = train_sports["tokens"] + train_sports["bigrams"]
train_health["tokens + bigrams"] = train_health["tokens"] + train_health["bigrams"]
train_politics["tokens + bigrams"] = train_politics["tokens"] + train_politics["bigrams"]

train_data["bigrams"] = train_data["preprocesado"].apply(lambda x: check_bigram(x, bigrams))
train_data["tokens + bigrams"] = train_data["tokens"] + train_data["bigrams"]

# Glosario

## Extracción de keywords

### Extracción propia

In [12]:
stopwords_dir = "../documents/stopwords.txt"

In [13]:
def get_k_tfidf_keywords(df, k):
    tokens = df["tokens + bigrams"].values
    dictionary = corpora.Dictionary(tokens)
    bow = [dictionary.doc2bow(doc) for doc in tokens]
    tfidf = models.TfidfModel(bow)
    bow_tfidf = tfidf[bow]
    tfidf_dic = {dictionary.get(id): value for doc in bow_tfidf for id, value in doc}
    tfidf_list = [k for k, v in sorted(tfidf_dic.items(), key=lambda item: item[1], reverse = True)]
    return tfidf_list[:k]

In [14]:
keywords_tfidf_health = get_k_tfidf_keywords(train_health, 100)
keywords_tfidf_politics = get_k_tfidf_keywords(train_politics, 100)
keywords_tfidf_sports = get_k_tfidf_keywords(train_sports, 100)

In [15]:
def remove_duplicates(d1, d2, d3):

    i1 = set(d1) & set(d2)
    i2 = set(d1) & set(d3)
    i3 = set(d2) & set(d3)
    
    deleted = set(list(i1.union(i2).union(i3)))
    
    for key in deleted:
        try:
            d1.remove(key)
        except:
            print(f"D1 no tiene {key}")
        try:
            d2.remove(key)
        except:
            print(f"D2 no tiene {key}")
        try:
            d3.remove(key)
        except:
            print(f"D3 no tiene {key}")
            
    return d1, d2, d3   

In [16]:
keywords_tfidf_health, keywords_tfidf_politics, keywords_tfidf_sports = remove_duplicates(keywords_tfidf_health, keywords_tfidf_politics, keywords_tfidf_sports)

D1 no tiene defensa
D1 no tiene récord


### Gensim

In [17]:
def get_k_gensim_keywords(data, k):
    data = data.copy()
    data["joined"] = data["tokens + bigrams"].apply(lambda x: " ".join(x))
    data['joined'] = data.joined.astype(str)
    data = " ".join(data["joined"].values)
    return [key[0] for key in keywords(data, scores=True, words=k, pos_filter=('NNP', 'JJ', "NNPS", "VB"))]

In [18]:
keywords_gensim_health = get_k_gensim_keywords(train_health, 100)
keywords_gensim_politics = get_k_gensim_keywords(train_politics, 100)
keywords_gensim_sports = get_k_gensim_keywords(train_sports, 100)

keywords_gensim_health, keywords_k_gensim_politics, keywords_k_gensim_sports = remove_duplicates(keywords_gensim_health, keywords_gensim_politics, keywords_gensim_sports)

D3 no tiene persona
D3 no tiene casos
D1 no tiene situacion
D2 no tiene horas
D1 no tiene espana
D3 no tiene dia
D1 no tiene partidos
D1 no tiene real
D3 no tiene trata
D1 no tiene partido
D2 no tiene grupo
D2 no tiene puntos
D1 no tiene presidente
D3 no tiene sin
D3 no tiene pandemia
D2 no tiene punto
D3 no tiene personas


### Kmeans

In [19]:
def get_k_kmeans_keywords(data, k):
    data = data.copy()
    data["joined"] = data["tokens"].apply(lambda x: " ".join(x))
    k_means_data = data["joined"].values
    vectorizer = TfidfVectorizer(ngram_range=(1,2))
    X = vectorizer.fit_transform(k_means_data)
    
    model = KMeans(n_clusters=3, init='k-means++', max_iter=1000, n_init=1, random_state = 5, algorithm="full")
    model.fit(X)
    
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    
    keywords_kmeans_politics = [terms[ind] for ind in order_centroids[0, :k]]
    keywords_kmeans_health = [terms[ind] for ind in order_centroids[1, :k]]
    keywords_kmeans_sports = [terms[ind] for ind in order_centroids[2, :k]]
    
    return keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports

In [20]:
keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports = get_k_kmeans_keywords(train_data, 100)
print(len(keywords_kmeans_politics), len(keywords_kmeans_health), len(keywords_kmeans_sports))
keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports = remove_duplicates(keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports)
print(len(keywords_kmeans_politics), len(keywords_kmeans_health), len(keywords_kmeans_sports))

100 100 100
D1 no tiene forma
D1 no tiene madrid
D3 no tiene sánchez
D3 no tiene presidente
D3 no tiene rey
D3 no tiene real
D3 no tiene españa
D3 no tiene gobierno
D1 no tiene estudio
D3 no tiene partido
D1 no tiene mundo
D3 no tiene pedro sánchez
D3 no tiene pedro
D3 no tiene psoe
D3 no tiene fiscal
D1 no tiene infarto
88 83 94


## Formación del glosario

### Automatizado

In [21]:
def check_relevant_keywords(d1, d2, d3):
    i1 = set(d1) & set(d2)
    i2 = set(d1) & set(d3)
    i3 = set(d2) & set(d3)
    
    deleted = set(list(i1.union(i2).union(i3)))
    return deleted

In [24]:
relevant_keywords_politics = check_relevant_keywords(keywords_kmeans_politics, keywords_gensim_politics, keywords_tfidf_politics)
relevant_keywords_health = check_relevant_keywords(keywords_kmeans_health, keywords_gensim_health, keywords_tfidf_health)
relevant_keywords_sports = check_relevant_keywords(keywords_kmeans_sports, keywords_gensim_sports, keywords_tfidf_sports)

print("Politics ==> ", relevant_keywords_politics)
print("Sports ==> ", relevant_keywords_sports)
print("Health ==> ", relevant_keywords_health)

Politics ==>  {'eutanasia', 'felipe gonzález', 'azurmendi', 'becerril', 'ascen', 'pnv', 'regulación', 'ayuntamiento', 'código', 'vivienda', 'eta', 'informes', 'desahucio', 'marín', 'militares', 'iceta', 'código penal', 'congreso', 'claro', 'comunidades', 'pablo iglesias', 'proyecto', 'jiménez becerril', 'presupuestos generales', 'dsn', 'bildu', 'gonzález', 'presupuestos', 'mal', 'vox', 'armonización fiscal', 'instituciones', 'jueves', 'alberto', 'independentistas', 'texto', 'navidades', 'erc', 'sociedad', 'atención', 'callar', 'casa real', 'penal', 'ministro', 'ciudadanos', 'ley', 'acuerdos', 'notas', 'armonización', 'injurias', 'diputados', 'generales'}
Sports ==>  {'balonmano', 'mercado', 'michael jordan', 'jugador', 'hombre', 'bla', 'liga', 'gasol', 'marc', 'nike', 'temporada', 'pasa', 'campazzo', 'situaciones', 'goles', 'atlético', 'realmente', 'haaland', 'ricky', 'campeones', 'jornet', 'estrella', 'rehabilitación', 'temporadas', 'vaccaro', 'escolta', 'bolt', 'barça', 'duro', 'fran