In [1]:
import numpy as np
import pandas as pd
import glob
import re
import unicodedata

# NLTK
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from nltk.util import ngrams

# Joblib
from joblib import Parallel, delayed

# Gensim
from gensim import corpora
from gensim import models
from gensim.summarization import keywords
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.similarities import Similarity

# Operatos
from operator import itemgetter

# Spacy
import spacy
from spacy_spanish_lemmatizer import SpacyCustomLemmatizer

# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# statistics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# utils
from time import sleep
import os

# Carga de documentos

In [2]:
path_health = "../documents/health"
path_politics = "../documents/politics"
path_sports = "../documents/sports"
path_documents = "../documents"
path_stopwords = "../documents/stopwords.txt"

In [3]:
def load_document(path):
    return path.split("\\")[-1], open(path,encoding='utf-8').read(), path.split("\\")[-2]

In [4]:
documents = Parallel(n_jobs = -1)(delayed(load_document)(path) for path in glob.glob(path_documents+"/*/*.txt"))
documents = pd.DataFrame(documents, columns=["doc_name", "text", "class"])
documents['text'] = documents['text']

In [5]:
documents.head()

Unnamed: 0,doc_name,text,class
0,health_1.txt,Aceptémoslo de una vez: perder peso de manera ...,health
1,health_10.txt,"Sin tiempo para hacer recuento de daños, irrum...",health
2,health_11.txt,Mucha gente intenta mostrar en las redes socia...,health
3,health_12.txt,Una faceta clave en la frenética lucha global ...,health
4,health_13.txt,La curva de contagios de coronavirus se mantie...,health


## Preprocesado

In [6]:
REPLACE_NO_SPACE = re.compile("(\&)|(\%)|(\$)|(\€)|(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)|(\⁰)|(\•)|(\\')")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "
    
nlp = spacy.load("es")
lemmatizer = SpacyCustomLemmatizer()

def load_stopwords(path):
    return [line.strip() for line in open(path_stopwords, encoding = "utf-8").readlines()]

STOP_WORDS = set(load_stopwords(path_stopwords))

def delete_stop_words(doc):
    tokens = wordpunct_tokenize(doc)
    clean = [token for token in tokens if token not in STOP_WORDS and len(token) > 2]
    return clean

def preprocess_document(document):
    document = REPLACE_NO_SPACE.sub(NO_SPACE, document.lower())
    document = REPLACE_WITH_SPACE.sub(SPACE, document)
    # tokens = wordpunct_tokenize(document)
    # tokens = delete_proper_nouns(tokens)
    return document

def lemmatize(tokens):
    tokens = nlp(" ".join(tokens))
    return [token.lemma_ for token in tokens]


# TODO: REVISAR ESTO

def delete_proper_nouns(tokens):
    # Tag the tokens with their type - ie are they nouns or not
    lTokens = pos_tag(tokens)
    # find all the proper nouns and print them out
    lTagDict = findtags('NNP', lTokens)
    return [token.lower() for token in tokens if token not in lTagDict]
    
def findtags(tag_prefix, tagged_text):
    """
    Find tokens matching the specified tag_prefix
    """
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                  if tag.startswith(tag_prefix))
    print(cfd.conditions())
    return [list(cfd[tag].keys()) for tag in cfd.conditions()][0]

In [7]:
documents["preprocesado"] = documents["text"].apply(lambda x: preprocess_document(x))
documents["tokens"] = documents["preprocesado"].apply(lambda x: delete_stop_words(x))
# documents["lematizado"] = documents["preprocesado"].apply(lambda x: lemmatize(x))
documents.head()

Unnamed: 0,doc_name,text,class,preprocesado,tokens
0,health_1.txt,Aceptémoslo de una vez: perder peso de manera ...,health,aceptémoslo de una vez perder peso de manera r...,"[aceptémoslo, perder, peso, rápida, indolora, ..."
1,health_10.txt,"Sin tiempo para hacer recuento de daños, irrum...",health,sin tiempo para hacer recuento de daños irrump...,"[recuento, daños, irrumpe, ola, virus, golpear..."
2,health_11.txt,Mucha gente intenta mostrar en las redes socia...,health,mucha gente intenta mostrar en las redes socia...,"[gente, mostrar, redes, sociales, versión, fot..."
3,health_12.txt,Una faceta clave en la frenética lucha global ...,health,una faceta clave en la frenética lucha global ...,"[faceta, clave, frenética, lucha, global, pfiz..."
4,health_13.txt,La curva de contagios de coronavirus se mantie...,health,la curva de contagios de coronavirus se mantie...,"[curva, contagios, coronavirus, mantiene, espa..."


In [8]:
# Training
train_health = documents[documents["class"] == "health"].iloc[:15]
train_politics = documents[documents["class"] == "politics"].iloc[:15]
train_sports = documents[documents["class"] == "sports"].iloc[:15]

train_data = pd.concat([train_health, train_politics, train_sports])
print(f"Training data ==> {len(train_data)} documents")

Training data ==> 45 documents


In [9]:
# Testing
test_health = documents[documents["class"] == "health"].iloc[15:]
test_politics = documents[documents["class"] == "politics"].iloc[15:]
test_sports = documents[documents["class"] == "sports"].iloc[15:]

test_data = pd.concat([test_health, test_politics, test_sports])
test_data.reset_index(inplace = True)
print(f"Testing data ==> {len(test_data)} documents")

Testing data ==> 105 documents


### Bigramas

In [10]:
def get_bigrams(documents, threshold):
    token_ = [doc.split(" ") for doc in documents]
    bigram = Phrases(token_, min_count=1, threshold=threshold, delimiter=b' ')
    bigram_phraser = Phraser(bigram)
    bigram_token = []
    for sent in token_:
        for bigram in bigram_phraser[sent]:
            if len(bigram.split(" ")) > 1: # comprobamos que realmente es un bigrama
                bigram_token.append(bigram) 
    return bigram_token
           
def check_bigram(x, bigrams):
    if x.find("jamón serrano") != -1 or x.find("jamón") != -1:
        print(x)
    return [bigram for bigram in bigrams if x.find(bigram) != -1]

In [11]:
bigrams_sports = get_bigrams(train_sports["preprocesado"].values, 50)
bigrams_health = get_bigrams(train_health["preprocesado"].values, 50)
bigrams_politics = get_bigrams(train_politics["preprocesado"].values, 50)
bigrams = get_bigrams(train_data["preprocesado"].values, 50)


train_sports["bigrams"] = train_sports["preprocesado"].apply(lambda x: check_bigram(x, bigrams_sports))
train_health["bigrams"] = train_health["preprocesado"].apply(lambda x: check_bigram(x, bigrams_health))
train_politics["bigrams"] = train_politics["preprocesado"].apply(lambda x: check_bigram(x, bigrams_politics))

train_sports["tokens + bigrams"] = train_sports["tokens"] + train_sports["bigrams"]
train_health["tokens + bigrams"] = train_health["tokens"] + train_health["bigrams"]
train_politics["tokens + bigrams"] = train_politics["tokens"] + train_politics["bigrams"]

train_data["bigrams"] = train_data["preprocesado"].apply(lambda x: check_bigram(x, bigrams))
train_data["tokens + bigrams"] = train_data["tokens"] + train_data["bigrams"]

# Glosario

## Extracción de keywords

### Extracción propia

In [12]:
stopwords_dir = "../documents/stopwords.txt"

In [13]:
def get_k_tfidf_keywords(df, k):
    tokens = df["tokens + bigrams"].values
    dictionary = corpora.Dictionary(tokens)
    bow = [dictionary.doc2bow(doc) for doc in tokens]
    tfidf = models.TfidfModel(bow)
    bow_tfidf = tfidf[bow]
    tfidf_dic = {dictionary.get(id): value for doc in bow_tfidf for id, value in doc}
    tfidf_list = [k for k, v in sorted(tfidf_dic.items(), key=lambda item: item[1], reverse = True)]
    return tfidf_list[:k]

In [14]:
keywords_tfidf_health = get_k_tfidf_keywords(train_health, 100)
keywords_tfidf_politics = get_k_tfidf_keywords(train_politics, 100)
keywords_tfidf_sports = get_k_tfidf_keywords(train_sports, 100)

In [15]:
def remove_duplicates(d1, d2, d3):

    i1 = set(d1) & set(d2)
    i2 = set(d1) & set(d3)
    i3 = set(d2) & set(d3)
    
    deleted = set(list(i1.union(i2).union(i3)))
    
    for key in deleted:
        try:
            d1.remove(key)
        except:
            print(f"D1 no tiene {key}")
        try:
            d2.remove(key)
        except:
            print(f"D2 no tiene {key}")
        try:
            d3.remove(key)
        except:
            print(f"D3 no tiene {key}")
            
    return d1, d2, d3   

In [16]:
keywords_tfidf_health, keywords_tfidf_politics, keywords_tfidf_sports = remove_duplicates(keywords_tfidf_health, keywords_tfidf_politics, keywords_tfidf_sports)

D1 no tiene defensa
D1 no tiene récord


### Gensim

In [17]:
def get_k_gensim_keywords(data, k):
    data = data.copy()
    data["joined"] = data["tokens + bigrams"].apply(lambda x: " ".join(x))
    data['joined'] = data.joined.astype(str)
    data = " ".join(data["joined"].values)
    return [key[0] for key in keywords(data, scores=True, words=k, pos_filter=('NNP', 'JJ', "NNPS", "VB"))]

In [18]:
keywords_gensim_health = get_k_gensim_keywords(train_health, 100)
keywords_gensim_politics = get_k_gensim_keywords(train_politics, 100)
keywords_gensim_sports = get_k_gensim_keywords(train_sports, 100)

keywords_gensim_health, keywords_k_gensim_politics, keywords_k_gensim_sports = remove_duplicates(keywords_gensim_health, keywords_gensim_politics, keywords_gensim_sports)

D3 no tiene pandemia
D1 no tiene partido
D1 no tiene presidente
D3 no tiene persona
D1 no tiene real
D2 no tiene horas
D2 no tiene punto
D3 no tiene personas
D3 no tiene dia
D3 no tiene trata
D3 no tiene sin
D2 no tiene puntos
D3 no tiene casos
D2 no tiene grupo
D1 no tiene partidos
D1 no tiene espana
D1 no tiene situacion


### Kmeans

In [19]:
def get_k_kmeans_keywords(data, k):
    data = data.copy()
    data["joined"] = data["tokens"].apply(lambda x: " ".join(x))
    k_means_data = data["joined"].values
    vectorizer = TfidfVectorizer(ngram_range=(1,2))
    X = vectorizer.fit_transform(k_means_data)
    
    model = KMeans(n_clusters=3, init='k-means++', max_iter=1000, n_init=1, random_state = 5, algorithm="full")
    model.fit(X)
    
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    
    keywords_kmeans_politics = [terms[ind] for ind in order_centroids[0, :k]]
    keywords_kmeans_health = [terms[ind] for ind in order_centroids[1, :k]]
    keywords_kmeans_sports = [terms[ind] for ind in order_centroids[2, :k]]
    
    return keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports

In [20]:
keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports = get_k_kmeans_keywords(train_data, 100)
print(len(keywords_kmeans_politics), len(keywords_kmeans_health), len(keywords_kmeans_sports))
keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports = remove_duplicates(keywords_kmeans_politics, keywords_kmeans_health, keywords_kmeans_sports)
print(len(keywords_kmeans_politics), len(keywords_kmeans_health), len(keywords_kmeans_sports))

100 100 100
D3 no tiene sánchez
D3 no tiene rey
D1 no tiene forma
D1 no tiene infarto
D1 no tiene mundo
D3 no tiene fiscal
D3 no tiene partido
D3 no tiene presidente
D3 no tiene pedro sánchez
D3 no tiene psoe
D3 no tiene españa
D3 no tiene gobierno
D1 no tiene madrid
D3 no tiene real
D1 no tiene estudio
D3 no tiene pedro
88 83 94


## Formación del glosario

### Automatizado

In [21]:
def check_relevant_keywords(d1, d2, d3):
    i1 = set(d1) & set(d2)
    i2 = set(d1) & set(d3)
    i3 = set(d2) & set(d3)
    
    deleted = set(list(i1.union(i2).union(i3)))
    return deleted

In [22]:
relevant_keywords_politics = check_relevant_keywords(keywords_kmeans_politics, keywords_gensim_politics, keywords_tfidf_politics)
relevant_keywords_health = check_relevant_keywords(keywords_kmeans_health, keywords_gensim_health, keywords_tfidf_health)
relevant_keywords_sports = check_relevant_keywords(keywords_kmeans_sports, keywords_gensim_sports, keywords_tfidf_sports)

print("Politics ==> ", relevant_keywords_politics)
print("Sports ==> ", relevant_keywords_sports)
print("Health ==> ", relevant_keywords_health)

Politics ==>  {'bildu', 'militares', 'atención', 'vivienda', 'diputados', 'informes', 'texto', 'jiménez becerril', 'acuerdos', 'injurias', 'marín', 'ayuntamiento', 'código', 'desahucio', 'eta', 'eutanasia', 'erc', 'vox', 'presupuestos generales', 'navidades', 'iceta', 'felipe gonzález', 'independentistas', 'congreso', 'pablo iglesias', 'gonzález', 'ley', 'código penal', 'instituciones', 'armonización fiscal', 'mal', 'ascen', 'azurmendi', 'generales', 'proyecto', 'pnv', 'ministro', 'penal', 'casa real', 'becerril', 'ciudadanos', 'regulación', 'dsn', 'presupuestos', 'callar', 'armonización', 'jueves', 'sociedad', 'notas', 'alberto', 'comunidades', 'claro'}
Sports ==>  {'bla', 'campeones', 'equipo', 'franquicia', 'nike', 'balonmano', 'vaccaro', 'entrenador', 'haaland', 'campazzo', 'atlético', 'pasa', 'competir', 'realmente', 'curry', 'hablando', 'goles', 'liga', 'thompson', 'alfredo', 'bolt', 'escolta', 'duro', 'grupos', 'jornet', 'gasol', 'barça', 'historia', 'situaciones', 'michael jord

# Clasificador

## Carga de glosarios

In [23]:
path_keys_health = "../keywords/keys_health.txt"
path_keys_sports = "../keywords/keys_sports.txt"
path_keys_politics = "../keywords/keys_politics.txt"

In [24]:
keys_health = [key.strip() for key in open(path_keys_health, encoding="utf-8").readlines()]
keys_sports = [key.strip() for key in open(path_keys_sports, encoding="utf-8").readlines()]
keys_politics = [key.strip() for key in open(path_keys_politics, encoding="utf-8").readlines()]

keys_dic = {0: "health", 1: "sports", 2: "politics"}
inverted_keys_dic = {"health": 0, "sports": 1, "politics": 2}

## Bigramas de test data

In [25]:
test_data["bigrams"] = test_data["preprocesado"].apply(lambda x: check_bigram(x, bigrams))
test_data["tokens + bigrams"] = test_data["tokens"] + test_data["bigrams"]
test_data

Unnamed: 0,index,doc_name,text,class,preprocesado,tokens,bigrams,tokens + bigrams
0,15,health_23.txt,"Hace unos días Alejandro Díez, madrileño de 24...",health,hace unos días alejandro díez madrileño de añ...,"[días, alejandro, díez, madrileño, años, levan...","[se trata, este tipo, se trata, ha sido, es de...","[días, alejandro, díez, madrileño, años, levan..."
1,16,health_24.txt,Casi todos los planes contra el coronavirus un...,health,casi todos los planes contra el coronavirus un...,"[planes, coronavirus, pase, peor, basan, inmun...","[sobre todo, se trata, frente al, segunda ola,...","[planes, coronavirus, pase, peor, basan, inmun..."
2,17,health_25.txt,Un correcto descanso nocturno no sólo es impor...,health,un correcto descanso nocturno no sólo es impor...,"[correcto, descanso, nocturno, importante, sen...","[frente al, frente al, cada vez, new york, más...","[correcto, descanso, nocturno, importante, sen..."
3,18,health_26.txt,Los problemas de sueño son cada vez más frecue...,health,los problemas de sueño son cada vez más frecue...,"[problemas, sueño, frecuentes, sociedad, llega...","[muy probable, se trata, sino también, sin emb...","[problemas, sueño, frecuentes, sociedad, llega..."
4,19,health_27.txt,"El estrés de la rutina diaria, las preocupacio...",health,el estrés de la rutina diaria las preocupacion...,"[estrés, rutina, diaria, preocupaciones, labor...","[sin embargo, más allá, sin embargo, muchas pe...","[estrés, rutina, diaria, preocupaciones, labor..."
...,...,...,...,...,...,...,...,...
100,145,sports_50.txt,"La cuarentena que cumplen algunas gimnastas, l...",sports,la cuarentena que cumplen algunas gimnastas la...,"[cuarentena, cumplen, gimnastas, dificultades,...","[muchos casos, muchos casos, las autoridades, ...","[cuarentena, cumplen, gimnastas, dificultades,..."
101,146,sports_6.txt,El Sevilla obtuvo en Krasnodar su billete para...,sports,el sevilla obtuvo en krasnodar su billete para...,"[sevilla, obtuvo, krasnodar, billete, octavos,...","[todas las, todas las, todas las, todas las, t...","[sevilla, obtuvo, krasnodar, billete, octavos,..."
102,147,sports_7.txt,Ronald Koeman decidió dar una oportunidad a Ca...,sports,ronald koeman decidió dar una oportunidad a ca...,"[ronald, koeman, decidió, oportunidad, carles,...","[apostar por, apostar por, apostar por, frente...","[ronald, koeman, decidió, oportunidad, carles,..."
103,148,sports_8.txt,"\nChiellini, Bonucci, Barzagli, Zambrotta...la...",sports,\nchiellini bonucci barzagli zambrottala lista...,"[chiellini, bonucci, barzagli, zambrottala, li...","[frente al, sin embargo, frente al, sin embarg...","[chiellini, bonucci, barzagli, zambrottala, li..."


## TFIDF

In [26]:
glossaries = [keys_health, keys_sports, keys_politics]

In [27]:
dictionary = corpora.Dictionary(glossary for glossary in glossaries)
dictionary.save('keys.dict')  # store the dictionary, for future reference

In [28]:
class MyCorpus:
    
    def __init__(self, docs, dictionary):
        self.docs = docs
        self.dict = dictionary
        
    def __iter__(self):
        for doc in self.docs:
            # assume there's one document per line, tokens separated by whitespace
            yield self.dict.doc2bow(doc)

In [29]:
bow = MyCorpus(glossaries, dictionary)
corpora.MmCorpus.serialize("keys.mm", bow, metadata=True)

In [30]:
from gensim.test.utils import datapath, get_tmpfile
index_temp = get_tmpfile("index")
index = Similarity(index_temp, bow, num_features=len(dictionary))  # create index
index.save("keys.index")

In [31]:
model_tfidf = models.TfidfModel(bow)

In [32]:
def classify_document_tfidf(model, dictionary, bow, index, documents, i, verbose = False):
    """
    Given a specific document, computes the ranking of the classes and returns the current class, 
    the predicted class and the probabilities for each class.
    
    """
    document = documents.iloc[i]
    pq = document["tokens + bigrams"]
    vq = dictionary.doc2bow(pq)
    qtfidf = model[vq]
    sim = index[qtfidf]

    ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)

    if verbose:
        print("Document ==> " + document["text"][:100])
        for doc, score in ranking:
            cat = keys_dic[doc]
            print(f"[{cat}] ==> %.3f" % round(score,3))
            
    
    return [i, get_info_document(document, ranking, sim)]

In [33]:
def get_info_document(document, ranking, sim):
    """
    Given a ranking of classes, returns the current class, the predicted class and the probabilities for each class.
    
    """
    current_class = inverted_keys_dic[document["class"]]
    
    if np.sum(sim) == 0.0:
        predicted_class = np.random.randint(3)
        probabilities = np.array([1/3, 1/3, 1/3])
    else:
        predicted_class = ranking[0][0]
        tfidf_scores = np.array(sim)
        probabilities = tfidf_scores / np.sum(tfidf_scores)
    
    return {"current_class": current_class, "predicted_class": predicted_class, 
            "probabilities": probabilities}

In [34]:
_ = classify_document_tfidf(model_tfidf, dictionary, bow, index, test_data, 3, verbose = True)

Document ==> Los problemas de sueño son cada vez más frecuentes en nuestra sociedad, y llegan a afectar a uno de 
[health] ==> 0.257
[politics] ==> 0.093
[sports] ==> 0.000


In [35]:
def classify_tfidf(function, model, dictionary, bow, index, data):
    def classify(doc_i):
        return function(model, dictionary, bow, index, data, doc_i)
    return classify

# Clasificación de documentos

## Funciones auxiliares

In [36]:
def fill_test_data(test_data, infos):
    """
    Auxiliary function to fill the dataframe with info about the classification.
    
    """
    data = test_data.copy()
    current_class = pd.Series([info[1]["current_class"] for info in infos])
    predicted_class = pd.Series([info[1]["predicted_class"] for info in infos])
    p_health = pd.Series([info[1]["probabilities"][0] for info in infos])
    p_sports = pd.Series([info[1]["probabilities"][1] for info in infos])
    p_politics = pd.Series([info[1]["probabilities"][2] for info in infos])

    data["current_class"] = current_class
    data["predicted_class"] = predicted_class
    data["p_health"] = p_health
    data["p_sports"] = p_sports
    data["p_politics"] = p_politics
    
    return data

In [37]:
def classify_documents(test_data, classify):
    """
    Classifies the documents given a specific classification function.
    
    """
    test_data = test_data.copy()
    
    infos = [classify(i) for i in range(len(test_data))]
    data = fill_test_data(test_data, infos)
        
    return data 

In [38]:
def get_filename(df):
    """
    Computes the filename for each document based on the performed classification.
    
    """
    confidence = "%.3f" % df["confidence"]
    current_class = df["class"]
    predicted_class = df["predicted_class_name"]
    correct = current_class == predicted_class
    name = df["doc_name"].split(".")[0]
    
    return f"../classification/{predicted_class}/{confidence}_{name}-{correct}-{current_class}-{predicted_class}.txt"

In [39]:
def write_file(path, content):
    """
    Writes a file given its path and content.
    
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding = "utf-8") as f:
        f.write(content)

In [43]:
def move_files(data, tables = False):
    """
    Moves the files to their corresponding new directory after classification is done.
    
    """
    data = data.copy()
    classes = [[0, "p_health"], [1, "p_sports"], [2, "p_politics"]]

    for cl in classes:
        docs = data[data["current_class"] == cl[0]]
        docs = docs.sort_values(by=[cl[1]], ascending=False)
        docs["predicted_class_name"] = docs["predicted_class"].apply(lambda x : keys_dic[x])
        docs["confidence"] = docs[["p_health", "p_sports", "p_politics"]].max(axis=1)
        docs["file"] = docs.apply(lambda x: get_filename(x), axis=1)
        docs.apply(lambda row: write_file(row["file"], row["text"]), axis = 1)
        if tables:
            # tabla para la memoria
            docs = docs[["doc_name", "class", "p_health", "p_sports", "p_politics", "predicted_class_name"]]
            docs = docs.rename(columns = {"predicted_class_name": "predicted_class"})
            print(docs.to_latex(bold_rows = True, float_format="%.2f", column_format = "llllll", index=False))

In [44]:
def execute(test_data, classification_function, move = True, tables = False):
    """
    General function that classifies the documents.
    
    """
    data = test_data.copy()
    print("############################################################")
    print("Starting document´s classification...")
    data = classify_documents(data, classification_function)
    sleep(1)
    print("Document´s classification done...")
    if move:
        print("-----------------------------------------------------------")
        sleep(1)
        print("Moving files to the correct directories...")
        move_files(data, tables)
        sleep(1)
        print("Files moved.")
    print("############################################################")
    
    return data

## Clasificación

In [48]:
data = execute(test_data, classify_tfidf(classify_document_tfidf, model_tfidf, dictionary, bow, index, test_data))

############################################################
Starting document´s classification...
Document´s classification done...
-----------------------------------------------------------
Moving files to the correct directories...
Files moved.
############################################################


# Evaluación de modelos

## Funciones auxiliares

In [51]:
def evaluate_single_model(data, model, classify_function):
    """
    Function that evaluates the performance of a specific model.
    
    """
    data = data.copy()
    print("#######################################################")
    print("Evaluating "+ model + "...")
    print("-------------------------------------------------------")
    data = classify_documents(data, classify_function)
    
    y_true = data["current_class"]
    y_pred = data["predicted_class"]

    cm = confusion_matrix(y_true, y_pred)

    precisions = []
    recalls = []

    for i in range(len(cm[0])):
        name = keys_dic[i]
        print(f"Computing statistics about {name}:")
        precision = cm[i,i] / np.sum(cm[i,:])
        recall = cm[i,i] / np.sum(cm[:,i])
        print(f"\tPrecision ==> {precision}")
        print(f"\tRecall ==> {recall}")

        precisions.append(precision)
        recalls.append(recall)

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    f1 = f1_score(y_true, y_pred, average = "macro")
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Average precision ==> {precisions.mean()}")
    print(f"Average recall ==> {recalls.mean()}")
    print(f"F1-Score ==> {f1}")
    print(f"Overall accuracy score ==> {accuracy}")
    print("-------------------------------------------------------")
    print("Model evaluated")
    print("#######################################################")

In [52]:
evaluate_single_model(test_data, "tf-idf", classify_tfidf(classify_document_tfidf, model_tfidf, dictionary, bow, index, test_data))

#######################################################
Evaluating tf-idf...
-------------------------------------------------------
Computing statistics about health:
	Precision ==> 0.8857142857142857
	Recall ==> 0.7948717948717948
Computing statistics about sports:
	Precision ==> 0.8571428571428571
	Recall ==> 0.9090909090909091
Computing statistics about politics:
	Precision ==> 0.8857142857142857
	Recall ==> 0.9393939393939394
Average precision ==> 0.8761904761904762
Average recall ==> 0.8811188811188811
F1-Score ==> 0.8773184949655537
Overall accuracy score ==> 0.8761904761904762
-------------------------------------------------------
Model evaluated
#######################################################
