# Obtención de todos los textos de los tres temas
Para estos clasificadores no hace falta usar glosario, ya que vamos a entrenar los modelos con los documentos escogidos para construir el glosario y a evaluarlos con los documentos escogidos para test. A continuación se cargan y preprocesan dichos documentos, y se guardan en una lista única: habrá una lista para entrenamiento y otra para test. Además, hay que crear la clasificación real (las etiquetas) de los documentos en el orden correspondiente.

In [1]:
import os
import json
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
def preprocess_document(doc):
    """Tokenize a document and keep only its informative tokens.

    The text is split with NLTK's ``wordpunct_tokenize``. A token is kept
    (in lower case) when its lower-cased form is not in NLTK's Spanish
    stopword list, the token is longer than two characters and the token is
    not numeric.

    Args:
        doc: Raw document text.

    Returns:
        List of lower-cased tokens, in the order they appear in ``doc``.
    """
    spanish_stopwords = set(stopwords.words('spanish'))
    kept = []
    for word in wordpunct_tokenize(doc):
        lowered = word.lower()
        if lowered in spanish_stopwords:
            continue
        # Length and numeric checks are made on the token as tokenized,
        # before lower-casing.
        if len(word) <= 2 or word.isnumeric():
            continue
        kept.append(lowered)
    return kept

In [3]:
def get_input(path, glosary_documents):
    """Load and preprocess a list of documents from one directory.

    Each file is read, its blank lines are dropped, the remaining lines are
    joined with single spaces and the result is passed through
    ``preprocess_document``; the surviving tokens are joined back into one
    space-separated string.

    Args:
        path: Directory prefix. File names are appended to it by plain
            string concatenation, so it must end with a path separator.
        glosary_documents: Iterable of file names located under ``path``.

    Returns:
        List with one preprocessed string per input file, in input order;
        an empty list when ``glosary_documents`` is empty.
    """
    # Initialised before the loop so that an empty input yields [] instead of
    # failing on an unbound name at the return statement.
    processed_docs = []
    for document in glosary_documents:
        # NOTE(review): the file is opened with the platform default
        # encoding; confirm the corpus encoding before pinning one here.
        with open(path + document) as file:
            lines = [line.replace('\n', '') for line in file if line != '\n']
        # Each document is preprocessed exactly once, as soon as it is read,
        # rather than re-processing every previously read document.
        tokens = preprocess_document(' '.join(lines))
        processed_docs.append(' '.join(tokens))
    return processed_docs

In [4]:
# Corpus directories, one per topic. Each ends with a slash because file
# names are appended to these prefixes by string concatenation.
path_salud, path_economia, path_deportes = (
    './data/salud/',
    './data/economia/',
    './data/deportes/',
)

In [5]:
# Training split: the first 15 entries listed in each topic directory.
# NOTE(review): os.listdir returns entries in arbitrary order, so which files
# land in this split depends on the directory listing; the test cell takes
# the complementary [15:] slice from a separate os.listdir call.
train_documents_salud, train_documents_economia, train_documents_deportes = (
    os.listdir(topic_dir)[:15]
    for topic_dir in (path_salud, path_economia, path_deportes)
)

# Load and preprocess every training file, topic by topic.
processed_train_salud, processed_train_economia, processed_train_deportes = (
    get_input(topic_dir, file_names)
    for topic_dir, file_names in (
        (path_salud, train_documents_salud),
        (path_economia, train_documents_economia),
        (path_deportes, train_documents_deportes),
    )
)

# Single training list, ordered salud -> economia -> deportes.
docs_train = [
    *processed_train_salud,
    *processed_train_economia,
    *processed_train_deportes,
]

In [6]:
# Test split: every entry after the first 15 listed in each topic directory.
# NOTE(review): os.listdir returns entries in arbitrary order; this slice is
# only the complement of the training slice if the listing comes back in the
# same order as it did in the training cell.
test_documents_salud, test_documents_economia, test_documents_deportes = (
    os.listdir(topic_dir)[15:]
    for topic_dir in (path_salud, path_economia, path_deportes)
)

# Load and preprocess every test file, topic by topic.
processed_docs_salud, processed_docs_economia, processed_docs_deportes = (
    get_input(topic_dir, file_names)
    for topic_dir, file_names in (
        (path_salud, test_documents_salud),
        (path_economia, test_documents_economia),
        (path_deportes, test_documents_deportes),
    )
)

# Single test list, ordered salud -> economia -> deportes.
docs_test = [
    *processed_docs_salud,
    *processed_docs_economia,
    *processed_docs_deportes,
]

In [10]:
# Reference labels: 15 per topic, in the same salud -> economia -> deportes
# order used to build the document lists. The same `y` is later passed both
# as the training target and as the ground truth for the test predictions,
# which is only valid when each test split also holds 15 documents.
y_salud, y_economia, y_deportes = (
    [topic] * 15 for topic in ('salud', 'economia', 'deportes')
)
y = [*y_salud, *y_economia, *y_deportes]

In [11]:
# Fit the TF-IDF vectorizer on the training documents only, then project
# both splits with that same fitted vectorizer.
vectorizer = TfidfVectorizer().fit(docs_train)
train_X_tfidf, test_X_tfidf = (
    vectorizer.transform(split) for split in (docs_train, docs_test)
)

# Clasificador KNN para todos los temas

In [12]:
# 3-nearest-neighbours classifier trained on the TF-IDF training matrix,
# then used to label the test matrix.
neigh = KNeighborsClassifier(n_neighbors=3).fit(train_X_tfidf, y)
y_pred = neigh.predict(test_X_tfidf)

In [14]:
# Per-class precision/recall/F1 for the predictions currently in `y_pred`
# (the KNN ones when the cells run in order), scored against `y`.
print(classification_report(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

    deportes       1.00      1.00      1.00        15
    economia       0.88      0.93      0.90        15
       salud       0.93      0.87      0.90        15

    accuracy                           0.93        45
   macro avg       0.93      0.93      0.93        45
weighted avg       0.93      0.93      0.93        45



# Clasificador Naive Bayes para todos los temas

In [18]:
# Gaussian Naive Bayes on the same TF-IDF features; both matrices are
# converted with toarray() before being handed to the classifier.
gnb = GaussianNB().fit(train_X_tfidf.toarray(), y)
y_pred = gnb.predict(test_X_tfidf.toarray())

In [19]:
# Per-class precision/recall/F1 for the predictions currently in `y_pred`
# (the Naive Bayes ones when the cells run in order), scored against `y`.
print(classification_report(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

    deportes       1.00      1.00      1.00        15
    economia       0.94      1.00      0.97        15
       salud       1.00      0.93      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

