# Clasificador documental VSM

In [1]:
import os
import json
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

## Funciones auxiliares y carga de los glosarios
Definimos aquí las funciones necesarias para preprocesar los documentos y calcular la similitud del coseno. Después cargamos los glosarios para poder utilizarlos en los tres temas.

In [2]:
def preprocess_document(doc):
    """Tokenize a document and drop uninformative tokens.

    The text is split with NLTK's ``wordpunct_tokenize``. A token is kept
    (lower-cased) only when its lower-cased form is not in NLTK's Spanish
    stopword list, it is longer than two characters, and it is not purely
    numeric.

    Args:
        doc: Raw document text.

    Returns:
        List of the surviving lower-cased tokens, in document order.
    """
    spanish_stopwords = set(stopwords.words('spanish'))
    kept = []
    for raw_token in wordpunct_tokenize(doc):
        lowered = raw_token.lower()
        if lowered in spanish_stopwords:
            continue
        # Length and numeric checks are applied to the token as tokenized.
        if len(raw_token) <= 2 or raw_token.isnumeric():
            continue
        kept.append(lowered)
    return kept

In [3]:
def get_input(path, glosary_documents):
    """Read text files and return each one as a preprocessed string.

    For every file, lines that consist only of a newline are dropped, the
    trailing newline characters are removed and the remaining lines are joined
    with single spaces. The result is passed through ``preprocess_document``
    and the returned tokens are joined with spaces again.

    Each file is preprocessed exactly once, and an empty list is returned when
    no file names are given.

    Args:
        path: Prefix concatenated directly with each file name, so a directory
            must be given with its trailing separator (e.g. ``'./data/x/'``).
        glosary_documents: Iterable of file names to read under ``path``.

    Returns:
        List with one space-joined token string per input file, in input
        order.
    """
    processed_docs = []
    for document in glosary_documents:
        with open(path + document) as file:
            lines = [line.replace('\n', '') for line in file if line != '\n']
        tokens = preprocess_document(' '.join(lines))
        processed_docs.append(' '.join(tokens))
    return processed_docs

In [4]:
def get_similar_articles(q, df, vectorizer, test_documents):
    """Score every document column of ``df`` against a query by cosine similarity.

    Args:
        q: Query text; it is vectorized with ``vectorizer.transform``.
        df: Term-document DataFrame with one row per vocabulary term and one
            column per document. Its row count must match the length of the
            vector produced by ``vectorizer``.
        vectorizer: Fitted object exposing ``transform``; the result must
            provide ``toarray()``.
        test_documents: Unused; kept so existing callers keep working.

    Returns:
        List with one similarity per document column, in column order, rounded
        to three decimals. A pair in which either vector has zero norm scores
        0.0 instead of producing NaN.
    """
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)
    scores = []
    # Iterate positionally over the columns so the column labels do not matter.
    for doc_vec in df.values.T:
        # Cosine similarity divides by the PRODUCT of both norms.
        denominator = np.linalg.norm(doc_vec) * q_norm
        if denominator:
            similarity = float(np.dot(doc_vec, q_vec) / denominator)
        else:
            similarity = 0.0
        scores.append(round(similarity, 3))
    return scores

In [5]:
# Load the glossary file and build one query string per topic by joining the
# entries stored under that topic's key with single spaces.
with open('./glosario/glosarios.json', 'r') as fp:
    data = json.load(fp)
query_salud, query_economia, query_deportes = (
    ' '.join(data[topic]) for topic in ('salud', 'economia', 'deportes')
)

# VSM para el tema salud

In [7]:
# Folder with the health ("salud") articles.
path_salud = './data/salud/'
# Disabled: selection of the first 15 directory entries.
#glosary_documents = os.listdir(path_salud)[:15]
# Evaluate on every directory entry after the first 15.
# NOTE(review): os.listdir() returns entries in arbitrary order, so this split
# is only reproducible while the listing order stays the same - confirm.
test_documents = os.listdir(path_salud)[15:]

In [8]:
# Build the TF-IDF matrix for the health test documents.
processed_docs = get_input(path_salud, test_documents)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_docs)
# Transpose so rows are vocabulary terms and columns are documents.
X = X.T.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df_salud = pd.DataFrame(X, index=feature_names)

In [9]:
# Score the health test documents against each of the three topic glossaries.
scores_query_salud, scores_query_economia, scores_query_deportes = (
    get_similar_articles(topic_query, df_salud, vectorizer, test_documents)
    for topic_query in (query_salud, query_economia, query_deportes)
)

In [10]:
# Predict, for each test document, the topic whose glossary scored highest.
# The label order matches the order of the scores handed to np.argmax; ties
# resolve to the first maximum.
topic_labels = ['salud', 'economia', 'deportes']
y_pred = [
    topic_labels[np.argmax(doc_scores)]
    for doc_scores in zip(scores_query_salud, scores_query_economia, scores_query_deportes)
]
# Ground truth: every document in this section is labelled 'salud'.
y = ['salud'] * len(scores_query_salud)

In [11]:
# Print the classification report of the predictions against the all-'salud'
# ground truth built above.
print(classification_report(y,y_pred))

              precision    recall  f1-score   support

    economia       0.00      0.00      0.00         0
       salud       1.00      0.93      0.97        15

    accuracy                           0.93        15
   macro avg       0.50      0.47      0.48        15
weighted avg       1.00      0.93      0.97        15



  _warn_prf(average, modifier, msg_start, len(result))


# VSM para el tema economía

In [12]:
# Folder with the economy ("economia") articles.
path_economia = './data/economia/'
# Disabled: selection of the first 15 directory entries.
# NOTE(review): this disabled line lists path_salud, not path_economia -
# looks like a copy-paste leftover; confirm before re-enabling.
#glosary_documents = os.listdir(path_salud)[:15]
# Evaluate on every directory entry after the first 15.
# NOTE(review): os.listdir() returns entries in arbitrary order, so this split
# is only reproducible while the listing order stays the same - confirm.
test_documents = os.listdir(path_economia)[15:]

In [13]:
# Build the TF-IDF matrix for the economy test documents.
processed_docs = get_input(path_economia, test_documents)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_docs)
# Transpose so rows are vocabulary terms and columns are documents.
X = X.T.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df_economia = pd.DataFrame(X, index=feature_names)

In [14]:
# Score the economy test documents against each of the three topic glossaries.
scores_query_salud, scores_query_economia, scores_query_deportes = (
    get_similar_articles(topic_query, df_economia, vectorizer, test_documents)
    for topic_query in (query_salud, query_economia, query_deportes)
)

In [15]:
# Predict, for each test document, the topic whose glossary scored highest.
# The label order matches the order of the scores handed to np.argmax; ties
# resolve to the first maximum.
topic_labels = ['salud', 'economia', 'deportes']
y_pred = [
    topic_labels[np.argmax(doc_scores)]
    for doc_scores in zip(scores_query_salud, scores_query_economia, scores_query_deportes)
]
# Ground truth: every document in this section is labelled 'economia'.
y = ['economia'] * len(scores_query_economia)

In [16]:
# Print the classification report of the predictions against the
# all-'economia' ground truth built above.
print(classification_report(y,y_pred))

              precision    recall  f1-score   support

    deportes       0.00      0.00      0.00         0
    economia       1.00      0.87      0.93        15
       salud       0.00      0.00      0.00         0

    accuracy                           0.87        15
   macro avg       0.33      0.29      0.31        15
weighted avg       1.00      0.87      0.93        15



  _warn_prf(average, modifier, msg_start, len(result))


# VSM para el tema deportes

In [17]:
# Folder with the sports ("deportes") articles.
path_deportes = './data/deportes/'
# Disabled: selection of the first 15 directory entries.
# NOTE(review): this disabled line lists path_salud, not path_deportes -
# looks like a copy-paste leftover; confirm before re-enabling.
#glosary_documents = os.listdir(path_salud)[:15]
# Evaluate on every directory entry after the first 15.
# NOTE(review): os.listdir() returns entries in arbitrary order, so this split
# is only reproducible while the listing order stays the same - confirm.
test_documents = os.listdir(path_deportes)[15:]

In [18]:
# Build the TF-IDF matrix for the sports test documents.
processed_docs = get_input(path_deportes, test_documents)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_docs)
# Transpose so rows are vocabulary terms and columns are documents.
X = X.T.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df_deportes = pd.DataFrame(X, index=feature_names)

In [19]:
# Score the sports test documents against each of the three topic glossaries.
scores_query_salud, scores_query_economia, scores_query_deportes = (
    get_similar_articles(topic_query, df_deportes, vectorizer, test_documents)
    for topic_query in (query_salud, query_economia, query_deportes)
)

In [20]:
# Predict, for each test document, the topic whose glossary scored highest.
# The label order matches the order of the scores handed to np.argmax; ties
# resolve to the first maximum.
topic_labels = ['salud', 'economia', 'deportes']
y_pred = [
    topic_labels[np.argmax(doc_scores)]
    for doc_scores in zip(scores_query_salud, scores_query_economia, scores_query_deportes)
]
# Ground truth: every document in this section is labelled 'deportes'.
y = ['deportes'] * len(scores_query_salud)

In [21]:
# Print the classification report of the predictions against the
# all-'deportes' ground truth built above.
print(classification_report(y,y_pred))

              precision    recall  f1-score   support

    deportes       1.00      1.00      1.00        15

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

