#### Carga de los paquetes necesarios

In [1]:
import pandas as pd
import numpy as np
# Paquetes para el procesado y la limpieza (NLP)
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
# Extraccion de características
from sklearn.feature_extraction.text import TfidfVectorizer
# Clasificación y evaluación
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm, neural_network
from sklearn.metrics import accuracy_score, confusion_matrix

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Particular\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Particular\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Particular\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Particular\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Set Random seed
np.random.seed(500)

#### Carga del dataset

In [3]:
data = pd.read_excel("text_classification_dataset.xlsx", sheet_name = "Sheet1")
data.head()

Unnamed: 0,text,type
0,@ACNI2012 @TheToka920 Never knew having 1 or 2...,sports
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports
2,The current state of last year's @BBL finalist...,sports
3,@HOLLYJISOO Why did you bring a cricket...,sports
4,Babar Azam only Pakistani included in the ICC ...,sports


In [4]:
data["type"].unique()

array(['sports', 'entertainment', 'medical', 'politics'], dtype=object)

#### Limpieza y procesado del texto

In [5]:
# Paso 1: Eliminar valores nulos si es que los hay
data['text'].dropna(inplace=True)

In [6]:
# Paso 2: Pasar todo el texto a minúsculas. This is required as python interprets 'dog' and 'DOG' differently
data['text'] = [entry.lower() for entry in data['text']]

In [7]:
# Paso 3: Tokenización 
data['text']= [word_tokenize(entry) for entry in data['text']]

In [8]:
# Paso 4: Eliminar las palabras vacías (stop words) o caracterés no alfabéticos y realizar la lematización.

# WordNetLemmatizer() requiere del etiquetado gramatical (pos_tag()) para entender si la palabra es un nombre, verbo 
# o adjetivo, etc.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index,entry in enumerate(data['text']):
    # Declarando la lista vacía para almacenar las palabras que siguen las reglas de este paso
    Final_words = []
    # Inicializando WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        # Comprobar si no es una palabra vacia y considerar solo carácteres alfabéticos
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # El conjunto final de palabras procesadas para cada iteración se almacenará en "text_final"
    data.loc[index,'text_final'] = str(Final_words)

#### Dividir la muestra en train y test

In [9]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(data['text_final'], data['type'], test_size=0.3)

In [10]:
a = test_y

#### Codificar las etiquetas

In [11]:
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

#### Vectorizar las palabras mediante *TF-IDF*

In [12]:
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_vect.fit(data['text_final'])

train_X_tfidf = tfidf_vect.transform(train_X)
test_X_tfidf = tfidf_vect.transform(test_X)

#### Clasificación

In [13]:
labels = encoder.inverse_transform(np.unique(test_y))

In [14]:
# Naive Bayes
NB = naive_bayes.MultinomialNB()
NB.fit(train_X_tfidf,train_y)

predictions_NB = NB.predict(test_X_tfidf)

print("Naive Bayes Accuracy Score:  ", accuracy_score(test_y, predictions_NB)*100)


Naive Bayes Accuracy Score:   79.36962750716332


In [15]:
pd.DataFrame(confusion_matrix(test_y, predictions_NB), index=labels, columns=labels)

Unnamed: 0,entertainment,medical,politics,sports
entertainment,64,6,19,5
medical,1,67,14,1
politics,0,7,93,4
sports,1,5,9,53


In [16]:
# Support Vector Machine
SVM = svm.SVC(C=10.0, kernel='linear')
SVM.fit(train_X_tfidf,train_y)

predictions_SVM = SVM.predict(test_X_tfidf)

print("SVM Accuracy Score: ", accuracy_score(test_y, predictions_SVM)*100)

SVM Accuracy Score:  81.9484240687679


In [17]:
pd.DataFrame(confusion_matrix(test_y, predictions_SVM), index=labels, columns=labels)

Unnamed: 0,entertainment,medical,politics,sports
entertainment,74,3,14,3
medical,1,62,19,1
politics,3,6,94,1
sports,1,2,9,56


In [18]:
# Red Neuronal
RN = neural_network.MLPClassifier(activation = 'tanh', hidden_layer_sizes = (50, 50, 50), solver = 'sgd', max_iter=1000)
RN.fit(train_X_tfidf,train_y)

predictions_RN = RN.predict(test_X_tfidf)

print("RN Accuracy Score: ", accuracy_score(test_y, predictions_RN)*100)

RN Accuracy Score:  77.65042979942693


In [19]:
pd.DataFrame(confusion_matrix(test_y, predictions_RN), index=labels, columns=labels)

Unnamed: 0,entertainment,medical,politics,sports
entertainment,68,1,23,2
medical,0,53,29,1
politics,1,3,96,4
sports,1,0,13,54
