In [1]:
import spacy
import pandas as pd
from spacy.lang.es.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import unicodedata

nlp = spacy.load("es_core_news_md")

In [2]:
data = pd.read_csv("fake_news.csv")
data

Unnamed: 0,class,Text
0,True,Algunas de las voces extremistas más conocida...
1,True,Después de casi dos años y medio de luchas po...
2,True,Dos periodistas birmanos de la agencia Reuter...
3,True,El Cuerpo Nacional de Policía ha detenido a c...
4,True,El desfile de la firma en Roma se convierte e...
...,...,...
1995,True,El Consejo de Gobierno ha dado su visto bueno...
1996,True,Investigadores valencianos han desarrollado u...
1997,True,Los arrestados actuaban en coches y en establ...
1998,True,El Rey ha encargado este miércoles a Pedro Sá...


In [3]:
classes = data['class']
y = []
for i in classes:
    if i == True:
        y.append(1)
    else:
        y.append(0)

In [4]:
texts = data['Text'].values.tolist()

In [5]:
def normalize(text):
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    doc = nlp(text)
    tokens = [t for t in doc if t.text.isalpha() and t not in STOP_WORDS and len(t.text) > 2]
    words = []
    for t in tokens:
        words.append(t.text.lower())
    cleaned_text = " ".join(words)
    return cleaned_text


In [6]:
X = []
for i in texts:
    X.append(normalize(i))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cv = CountVectorizer()
train_bow = cv.fit_transform(X_train)
test_bow = cv.transform(X_test)
tv = TfidfVectorizer()
train_tfidf=tv.fit_transform(X_train)
test_tfidf=tv.transform(X_test)

In [13]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

cv = CountVectorizer()
tv = TfidfVectorizer()
svd = TruncatedSVD(n_components=10)
lsa_pipe_bow = make_pipeline(cv, svd)
lsa_pipe_tfidf = make_pipeline(tv, svd)
lsa_train_bow = lsa_pipe_bow.fit_transform(X_train)
lsa_test_bow= lsa_pipe_bow.transform(X_test)
lsa_train_tfidf = lsa_pipe_tfidf.fit_transform(X_train)
lsa_test_tfidf= lsa_pipe_tfidf.transform(X_test)

In [14]:
from sklearn.decomposition import LatentDirichletAllocation

cv = CountVectorizer()
tv = TfidfVectorizer()
lda = LatentDirichletAllocation(n_components=10, max_iter=5,
 learning_method='online',
 learning_offset=50.,
 random_state=0)
lda_pipe_bow=make_pipeline(cv, lda)
lda_pipe_tfidf=make_pipeline(tv, lda)
lda_train_bow=lda_pipe_bow.fit_transform(X_train)
lda_test_bow=lda_pipe_bow.transform(X_test)
lda_train_tfidf=lda_pipe_tfidf.fit_transform(X_train)
lda_test_tfidf=lda_pipe_tfidf.transform(X_test)

In [20]:
from sklearn import metrics
import numpy as np

def get_metrics(true_labels, predicted_labels):
    """Calculamos distintas métricas sobre el
    rendimiento del modelo. Devuelve un diccionario
    con los parámetros medidos"""
    
    return {
        'Accuracy': np.round(
                        metrics.accuracy_score(true_labels, 
                                               predicted_labels),
                        3),
        'Precision': np.round(
                        metrics.precision_score(true_labels, 
                                               predicted_labels,
                                               average='weighted',
                                               zero_division=0),
                        3),
    'Recall': np.round(
                        metrics.recall_score(true_labels, 
                                               predicted_labels,
                                               average='weighted',
                                               zero_division=0),
                        3),
    'F1 Score': np.round(
                        metrics.f1_score(true_labels, 
                                               predicted_labels,
                                               average='weighted',
                                               zero_division=0),
                        3)}
                        

def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    """Función que entrena un modelo de clasificación sobre
    un conjunto de entrenamiento, lo aplica sobre un conjunto
    de test y devuelve la predicción sobre el conjunto de test
    y las métricas de rendimiento"""
    # genera modelo    
    classifier.fit(train_features, train_labels)
    # predice usando el modelo sobre test
    predictions = classifier.predict(test_features) 
    # evalúa rendimiento de la predicción   
    metricas = get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)
    return predictions, metricas    

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC

modelLR = LogisticRegression(solver='liblinear')
modelNB = GaussianNB()
modelSVM = SGDClassifier(loss='hinge', max_iter=1000)
modelRBFSVM = SVC(gamma='scale', C=2)


modelos = [('Logistic Regression', modelLR),
           ('Naive Bayes', modelNB),
           ('Linear SVM', modelSVM),
            ('Gauss kernel SVM', modelRBFSVM)]

metricas = []
resultados = []

# Modelos bow
bow_train_features2 = train_bow.toarray()
bow_test_features2 = test_bow.toarray()
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=bow_train_features2,
                                           train_labels=y_train,
                                           test_features=bow_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} bow'
    resultados.append(prediccion)
    metricas.append(metrica)

 # Modelos tfidf
bow_train_features2 = train_tfidf.toarray()
bow_test_features2 = test_tfidf.toarray()
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=bow_train_features2,
                                           train_labels=y_train,
                                           test_features=bow_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} tfidf'
    resultados.append(prediccion)
    metricas.append(metrica)

# Modelos LSA bow
bow_train_features2 = lsa_train_bow
bow_test_features2 = lsa_test_bow
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=bow_train_features2,
                                           train_labels=y_train,
                                           test_features=bow_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} LSA bow'
    resultados.append(prediccion)
    metricas.append(metrica)

# Modelos LSA tfidf
bow_train_features2 = lsa_train_tfidf
bow_test_features2 = lsa_test_tfidf
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=bow_train_features2,
                                           train_labels=y_train,
                                           test_features=bow_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} LSA tfidf'
    resultados.append(prediccion)
    metricas.append(metrica)

# Modelos LDA bow
bow_train_features2 = lda_train_bow
bow_test_features2 = lda_test_bow
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=bow_train_features2,
                                           train_labels=y_train,
                                           test_features=bow_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} LDA bow'
    resultados.append(prediccion)
    metricas.append(metrica)

# Modelos LDA bow
bow_train_features2 = lda_train_tfidf
bow_test_features2 = lda_test_tfidf
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=bow_train_features2,
                                           train_labels=y_train,
                                           test_features=bow_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} LDA tfidf'
    resultados.append(prediccion)
    metricas.append(metrica)

In [21]:
import pandas as pd
metricasDf=pd.DataFrame(metricas)
# Ya que es un modelo con varias etiquetas tiene más sentido seleccionar el mejor por la cantidad de veces que acertó, por lo que se usa el accuracy
metricasDf=metricasDf.sort_values("Accuracy")
print(metricasDf)

    Accuracy  Precision  Recall  F1 Score                         modelo
20     0.482      0.489   0.482     0.423  Logistic Regression LDA tfidf
23     0.488      0.501   0.488     0.430     Gauss kernel SVM LDA tfidf
22     0.495      0.513   0.495     0.437           Linear SVM LDA tfidf
21     0.520      0.537   0.520     0.369          Naive Bayes LDA tfidf
16     0.525      0.521   0.525     0.498    Logistic Regression LDA bow
10     0.525      0.540   0.525     0.407             Linear SVM LSA bow
18     0.527      0.684   0.527     0.375             Linear SVM LDA bow
19     0.535      0.624   0.535     0.405       Gauss kernel SVM LDA bow
17     0.537      0.691   0.537     0.399            Naive Bayes LDA bow
13     0.557      0.562   0.557     0.530          Naive Bayes LSA tfidf
9      0.575      0.600   0.575     0.558            Naive Bayes LSA bow
11     0.590      0.594   0.590     0.589       Gauss kernel SVM LSA bow
14     0.598      0.667   0.598     0.540          