Equipo:
Carvajal Hernandez, German Andres y
Veitsman, Yana

In [117]:
import spacy
import pandas as pd
from spacy.lang.es.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import unicodedata

nlp = spacy.load("es_core_news_md")

In [118]:
data = pd.read_csv("fake_news.csv")
data

Unnamed: 0,class,Text
0,True,Algunas de las voces extremistas más conocida...
1,True,Después de casi dos años y medio de luchas po...
2,True,Dos periodistas birmanos de la agencia Reuter...
3,True,El Cuerpo Nacional de Policía ha detenido a c...
4,True,El desfile de la firma en Roma se convierte e...
...,...,...
1995,True,El Consejo de Gobierno ha dado su visto bueno...
1996,True,Investigadores valencianos han desarrollado u...
1997,True,Los arrestados actuaban en coches y en establ...
1998,True,El Rey ha encargado este miércoles a Pedro Sá...


In [119]:
# Vamos a etiquetar todas las noticias no verdaderos con un "1" en nuestra lista de etiquetas. Esto se hace para que las medidas de performance del algoritmo estén relacionadas con la identifiación de noticias falsas.
classes = data['class']
y = []
for i in classes:
    if i == False:
        y.append(1)
    else:
        y.append(0)

In [120]:
texts = data['Text'].values.tolist()

In [121]:
# Vamos a preparar diferentes funciones del preprocessado para compararlas después de elegir un modelo mejor

# Esa función será una función básica para nuestra comparación
def normalize_without_lemma(text):
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    doc = nlp(text)
    tokens = [t for t in doc if t.text.isalpha() and t not in STOP_WORDS and len(t.text) > 2]
    words = []
    for t in tokens:
        words.append(t.text.lower())
    cleaned_text = " ".join(words)
    return cleaned_text

# Añadimos no palabras, pero lemas.
def normalize_with_lemma(text):
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    doc = nlp(text)
    tokens = [t.lemma_ for t in doc if t.text.isalpha() and t not in STOP_WORDS and len(t.text) > 2]
    words = []
    for t in tokens:
        words.append(t.lower())
    cleaned_text = " ".join(words)
    return cleaned_text

# Normalización sin extración de STOP_WORDs y palabras de longitud <= 2
def basic_normalizing(text):
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    doc = nlp(text)
    tokens = [t for t in doc if t.text.isalpha()]
    words = []
    for t in tokens:
        words.append(t.text.lower())
    cleaned_text = " ".join(words)
    return cleaned_text

In [122]:
X = []
for i in texts:
    X.append(normalize_without_lemma(i))

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

print("Logitud de train set: "+str(len(X_train)))
print("Logitud de test set: "+str(len(X_test)))

Logitud de train set: 1400
Logitud de test set: 600


In [126]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Extraemos las caracteristicas usando las tecnicas base de BoW y TF-IDF

cv = CountVectorizer()
train_bow = cv.fit_transform(X_train)
test_bow = cv.transform(X_test)
tv = TfidfVectorizer()
train_tfidf=tv.fit_transform(X_train)
test_tfidf=tv.transform(X_test)

print("Dimensiones BoW Train: "+str(train_bow.shape))
print("Dimensiones BoW Test: "+str(test_bow.shape))
print("Dimensiones TF-IDF Train: "+str(train_tfidf.shape))
print("Dimensiones TF-IDF Test: "+str(test_tfidf.shape))

Dimensiones BoW Train: (1400, 10330)
Dimensiones BoW Test: (600, 10330)
Dimensiones TF-IDF Train: (1400, 10330)
Dimensiones TF-IDF Test: (600, 10330)


In [174]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

#Realizamos vectorizado con BoW y TF-IDF pero utilizando LSA para reducir dimensiones.

cv = CountVectorizer()
tv = TfidfVectorizer()
svd = TruncatedSVD(n_components=1000)
lsa_pipe_bow = make_pipeline(cv, svd)
lsa_pipe_tfidf = make_pipeline(tv, svd)
lsa_train_bow = lsa_pipe_bow.fit_transform(X_train)
lsa_test_bow= lsa_pipe_bow.transform(X_test)
lsa_train_tfidf = lsa_pipe_tfidf.fit_transform(X_train)
lsa_test_tfidf= lsa_pipe_tfidf.transform(X_test)

print("Dimensiones LSA BoW Train: "+str(lsa_train_bow.shape))
print("Dimensiones LSA BoW Test: "+str(lsa_test_bow.shape))
print("Dimensiones LSA TF-IDF Train: "+str(lsa_train_tfidf.shape))
print("Dimensiones LSA TF-IDF Test: "+str(lsa_test_tfidf.shape))

Dimensiones LSA BoW Train: (1400, 1000)
Dimensiones LSA BoW Test: (600, 1000)
Dimensiones LSA TF-IDF Train: (1400, 1000)
Dimensiones LSA TF-IDF Test: (600, 1000)


In [175]:
from sklearn.decomposition import LatentDirichletAllocation

#Realizamos vectorizado con BoW y TF-IDF pero utilizando LDA para reducir dimensiones.

cv = CountVectorizer()
tv = TfidfVectorizer()
lda = LatentDirichletAllocation(n_components=1000, max_iter=5,
 learning_method='online',
 learning_offset=50.,
 random_state=0)
lda_pipe_bow=make_pipeline(cv, lda)
lda_pipe_tfidf=make_pipeline(tv, lda)
lda_train_bow=lda_pipe_bow.fit_transform(X_train)
lda_test_bow=lda_pipe_bow.transform(X_test)
lda_train_tfidf=lda_pipe_tfidf.fit_transform(X_train)
lda_test_tfidf=lda_pipe_tfidf.transform(X_test)

print("Dimensiones LDA BoW Train: "+str(lda_train_bow.shape))
print("Dimensiones LDA BoW Test: "+str(lda_test_bow.shape))
print("Dimensiones LDA TF-IDF Train: "+str(lda_train_tfidf.shape))
print("Dimensiones LDA TF-IDF Test: "+str(lda_test_tfidf.shape))

  return np.exp(-1.0 * perword_bound)
Dimensiones LDA BoW Train: (1400, 1000)
Dimensiones LDA BoW Test: (600, 1000)
Dimensiones LDA TF-IDF Train: (1400, 1000)
Dimensiones LDA TF-IDF Test: (600, 1000)


In [176]:
from sklearn import metrics
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC

def get_metrics(true_labels, predicted_labels):
    """Calculamos distintas métricas sobre el
    rendimiento del modelo. Devuelve un diccionario
    con los parámetros medidos"""
    
    return {
        'Accuracy': np.round(
                        metrics.accuracy_score(true_labels, 
                                               predicted_labels),
                        3),
        'Precision': np.round(
                        metrics.precision_score(true_labels, 
                                               predicted_labels,
                                               average='weighted',
                                               zero_division=0),
                        3),
    'Recall': np.round(
                        metrics.recall_score(true_labels, 
                                               predicted_labels,
                                               average='weighted',
                                               zero_division=0),
                        3),
    'F1 Score': np.round(
                        metrics.f1_score(true_labels, 
                                               predicted_labels,
                                               average='weighted',
                                               zero_division=0),
                        3)}
                        

def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    """Función que entrena un modelo de clasificación sobre
    un conjunto de entrenamiento, lo aplica sobre un conjunto
    de test y devuelve la predicción sobre el conjunto de test
    y las métricas de rendimiento"""
    # genera modelo    
    classifier.fit(train_features, train_labels)
    # predice usando el modelo sobre test
    predictions = classifier.predict(test_features) 
    # evalúa rendimiento de la predicción   
    metricas = get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)
    return predictions, metricas    

modelLR = LogisticRegression(solver='liblinear')
modelNB = GaussianNB()
modelSVM = SGDClassifier(loss='hinge', max_iter=1000)
modelRBFSVM = SVC(gamma='scale', C=2)


modelos = [('Logistic Regression', modelLR),
           ('Naive Bayes', modelNB),
           ('Linear SVM', modelSVM),
            ('Gauss kernel SVM', modelRBFSVM)]

metricas = []
resultados = []

# Modelos bow
bow_train_features2 = train_bow.toarray()
bow_test_features2 = test_bow.toarray()
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=bow_train_features2,
                                           train_labels=y_train,
                                           test_features=bow_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} bow'
    resultados.append(prediccion)
    metricas.append(metrica)

 # Modelos tfidf
tfidf_train_features2 = train_tfidf.toarray()
tfidf_test_features2 = test_tfidf.toarray()
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=tfidf_train_features2,
                                           train_labels=y_train,
                                           test_features=tfidf_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} tfidf'
    resultados.append(prediccion)
    metricas.append(metrica)

# Modelos LSA bow
bow_train_features2 = lsa_train_bow
bow_test_features2 = lsa_test_bow
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=bow_train_features2,
                                           train_labels=y_train,
                                           test_features=bow_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} LSA bow'
    resultados.append(prediccion)
    metricas.append(metrica)

# Modelos LSA tfidf
tfidf_train_features2 = lsa_train_tfidf
tfidf_test_features2 = lsa_test_tfidf
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=tfidf_train_features2,
                                           train_labels=y_train,
                                           test_features=tfidf_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} LSA tfidf'
    resultados.append(prediccion)
    metricas.append(metrica)

# Modelos LDA bow
bow_train_features2 = lda_train_bow
bow_test_features2 = lda_test_bow
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=bow_train_features2,
                                           train_labels=y_train,
                                           test_features=bow_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} LDA bow'
    resultados.append(prediccion)
    metricas.append(metrica)

# Modelos LDA tfidf
tfidf_train_features2 = lda_train_tfidf
tfidf_test_features2 = lda_test_tfidf
for m, clf in modelos:
    prediccion, metrica = train_predict_evaluate_model(classifier=clf,
                                           train_features=tfidf_train_features2,
                                           train_labels=y_train,
                                           test_features=tfidf_test_features2,
                                           test_labels=y_test)
    metrica['modelo']=f'{m} LDA tfidf'
    resultados.append(prediccion)
    metricas.append(metrica)

In [177]:
import pandas as pd
metricasDf=pd.DataFrame(metricas)

"""
Suponemos que la mejor métrica para evaluar nuestro modelo es el recall, porque el recall es en realidad el valor de 
los verdaderos positivos que nuestro modelo fue capaz de reconocer sobre todos los positivos. 
Dado que, en nuestro caso, la detección de un artículo falso se considera un caso positivo y se etiqueta como un 1, 
deberíamos basar nuestras conclusiones en el recuerdo
"""

metricasDf=metricasDf.sort_values("Recall", ascending=False)
print(metricasDf)

    Accuracy  Precision  Recall  F1 Score                         modelo
7      0.782      0.782   0.782     0.782         Gauss kernel SVM tfidf
15     0.772      0.773   0.772     0.771     Gauss kernel SVM LSA tfidf
14     0.765      0.765   0.765     0.765           Linear SVM LSA tfidf
6      0.763      0.763   0.763     0.763               Linear SVM tfidf
12     0.757      0.757   0.757     0.757  Logistic Regression LSA tfidf
4      0.752      0.752   0.752     0.752      Logistic Regression tfidf
3      0.748      0.751   0.748     0.748           Gauss kernel SVM bow
0      0.738      0.741   0.738     0.738        Logistic Regression bow
8      0.737      0.738   0.737     0.737    Logistic Regression LSA bow
1      0.737      0.746   0.737     0.736                Naive Bayes bow
5      0.735      0.739   0.735     0.735              Naive Bayes tfidf
11     0.733      0.734   0.733     0.733       Gauss kernel SVM LSA bow
2      0.728      0.728   0.728     0.728          

In [178]:
# Vamos a elegir los dos mejores modelos y vamos a comparar si nuestra función del preprocessado cambiará algo

# Primero, probamos el preprocessado con lematización
X = []
for i in texts:
    X.append(normalize_with_lemma(i))
    
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

In [179]:
model = SVC(gamma='scale', C=2)

train_tfidf=tv.fit_transform(X_train)
test_tfidf=tv.transform(X_test)

lsa_train_tfidf = lsa_pipe_tfidf.fit_transform(X_train)
lsa_test_tfidf= lsa_pipe_tfidf.transform(X_test)

prediccion_lematizado, metrica_lematizado = train_predict_evaluate_model(classifier=model,
                                           train_features=train_tfidf,
                                           train_labels=y_train,
                                           test_features=test_tfidf,
                                           test_labels=y_test)

prediccion_lematizado_lsa, metrica_lematizado_lsa = train_predict_evaluate_model(classifier=model,
                                           train_features=lsa_train_tfidf,
                                           train_labels=y_train,
                                           test_features=lsa_test_tfidf,
                                           test_labels=y_test)

In [180]:
# Ahora, probamos con el preprocessado muy básico

X = []
for i in texts:
    X.append(basic_normalizing(i))
    
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

In [181]:
train_tfidf=tv.fit_transform(X_train)
test_tfidf=tv.transform(X_test)

lsa_train_tfidf = lsa_pipe_tfidf.fit_transform(X_train)
lsa_test_tfidf= lsa_pipe_tfidf.transform(X_test)

prediccion_basico, metrica_basico = train_predict_evaluate_model(classifier=model,
                                           train_features=train_tfidf,
                                           train_labels=y_train,
                                           test_features=test_tfidf,
                                           test_labels=y_test)

prediccion_basico_lsa, metrica_basico_lsa = train_predict_evaluate_model(classifier=model,
                                           train_features=lsa_train_tfidf,
                                           train_labels=y_train,
                                           test_features=lsa_test_tfidf,
                                           test_labels=y_test)

In [182]:
# Como podemos ver, hay pequeña diferencia entre diferentes maneras del preprocessado

metrica_sin_lema = metricasDf[metricasDf['modelo']=='Gauss kernel SVM tfidf'].to_dict('records')[0]
metrica_sin_lema_lsa = metricasDf[metricasDf['modelo']=='Gauss kernel SVM LSA tfidf'].to_dict('records')[0]
preprocessados = ['Sin lematizacion', 'Sin lematizacion LSA','Con lematizacion', 'Con lematizacion LSA','Basico con stopwords y palabras cortas','Basico con stopwords y palabras cortas LSA']
comparacion_prep = pd.DataFrame([metrica_sin_lema,metrica_sin_lema_lsa, metrica_lematizado,metrica_lematizado_lsa, metrica_basico,metrica_basico_lsa]).drop(['modelo'], axis=1)
comparacion_prep['Tipo de preprocessado'] = preprocessados
comparacion_prep.sort_values('Recall', ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,Tipo de preprocessado
0,0.782,0.782,0.782,0.782,Sin lematizacion
4,0.782,0.782,0.782,0.782,Basico con stopwords y palabras cortas
1,0.772,0.773,0.772,0.771,Sin lematizacion LSA
2,0.772,0.774,0.772,0.772,Con lematizacion
5,0.763,0.764,0.763,0.763,Basico con stopwords y palabras cortas LSA
3,0.76,0.77,0.76,0.759,Con lematizacion LSA


In [183]:
#Los resultados realmente no cambiaron. El modelo Gauss Kernel es el mejor en este caso, sin la reducción de dimensionalidad de LSA. Esto se debe posiblemente a que no hay suficientes documentos en el conjunto de datos para hacer una reducción significativa. Sin embargo, se observa que Ambos modelos prefieren un pre procesado sin lematización, ya que SVM es un modelo multidimensional que se beneficia de las diferencias entre los documentos.