In [1]:
import pandas as pd
import itertools
import glob

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from experiments_core import run_one_comb_experiment
from experiments_core import norm_repeated_letters
from experiments_core import stem_tokens
from experiments_core import lemmatizer
from experiments_core import save_excel_comb_results
from experiments_core import RepeatReplacer

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

import xlsxwriter

import re

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import importlib
import sys

# Reload the module(s) that define the experiment helpers so that edits made to
# the .py file are picked up without restarting the kernel. Each distinct
# defining module is reloaded once instead of once per helper.
helpers_to_refresh = (run_one_comb_experiment, norm_repeated_letters, stem_tokens,
                      lemmatizer, save_excel_comb_results)
for module_name in {helper.__module__ for helper in helpers_to_refresh}:
    importlib.reload(sys.modules[module_name])

# importlib.reload() does not rebind names that were copied into this namespace
# with `from ... import ...`; re-import them so the cells below actually call the
# reloaded code rather than the stale function objects.
from experiments_core import run_one_comb_experiment
from experiments_core import norm_repeated_letters
from experiments_core import stem_tokens
from experiments_core import lemmatizer
from experiments_core import save_excel_comb_results
from experiments_core import RepeatReplacer

# Keep the module as the cell's displayed value, as before.
sys.modules[save_excel_comb_results.__module__]

<module 'experiments_core' from '/home/ctorres9/EAFIT/trabajogrado/experiments/experiments_core.py'>

In [3]:
files_path = 'data/satirical_real_news_tweets/'
# Sort the matches: glob.glob() returns files in arbitrary, filesystem-dependent
# order, and the row order of the concatenated frame feeds the seeded train/test
# split, so it has to be stable between runs and machines.
all_files = sorted(glob.glob(files_path + "/*.csv"))
print("Numero de archivos:", len(all_files))
# Every file is read as tab-separated text with a header row.
files_list = [pd.read_csv(file, index_col=None, header=0, sep='\t') for file in all_files]
original_news_df = pd.concat(files_list, ignore_index=True)
# 'Unnamed: 0' is presumably the row index saved with each file -- confirm against the data.
original_news_df = original_news_df.drop(columns=['Unnamed: 0'])
print(original_news_df.describe())
print(original_news_df.head())

Numero de archivos: 16
                                                     text target
count                                               10000  10000
unique                                               9992      2
top     Fernández Díaz nombra comisario honorario de l...   fake
freq                                                    2   5000
                                                text target
0  Las lágrimas de un niño con autismo en un conc...   real
1  Vender carne de mono por ternera https://t.co/...   real
2  #Psicología Si buscas olvido en internet, verá...   real
3  El cantante congoleño Papa Wemba fallece en pl...   real
4  #Lomásvisto Vender carne de mono por ternera h...   real


In [26]:
# Let long tweet texts print in full instead of being truncated in the output.
pd.set_option('display.max_colwidth', 1000)

In [38]:
# Print five example tweets labelled 'fake' as a LaTeX table. The fixed seed makes
# the sampled rows (and therefore the generated table) identical on every run.
print(original_news_df[original_news_df.target == 'fake'].sample(5, random_state=60).to_latex())

\begin{tabular}{lll}
\toprule
{} &                                                                                                                              text & target \\
\midrule
8344 &                    10 frases de Paulo Coelho para convencer al casero de que baje el precio del alquiler: https://t.co/0FhBStsl3M &   fake \\
9727 &                                                             Los 14 memes del 14 de febrero y San Valentín https://t.co/4nR3VvXhMb &   fake \\
4425 &                        Greenpeace dejará de proteger a las ballenas para que aprendan a defenderse solas: https://t.co/uHguRI6NBt &   fake \\
1530 &                         Cientificos descubren malformación genética que hace que a la gente le guste Maná https://t.co/VcyMkz7dIX &   fake \\
8394 &  Trucos para saber si estás viviendo en el típico piso compartido de una “sitcom” https://t.co/d5eH07joNJ https://t.co/GpSxK1jIa4 &   fake \\
\bottomrule
\end{tabular}



In [4]:
# Tweet text and its real/fake label, taken as separate Series for the experiments.
total_data_content = original_news_df['text']
total_data_target = original_news_df['target']

In [5]:
# Hold out 30% of the tweets for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    total_data_content,
    total_data_target,
    random_state=60,
    test_size=0.3,
)

In [6]:
# Report how many real and fake tweets ended up in each side of the split.
print("Reales entrenamiento:", (y_train == 'real').sum())
print("Fakes entrenamiento:", (y_train == 'fake').sum())
print("Reales pruebas:", (y_test == 'real').sum())
print("Fakes pruebas:", (y_test == 'fake').sum())

Reales entrenamiento: 3485
Fakes entrenamiento: 3515
Reales pruebas: 1515
Fakes pruebas: 1485


In [7]:
# Spanish-language resources shared by every preprocessing combination:
# the NLTK stop-word list, the Snowball stemmer passed to stem_tokens, and the
# RepeatReplacer instance passed to norm_repeated_letters.
spanish_stopwords = stopwords.words('spanish')
stemmer = SnowballStemmer('spanish')
replacer = RepeatReplacer()

# Combinaciones de experimentos para tweets

Los siguientes son los resultados de las combinaciones entre los diferentes métodos de limpieza y ponderación en la representación.

In [8]:
# Names of the optional preprocessing steps. The experiment loop tests each
# combination for these exact strings, so they must not be renamed here alone.
pre_processing_tasks = ['stop_words','stemming','lemmatization','urls','norm_letters','pruning10','pruning5']

In [9]:
# Every non-empty subset of the preprocessing steps, from single steps up to all
# of them, stored as tuples in itertools.combinations order.
all_the_tasks = []
for r in range(1, len(pre_processing_tasks) + 1):
    all_the_tasks.extend(itertools.combinations(pre_processing_tasks, r))

In [10]:
# Number of combinations generated before the invalid ones are filtered out.
len(all_the_tasks)

127

In [11]:
# Gather the combinations that pair mutually exclusive steps: stemming together
# with lemmatization, or both pruning thresholds at once.
tasks_to_remove = []
for comb in all_the_tasks:
    if {'stemming', 'lemmatization'}.issubset(comb) or {'pruning10', 'pruning5'}.issubset(comb):
        tasks_to_remove.append(comb)
count = len(tasks_to_remove)
print(count)
print(len(tasks_to_remove))

56
56


In [12]:
# Filter in one pass instead of calling list.remove() once per task: the result
# is the same on the first run, but re-running the cell no longer raises
# ValueError for combinations that were already removed. Slice assignment keeps
# mutating the existing list object, as the original loop did.
excluded_tasks = set(tasks_to_remove)
all_the_tasks[:] = [task for task in all_the_tasks if task not in excluded_tasks]

In [13]:
# Display the combinations that remain after filtering.
all_the_tasks

[('stop_words',),
 ('stemming',),
 ('lemmatization',),
 ('urls',),
 ('norm_letters',),
 ('pruning10',),
 ('pruning5',),
 ('stop_words', 'stemming'),
 ('stop_words', 'lemmatization'),
 ('stop_words', 'urls'),
 ('stop_words', 'norm_letters'),
 ('stop_words', 'pruning10'),
 ('stop_words', 'pruning5'),
 ('stemming', 'urls'),
 ('stemming', 'norm_letters'),
 ('stemming', 'pruning10'),
 ('stemming', 'pruning5'),
 ('lemmatization', 'urls'),
 ('lemmatization', 'norm_letters'),
 ('lemmatization', 'pruning10'),
 ('lemmatization', 'pruning5'),
 ('urls', 'norm_letters'),
 ('urls', 'pruning10'),
 ('urls', 'pruning5'),
 ('norm_letters', 'pruning10'),
 ('norm_letters', 'pruning5'),
 ('stop_words', 'stemming', 'urls'),
 ('stop_words', 'stemming', 'norm_letters'),
 ('stop_words', 'stemming', 'pruning10'),
 ('stop_words', 'stemming', 'pruning5'),
 ('stop_words', 'lemmatization', 'urls'),
 ('stop_words', 'lemmatization', 'norm_letters'),
 ('stop_words', 'lemmatization', 'pruning10'),
 ('stop_words', 'lemm

In [14]:
# Single-step combinations for quick test runs. The trailing commas matter:
# ('stop_words') is just a parenthesised string, whereas ('stop_words',) is a
# one-element tuple like the entries of all_the_tasks.
comb_test = [('stop_words',), ('stemming',), ('lemmatization',), ('urls',),
             ('norm_letters',), ('pruning10',), ('pruning5',)]

In [21]:
# One entry per (classifier, term-weighting) pair evaluated by the experiment loop:
#   clf_name + weighting_type -> joined with '_' to form the worksheet / result label;
#                                weighting_type also selects the vectorizer
#   clf_type                  -> estimator instance handed to run_one_comb_experiment
#   optimal_parameters        -> parameter grid handed to run_one_comb_experiment
#                                (presumably tuned in an earlier experiment -- confirm)
#   random_state              -> seed handed to run_one_comb_experiment
clf_types_weighting_types = [{'clf_name': 'Bayesiano', 'clf_type': MultinomialNB(), 
                              'weighting_type': 'TF', 'optimal_parameters': {'alpha': [1]}, 'random_state': 60},
                             {'clf_name': 'Bayesiano', 'clf_type': MultinomialNB(), 
                              'weighting_type': 'TF-IDF', 'optimal_parameters': {'alpha': [0.1]}, 'random_state': 40},
                             {'clf_name': 'Bayesiano', 'clf_type': MultinomialNB(), 
                              'weighting_type': 'Binario', 'optimal_parameters': {'alpha': [1]}, 'random_state': 60},
                             
                             {'clf_name': 'SVM_radial', 'clf_type': SVC(), 
                              'weighting_type': 'TF', 'optimal_parameters': {'kernel': ['rbf'], 'gamma': [0.001], 'C': [100]}, 'random_state': 90},
                             {'clf_name': 'SVM_radial', 'clf_type': SVC(), 
                              'weighting_type': 'TF-IDF', 'optimal_parameters': {'kernel': ['rbf'], 'gamma': [0.001], 'C': [1000]}, 'random_state': 20},
                             {'clf_name': 'SVM_radial', 'clf_type': SVC(), 
                              'weighting_type': 'Binario', 'optimal_parameters': {'kernel': ['rbf'], 'gamma': [0.001], 'C': [100]}, 'random_state': 90},
                            
                             {'clf_name': 'SVM_lineal', 'clf_type': LinearSVC(), 
                              'weighting_type': 'TF', 'optimal_parameters': {'C': [0.1]}, 'random_state': 90},
                             {'clf_name': 'SVM_lineal', 'clf_type': LinearSVC(), 
                              'weighting_type': 'TF-IDF', 'optimal_parameters': {'C': [100]}, 'random_state': 20},
                             {'clf_name': 'SVM_lineal', 'clf_type': LinearSVC(), 
                              'weighting_type': 'Binario', 'optimal_parameters': {'C': [0.1]}, 'random_state': 100},
                            
                             {'clf_name': 'Reg_Log', 'clf_type': LogisticRegression(), 
                              'weighting_type': 'TF', 'optimal_parameters': {'C': [1000]}, 'random_state': 90},
                             {'clf_name': 'Reg_Log', 'clf_type': LogisticRegression(), 
                              'weighting_type': 'TF-IDF', 'optimal_parameters': {'C': [1000]}, 'random_state': 60},
                             {'clf_name': 'Reg_Log', 'clf_type': LogisticRegression(), 
                              'weighting_type': 'Binario', 'optimal_parameters': {'C': [1000]}, 'random_state': 100}]

In [22]:
# Display the configuration list to check it before the long experiment run.
clf_types_weighting_types

[{'clf_name': 'Bayesiano',
  'clf_type': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'weighting_type': 'TF',
  'optimal_parameters': {'alpha': [1]},
  'random_state': 60},
 {'clf_name': 'Bayesiano',
  'clf_type': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'weighting_type': 'TF-IDF',
  'optimal_parameters': {'alpha': [0.1]},
  'random_state': 40},
 {'clf_name': 'Bayesiano',
  'clf_type': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'weighting_type': 'Binario',
  'optimal_parameters': {'alpha': [1]},
  'random_state': 60},
 {'clf_name': 'SVM_radial',
  'clf_type': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False),
  'weighting_type': 'TF',
  'optimal_parameters': {'kernel': ['rbf'], 'gamma': [0.001], 'C': [100]},
  'random_state': 90},
 {'clf_name': 'SVM_ra

In [23]:
vectorizers_list = []
results_list = []

# Patterns are identical for every combination, so build them once. Raw strings
# give exactly the same pattern text while avoiding the invalid "\(" escape.
url_pattern = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    flags=re.MULTILINE)
regexp_tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')

# Vectorizer class and constructor arguments for each supported weighting scheme.
vectorizer_specs = {
    'TF': (CountVectorizer, {}),
    'TF-IDF': (TfidfVectorizer, {}),
    'Binario': (CountVectorizer, {'binary': True}),
}

workbook = xlsxwriter.Workbook('result_combinations_news_fakes.xlsx')
try:
    # One worksheet per (classifier, weighting) pair, one experiment per combination.
    for clf_type_weighting_type in clf_types_weighting_types:
        weighting_type = clf_type_weighting_type['weighting_type']
        if weighting_type not in vectorizer_specs:
            # Fail loudly instead of silently reusing the previous iteration's vectorizer.
            raise ValueError('Unknown weighting_type: ' + repr(weighting_type))
        vectorizer_class, vectorizer_kwargs = vectorizer_specs[weighting_type]

        ws_name = clf_type_weighting_type['clf_name'] + '_' + weighting_type
        worksheet = workbook.add_worksheet(ws_name)
        clf_type = clf_type_weighting_type['clf_type']
        optimal_parameters = clf_type_weighting_type['optimal_parameters']
        random_state = clf_type_weighting_type['random_state']

        init_row = 0
        for comb in all_the_tasks:

            # `comb=comb` freezes the current combination in the function. Without
            # it the tokenizer would look up the global `comb` when called, so the
            # vectorizers kept in vectorizers_list would all tokenize with
            # whatever combination was processed last.
            def tokenize_combinations(text, comb=comb):
                """Tokenize `text`, applying the token-level steps named in `comb`.

                URLs are stripped first when 'urls' is requested; the text is then
                split into word tokens of two or more characters, and repeated-letter
                normalisation, stemming and lemmatization are applied in that order
                when requested. Returns the resulting tokens.
                """
                if 'urls' in comb:
                    text = url_pattern.sub('', text)

                tokens = regexp_tokenizer.tokenize(text)

                if 'norm_letters' in comb:
                    tokens = norm_repeated_letters(tokens, replacer)

                if 'stemming' in comb:
                    tokens = stem_tokens(tokens, stemmer)

                if 'lemmatization' in comb:
                    tokens = lemmatizer(tokens)

                return tokens

            vectorizer = vectorizer_class(**vectorizer_kwargs)
            vectorizer.set_params(tokenizer=tokenize_combinations)

            if 'stop_words' in comb:
                vectorizer.set_params(stop_words=spanish_stopwords)

            # Pruning drops terms that appear in fewer than N documents.
            if 'pruning10' in comb:
                vectorizer.set_params(min_df=10)

            if 'pruning5' in comb:
                vectorizer.set_params(min_df=5)

            print(ws_name + str(comb))
            exp_results = run_one_comb_experiment(total_data_content, total_data_target, vectorizer,
                                                  optimal_parameters, clf_type, random_state)
            # save_excel_comb_results returns the row at which the next result should start.
            init_row = save_excel_comb_results(worksheet, init_row, exp_results,
                                               str(optimal_parameters), random_state, str(comb))
            vectorizers_list.append(vectorizer)
            results_list.append(exp_results + (ws_name, str(comb)))
finally:
    # xlsxwriter only writes the file on close(); closing here also keeps the
    # results gathered so far if a long run fails or is interrupted.
    workbook.close()

Bayesiano_TF('stop_words',)
train time: 9.904s
Bayesiano_TF('stemming',)
train time: 9.909s
Bayesiano_TF('lemmatization',)
train time: 10.341s
Bayesiano_TF('urls',)
train time: 9.970s
Bayesiano_TF('norm_letters',)
train time: 10.010s
Bayesiano_TF('pruning10',)
train time: 9.707s
Bayesiano_TF('pruning5',)
train time: 9.683s
Bayesiano_TF('stop_words', 'stemming')
train time: 9.978s
Bayesiano_TF('stop_words', 'lemmatization')
train time: 10.086s
Bayesiano_TF('stop_words', 'urls')
train time: 10.048s
Bayesiano_TF('stop_words', 'norm_letters')
train time: 9.861s
Bayesiano_TF('stop_words', 'pruning10')
train time: 10.036s
Bayesiano_TF('stop_words', 'pruning5')
train time: 10.276s
Bayesiano_TF('norm_letters', 'pruning10')
train time: 10.080s
Bayesiano_TF('norm_letters', 'pruning5')
train time: 10.241s
Bayesiano_TF('stop_words', 'stemming', 'urls')
train time: 10.438s
Bayesiano_TF('stop_words', 'stemming', 'norm_letters')
train time: 10.274s
Bayesiano_TF('stop_words', 'stemming', 'pruning10')


train time: 14.514s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'urls', 'norm_letters')
train time: 14.659s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'urls', 'pruning10')
train time: 14.787s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'urls', 'pruning5')
train time: 14.977s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'norm_letters', 'pruning10')
train time: 14.841s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'norm_letters', 'pruning5')
train time: 14.889s
Bayesiano_TF-IDF('stop_words', 'urls', 'norm_letters', 'pruning10')
train time: 14.902s
Bayesiano_TF-IDF('stop_words', 'urls', 'norm_letters', 'pruning5')
train time: 15.014s
Bayesiano_TF-IDF('stemming', 'urls', 'norm_letters', 'pruning10')
train time: 15.170s
Bayesiano_TF-IDF('stemming', 'urls', 'norm_letters', 'pruning5')
train time: 15.371s
Bayesiano_TF-IDF('lemmatization', 'urls', 'norm_letters', 'pruning10')
train time: 15.195s
Bayesiano_TF-IDF('lemmatization', 'urls', 'norm_letters', 'pruning5')
train time:

SVM_radial_TF('norm_letters', 'pruning5')
train time: 28.220s
SVM_radial_TF('stop_words', 'stemming', 'urls')
train time: 29.726s
SVM_radial_TF('stop_words', 'stemming', 'norm_letters')
train time: 29.592s
SVM_radial_TF('stop_words', 'stemming', 'pruning10')
train time: 26.328s
SVM_radial_TF('stop_words', 'stemming', 'pruning5')
train time: 27.206s
SVM_radial_TF('stop_words', 'lemmatization', 'urls')
train time: 29.986s
SVM_radial_TF('stop_words', 'lemmatization', 'norm_letters')
train time: 30.104s
SVM_radial_TF('stop_words', 'lemmatization', 'pruning10')
train time: 26.301s
SVM_radial_TF('stop_words', 'lemmatization', 'pruning5')
train time: 26.837s
SVM_radial_TF('stop_words', 'urls', 'norm_letters')
train time: 29.977s
SVM_radial_TF('stop_words', 'urls', 'pruning10')
train time: 26.240s
SVM_radial_TF('stop_words', 'urls', 'pruning5')
train time: 27.191s
SVM_radial_TF('stop_words', 'norm_letters', 'pruning10')
train time: 26.081s
SVM_radial_TF('stop_words', 'norm_letters', 'pruning5'

train time: 42.629s
Reg_Log_TF-IDF('lemmatization', 'urls', 'pruning10')
train time: 49.051s
Reg_Log_TF-IDF('lemmatization', 'urls', 'pruning5')
train time: 49.745s
Reg_Log_TF-IDF('lemmatization', 'norm_letters', 'pruning10')
train time: 44.329s
Reg_Log_TF-IDF('lemmatization', 'norm_letters', 'pruning5')
train time: 43.340s
Reg_Log_TF-IDF('urls', 'norm_letters', 'pruning10')
train time: 51.892s
Reg_Log_TF-IDF('urls', 'norm_letters', 'pruning5')
train time: 49.792s
Reg_Log_TF-IDF('stop_words', 'stemming', 'urls', 'norm_letters')
train time: 43.701s
Reg_Log_TF-IDF('stop_words', 'stemming', 'urls', 'pruning10')
train time: 43.897s
Reg_Log_TF-IDF('stop_words', 'stemming', 'urls', 'pruning5')
train time: 46.197s
Reg_Log_TF-IDF('stop_words', 'stemming', 'norm_letters', 'pruning10')
train time: 49.782s
Reg_Log_TF-IDF('stop_words', 'stemming', 'norm_letters', 'pruning5')
train time: 45.054s
Reg_Log_TF-IDF('stop_words', 'lemmatization', 'urls', 'norm_letters')
train time: 44.992s
Reg_Log_TF-IDF

In [24]:
# Result tuples compare element by element, so this picks the highest value in
# position 0 (the column later named 'accuracy'); ties fall through to the
# following elements of the tuple.
max(results_list)

(0.8926666666666667,
 23179,
 0.8924709159429788,
 0.8945289958882208,
 0.8923292329232924,
 1403,
 1275,
 210,
 112,
 'Bayesiano_TF',
 "('stop_words',)")

In [39]:
# Total number of experiments run (one result tuple per classifier/weighting pair and combination).
print(len(results_list))

852


In [40]:
# Take a real copy: a plain assignment would only alias results_list, so any later
# change to that list would also change the "backup". A shallow copy is enough
# because the entries are tuples.
results_list_backup = list(results_list)

In [43]:
# One row per experiment; columns are still the positional indices 0-10 of the result tuples.
results_df = pd.DataFrame(results_list)

In [49]:
# Preview the first rows of the raw results table.
results_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.892667,23179,0.892471,0.894529,0.892329,1403,1275,210,112,Bayesiano_TF,"('stop_words',)"
1,0.879,17645,0.87884,0.88018,0.878725,1373,1264,221,142,Bayesiano_TF,"('stemming',)"
2,0.877333,18669,0.877246,0.877826,0.877154,1356,1276,209,159,Bayesiano_TF,"('lemmatization',)"
3,0.856333,15636,0.856189,0.857082,0.856102,1332,1237,248,183,Bayesiano_TF,"('urls',)"
4,0.891,23296,0.890845,0.892348,0.890712,1393,1280,205,122,Bayesiano_TF,"('norm_letters',)"


In [61]:
# Move the classifier/weighting label (9) and the combination string (10), which
# were appended at the end of each result tuple, to the front, followed by
# position 1 and then position 0. This cell selects by the numeric labels, so it
# must run before the columns are renamed.
results_df = results_df[[9, 10, 1, 0, 2, 3, 4, 5, 6, 7, 8]]

In [62]:
# Display the reordered results table.
results_df

Unnamed: 0,9,10,1,0,2,3,4,5,6,7,8
0,Bayesiano_TF,"('stop_words',)",23179,0.892667,0.892471,0.894529,0.892329,1403,1275,210,112
1,Bayesiano_TF,"('stemming',)",17645,0.879000,0.878840,0.880180,0.878725,1373,1264,221,142
2,Bayesiano_TF,"('lemmatization',)",18669,0.877333,0.877246,0.877826,0.877154,1356,1276,209,159
3,Bayesiano_TF,"('urls',)",15636,0.856333,0.856189,0.857082,0.856102,1332,1237,248,183
4,Bayesiano_TF,"('norm_letters',)",23296,0.891000,0.890845,0.892348,0.890712,1393,1280,205,122
5,Bayesiano_TF,"('pruning10',)",1226,0.847333,0.846951,0.849716,0.846918,1346,1196,289,169
6,Bayesiano_TF,"('pruning5',)",2611,0.863333,0.863169,0.864319,0.863073,1347,1243,242,168
7,Bayesiano_TF,"('stop_words', 'stemming')",17565,0.873667,0.873500,0.874825,0.873391,1365,1256,229,150
8,Bayesiano_TF,"('stop_words', 'lemmatization')",18607,0.882000,0.881869,0.882946,0.881755,1373,1273,212,142
9,Bayesiano_TF,"('stop_words', 'urls')",15446,0.851667,0.851564,0.852094,0.851488,1317,1238,247,198


In [63]:
# Give the columns descriptive names; the order must match the reordered positional columns.
results_df.columns = ['clf_name','comb','features_number','accuracy','f1_score','precision','recall','true_positives','true_negatives','false_positives','false_negatives']

In [64]:
# Display the results table with its named columns.
results_df

Unnamed: 0,clf_name,comb,features_number,accuracy,f1_score,precision,recall,true_positives,true_negatives,false_positives,false_negatives
0,Bayesiano_TF,"('stop_words',)",23179,0.892667,0.892471,0.894529,0.892329,1403,1275,210,112
1,Bayesiano_TF,"('stemming',)",17645,0.879000,0.878840,0.880180,0.878725,1373,1264,221,142
2,Bayesiano_TF,"('lemmatization',)",18669,0.877333,0.877246,0.877826,0.877154,1356,1276,209,159
3,Bayesiano_TF,"('urls',)",15636,0.856333,0.856189,0.857082,0.856102,1332,1237,248,183
4,Bayesiano_TF,"('norm_letters',)",23296,0.891000,0.890845,0.892348,0.890712,1393,1280,205,122
5,Bayesiano_TF,"('pruning10',)",1226,0.847333,0.846951,0.849716,0.846918,1346,1196,289,169
6,Bayesiano_TF,"('pruning5',)",2611,0.863333,0.863169,0.864319,0.863073,1347,1243,242,168
7,Bayesiano_TF,"('stop_words', 'stemming')",17565,0.873667,0.873500,0.874825,0.873391,1365,1256,229,150
8,Bayesiano_TF,"('stop_words', 'lemmatization')",18607,0.882000,0.881869,0.882946,0.881755,1373,1273,212,142
9,Bayesiano_TF,"('stop_words', 'urls')",15446,0.851667,0.851564,0.852094,0.851488,1317,1238,247,198


In [65]:
# Persist all results. Note the file is tab-separated despite the .csv extension,
# and the DataFrame index is written as the first column.
results_df.to_csv('result_news_fakes_combinations.csv', sep='\t')

In [118]:
# Keep only the rows whose classifier label contains 'Reg_Log' (all three weightings).
results_tweets_df = results_df[results_df['clf_name'].str.contains('Reg_Log')]

In [119]:
# Number of rows left after the classifier filter.
len(results_tweets_df)

213

In [129]:
# The five filtered rows with the lowest accuracy, reduced to the columns used in the report table.
filter_result = results_tweets_df.nsmallest(5, columns='accuracy')[['clf_name','comb','features_number','accuracy','f1_score']]

In [130]:
# Display the selected rows.
filter_result

Unnamed: 0,clf_name,comb,features_number,accuracy,f1_score
839,Reg_Log_Binario,"('stop_words', 'lemmatization', 'urls', 'pruning5')",2327,0.748333,0.748319
827,Reg_Log_Binario,"('lemmatization', 'urls', 'pruning5')",2383,0.753,0.752988
851,Reg_Log_Binario,"('stop_words', 'lemmatization', 'urls', 'norm_letters', 'pruning5')",2316,0.755,0.754994
843,Reg_Log_Binario,"('stop_words', 'urls', 'norm_letters', 'pruning5')",2454,0.756,0.755982
817,Reg_Log_Binario,"('stop_words', 'urls', 'pruning5')",2457,0.757667,0.757657


In [132]:
# Replace the column names with the short Spanish headers used in the LaTeX table.
filter_result.columns = ['Clf_Pon', 'Combinación', '# Car', 'Exact', 'V-F1']

In [87]:
# Keep long combination strings from being truncated when the table is rendered.
pd.set_option('display.max_colwidth', 1000)

In [133]:
# Emit the table as LaTeX: values rounded to 4 decimals, comma as the decimal
# separator, no index column, and an explicit column layout.
print(filter_result.round(4).to_latex(index=False, column_format='|l|m{16em}|r|r|r|', decimal=','))

\begin{tabular}{|l|m{16em}|r|r|r|}
\toprule
         Clf\_Pon &                                                          Combinación &  \# Car &   Exact &    V-F1 \\
\midrule
 Reg\_Log\_Binario &                  ('stop\_words', 'lemmatization', 'urls', 'pruning5') &   2327 &  0,7483 &  0,7483 \\
 Reg\_Log\_Binario &                                ('lemmatization', 'urls', 'pruning5') &   2383 &  0,7530 &  0,7530 \\
 Reg\_Log\_Binario &  ('stop\_words', 'lemmatization', 'urls', 'norm\_letters', 'pruning5') &   2316 &  0,7550 &  0,7550 \\
 Reg\_Log\_Binario &                   ('stop\_words', 'urls', 'norm\_letters', 'pruning5') &   2454 &  0,7560 &  0,7560 \\
 Reg\_Log\_Binario &                                   ('stop\_words', 'urls', 'pruning5') &   2457 &  0,7577 &  0,7577 \\
\bottomrule
\end{tabular}

