In [31]:
import pandas as pd
import itertools

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from experiments_core import run_one_comb_experiment
from experiments_core import norm_repeated_letters
from experiments_core import stem_tokens
from experiments_core import lemmatizer
from experiments_core import save_excel_comb_results
from experiments_core import RepeatReplacer

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

import xlsxwriter

import re

import matplotlib.pyplot as plt
%matplotlib inline

In [33]:
import importlib
import sys

import experiments_core

# Every helper used by this notebook is imported from ``experiments_core``, so
# one reload is enough (the previous version reloaded the same module once per
# helper).
importlib.reload(experiments_core)

# ``importlib.reload`` refreshes the module object only. Names that were bound
# with ``from experiments_core import ...`` keep pointing at the objects of the
# old module version, so they are re-imported here to actually pick up edits.
# NOTE(review): objects already instantiated from the old code (for example a
# ``RepeatReplacer`` instance) are not upgraded and must be re-created.
from experiments_core import run_one_comb_experiment
from experiments_core import norm_repeated_letters
from experiments_core import stem_tokens
from experiments_core import lemmatizer
from experiments_core import save_excel_comb_results
from experiments_core import RepeatReplacer

<module 'experiments_core' from '/home/ctorres9/EAFIT/trabajogrado/experiments/experiments_core.py'>

In [3]:
# Read the tab-separated tweet corpus (first row is the header, no index
# column) and print a statistical summary followed by the first rows.
file_path = 'data/TweetsPolaridadSEPLN.csv'
original_tweets_df = pd.read_csv(file_path, sep='\t', header=0, index_col=None)
for overview in (original_tweets_df.describe(), original_tweets_df.head()):
    print(overview)

                      tweetid          user               content  \
count                   60798         60798                 60798   
unique                  60798           158                 60405   
top      148488915765047296,0  mariviromero  Buenos días a todos!   
freq                        1          7138                   111   

                       date   lang polarity  topic  
count                 60798  60798    60798  60798  
unique                60510      1        6     10  
top     2012-01-01T00:00:16     es     NONE  otros  
freq                      4  60798    21416  28189  
                 tweetid            user  \
0   142378325086715904,0     jesusmarana   
1   142379080808013824,0       EvaORegan   
2   142379173120442368,0  LosadaPescador   
3   142379815708803072,0    mgilguerrero   
4   142381190123499520,0  pedroj_ramirez   

                                             content                 date  \
0  Portada 'Público', viernes. Fabra al banquil

In [4]:
# Keep only the text and label of tweets with an explicit polarity. The four
# original classes (N+, N, P+, P) are selected separately so that the row order
# of the merged frame stays: N+, N, P+, P.
strong_negative_tweets = original_tweets_df.loc[original_tweets_df['polarity'] == 'N+', ['content', 'polarity']]
standar_negative_tweets = original_tweets_df.loc[original_tweets_df['polarity'] == 'N', ['content', 'polarity']]

strong_positive_tweets = original_tweets_df.loc[original_tweets_df['polarity'] == 'P+', ['content', 'polarity']]
standar_positive_tweets = original_tweets_df.loc[original_tweets_df['polarity'] == 'P', ['content', 'polarity']]

# Collapse strong/standard variants into a single binary label per side.
negative_tweets = pd.concat(
    [strong_negative_tweets, standar_negative_tweets], ignore_index=True
).assign(polarity="negative")
positive_tweets = pd.concat(
    [strong_positive_tweets, standar_positive_tweets], ignore_index=True
).assign(polarity="positive")

# Negative rows first, then positive rows, with a fresh 0..n-1 index.
total_tweets = pd.concat([negative_tweets, positive_tweets], ignore_index=True)
print(total_tweets.describe())
print(total_tweets.head())

                                                  content  polarity
count                                               38077     38077
unique                                              37926         2
top     ¡Noticias descombacantes! está disponible! htt...  positive
freq                                                   63     22233
                                             content  polarity
0  Dado q la deuda privada es superior a la publi...  negative
1  TEPCO inyecta nitrógeno en los reactores de Fu...  negative
2  “@Declaracion: «Cualquier injusticia contra un...  negative
3  ¡Qué estrés!, la presidenta de la diputación d...  negative
4  Hoy entrego mi credencial en el Congreso. Una ...  negative


In [5]:
# Input texts and their binary polarity labels, taken from the merged frame.
total_data_content = total_tweets['content']
total_data_target = total_tweets['polarity']

In [6]:
# Hold out 30% of the tweets as a test set, with a fixed seed for repeatability.
# NOTE(review): the experiment loop further down passes the full
# total_data_content / total_data_target to run_one_comb_experiment, so this
# split is only used by the class-count report in the next cell.
X_train, X_test, y_train, y_test = train_test_split(total_data_content, 
                                                    total_data_target, 
                                                    test_size=0.3, 
                                                    random_state=80)

In [7]:
# Report how many tweets of each class ended up in the train and test splits.
class_balance_rows = (
    ("Positivos entrenamiento:", y_train, 'positive'),
    ("Negativos entrenamiento:", y_train, 'negative'),
    ("Positivos pruebas:", y_test, 'positive'),
    ("Negativos pruebas:", y_test, 'negative'),
)
for caption, labels, wanted in class_balance_rows:
    print(caption, int((labels == wanted).sum()))

Positivos entrenamiento: 15552
Negativos entrenamiento: 11101
Positivos pruebas: 6681
Negativos pruebas: 4743


In [22]:
# Shared language resources used by the tokenizer/vectorizer set-up below:
# the NLTK Spanish stop-word list, a Snowball stemmer for Spanish, and the
# project's RepeatReplacer (passed to norm_repeated_letters).
spanish_stopwords = stopwords.words('spanish')
stemmer = SnowballStemmer('spanish')
replacer = RepeatReplacer()

# Combinaciones de experimentos para tweets

Los siguientes son los resultados de las combinaciones entre los diferentes métodos de limpieza y ponderación en la representación.

In [9]:
# Names of the optional pre-processing steps that are combined in the
# experiments. As used by the experiment loop below:
#   stop_words    -> the vectorizer receives the Spanish stop-word list
#   stemming      -> tokens go through stem_tokens with the Snowball stemmer
#   lemmatization -> tokens go through lemmatizer
#   urls          -> URLs are stripped from the text before tokenizing
#   norm_letters  -> tokens go through norm_repeated_letters
#   pruning10 / pruning5 -> the vectorizer uses min_df=10 / min_df=5
pre_processing_tasks = ['stop_words','stemming','lemmatization','urls','norm_letters','pruning10','pruning5']

In [10]:
# Every non-empty subset of the pre-processing tasks, ordered by subset size
# and, within a size, in the order itertools.combinations yields them.
all_the_tasks = list(itertools.chain.from_iterable(
    itertools.combinations(pre_processing_tasks, subset_size)
    for subset_size in range(1, len(pre_processing_tasks) + 1)
))

In [11]:
# Number of generated combinations before any filtering (2**7 - 1 for 7 tasks).
len(all_the_tasks)

127

In [12]:
# Pairs of tasks that are never applied together: stemming vs. lemmatization
# and the two pruning thresholds.
conflicting_pairs = (('stemming', 'lemmatization'), ('pruning10', 'pruning5'))

# Collect every combination that contains both members of at least one pair.
tasks_to_remove = [
    candidate for candidate in all_the_tasks
    if any(first in candidate and second in candidate
           for first, second in conflicting_pairs)
]
count = len(tasks_to_remove)
print(count)
print(len(tasks_to_remove))

56
56


In [13]:
# Drop the conflicting combinations by rebuilding the list. Unlike calling
# list.remove() once per item, this is linear and idempotent: re-running the
# cell no longer raises ValueError because the items were already removed.
excluded_tasks = set(tasks_to_remove)
all_the_tasks = [task for task in all_the_tasks if task not in excluded_tasks]

In [24]:
# Number of combinations left after removing the conflicting ones.
len(all_the_tasks)

71

In [15]:
# Reduced set of single-task combinations for quick trial runs. Each entry is
# a 1-tuple (note the trailing comma) so it has the same shape as the items of
# all_the_tasks; without the comma ('stop_words') is just a string and
# `'x' in comb` would silently become a substring test.
comb_test = [('stop_words',), ('stemming',), ('lemmatization',), ('urls',),
             ('norm_letters',), ('pruning10',), ('pruning5',)]

In [39]:
def _experiment_config(clf_name, clf_type, weighting_type, optimal_parameters, random_state):
    """Bundle one classifier/weighting experiment configuration into a dict."""
    return {'clf_name': clf_name,
            'clf_type': clf_type,
            'weighting_type': weighting_type,
            'optimal_parameters': optimal_parameters,
            'random_state': random_state}


# One entry per (classifier, term-weighting) pair, each with the parameter grid
# and seed handed to run_one_comb_experiment.
# The RBF-kernel SVC ('SVM_radial') configurations are currently disabled:
#   TF:      {'kernel': ['rbf'], 'gamma': [0.001], 'C': [100]},  random_state 30
#   TF-IDF:  {'kernel': ['rbf'], 'gamma': [0.001], 'C': [1000]}, random_state 10
#   Binario: {'kernel': ['rbf'], 'gamma': [0.001], 'C': [100]},  random_state 30
clf_types_weighting_types = [
    _experiment_config('Bayesiano', MultinomialNB(), 'TF', {'alpha': [3]}, 40),
    _experiment_config('Bayesiano', MultinomialNB(), 'TF-IDF', {'alpha': [1]}, 30),
    _experiment_config('Bayesiano', MultinomialNB(), 'Binario', {'alpha': [3]}, 40),

    _experiment_config('SVM_lineal', LinearSVC(), 'TF', {'C': [0.1]}, 40),
    _experiment_config('SVM_lineal', LinearSVC(), 'TF-IDF', {'C': [1]}, 30),
    _experiment_config('SVM_lineal', LinearSVC(), 'Binario', {'C': [0.1]}, 40),

    _experiment_config('Reg_Log', LogisticRegression(), 'TF', {'C': [1]}, 40),
    _experiment_config('Reg_Log', LogisticRegression(), 'TF-IDF', {'C': [10]}, 40),
    _experiment_config('Reg_Log', LogisticRegression(), 'Binario', {'C': [1]}, 40),
]

In [40]:
# Display the configured experiments for a visual check.
clf_types_weighting_types

[{'clf_name': 'Bayesiano',
  'clf_type': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'weighting_type': 'TF',
  'optimal_parameters': {'alpha': [3]},
  'random_state': 40},
 {'clf_name': 'Bayesiano',
  'clf_type': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'weighting_type': 'TF-IDF',
  'optimal_parameters': {'alpha': [1]},
  'random_state': 30},
 {'clf_name': 'Bayesiano',
  'clf_type': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
  'weighting_type': 'Binario',
  'optimal_parameters': {'alpha': [3]},
  'random_state': 40},
 {'clf_name': 'SVM_lineal',
  'clf_type': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
       intercept_scaling=1, loss='squared_hinge', max_iter=1000,
       multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
       verbose=0),
  'weighting_type': 'TF',
  'optimal_parameters': {'C': [0.1]},
  'random_state': 40},
 {'clf_name': 'SVM_lineal',
  'clf_type': LinearSVC(C=1.0, class_weigh

In [23]:
# URL pattern removed by the 'urls' task. It is a raw string so the backslash
# escapes reach the regex engine without invalid-escape warnings (the pattern
# text is unchanged), and it is compiled once instead of once per tweet.
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    flags=re.MULTILINE)

# Vectorizer class and constructor arguments for each term-weighting scheme.
VECTORIZER_SPECS = {
    'TF': (CountVectorizer, {}),
    'TF-IDF': (TfidfVectorizer, {}),
    'Binario': (CountVectorizer, {'binary': True}),
}


def make_tokenizer(comb):
    """Return a tokenizer that applies the text-level tasks listed in ``comb``.

    The active tasks are resolved here, when the tokenizer is created, and not
    looked up through the loop variable at call time. Each vectorizer stored in
    ``vectorizers_list`` therefore keeps the behaviour of its own combination
    even after the experiment loop has moved on.

    Parameters
    ----------
    comb : tuple of str
        Task names; 'urls', 'norm_letters', 'stemming' and 'lemmatization' are
        the ones that affect tokenization.

    Returns
    -------
    callable
        Function mapping a text to its list of tokens.
    """
    strip_urls = 'urls' in comb
    normalize_letters = 'norm_letters' in comb
    apply_stemming = 'stemming' in comb
    apply_lemmatization = 'lemmatization' in comb
    # Words made of two or more word characters; built once per combination.
    regexp_tokenizer = RegexpTokenizer(u'(?u)\\b\\w\\w+\\b')

    def tokenize_combinations(text):
        if strip_urls:
            text = URL_PATTERN.sub('', text)

        tokens = regexp_tokenizer.tokenize(text)

        if normalize_letters:
            tokens = norm_repeated_letters(tokens, replacer)

        if apply_stemming:
            tokens = stem_tokens(tokens, stemmer)

        if apply_lemmatization:
            tokens = lemmatizer(tokens)

        return tokens

    return tokenize_combinations


def build_vectorizer(weighting_type, comb):
    """Create the vectorizer for one weighting scheme and task combination.

    Parameters
    ----------
    weighting_type : str
        One of the keys of ``VECTORIZER_SPECS`` ('TF', 'TF-IDF', 'Binario').
    comb : tuple of str
        Task names; 'stop_words', 'pruning10' and 'pruning5' configure the
        vectorizer itself, the rest are handled by the tokenizer.

    Raises
    ------
    ValueError
        If ``weighting_type`` is unknown. The previous chain of independent
        ``if`` statements silently reused the vectorizer of the prior
        iteration in that case.
    """
    try:
        vectorizer_class, vectorizer_kwargs = VECTORIZER_SPECS[weighting_type]
    except KeyError:
        raise ValueError('Unknown weighting type: %r' % (weighting_type,))

    vectorizer = vectorizer_class(**vectorizer_kwargs)
    vectorizer.set_params(tokenizer=make_tokenizer(comb))

    # NOTE(review): the stop-word list is unstemmed, so with 'stemming' or
    # 'lemmatization' active it may not match the transformed tokens; confirm
    # this is the intended interaction.
    if 'stop_words' in comb:
        vectorizer.set_params(stop_words=spanish_stopwords)

    if 'pruning10' in comb:
        vectorizer.set_params(min_df=10)

    if 'pruning5' in comb:
        vectorizer.set_params(min_df=5)

    return vectorizer


# Run every (classifier, weighting) x (task combination) experiment. Results
# go to one worksheet per classifier/weighting pair and are also kept in
# memory for the analysis cells below.
vectorizers_list = []
results_list = []
workbook = xlsxwriter.Workbook('result_combinations_tweets.xlsx')
try:
    for clf_type_weighting_type in clf_types_weighting_types:
        ws_name = clf_type_weighting_type['clf_name'] + '_' + clf_type_weighting_type['weighting_type']
        worksheet = workbook.add_worksheet(ws_name)
        clf_type = clf_type_weighting_type['clf_type']
        optimal_parameters = clf_type_weighting_type['optimal_parameters']
        random_state = clf_type_weighting_type['random_state']

        init_row = 0
        for comb in all_the_tasks:
            vectorizer = build_vectorizer(clf_type_weighting_type['weighting_type'], comb)

            print(ws_name + str(comb))
            # exp_results is a tuple of metrics; see run_one_comb_experiment
            # in experiments_core for the exact fields.
            exp_results = run_one_comb_experiment(total_data_content, total_data_target, vectorizer,
                                                  optimal_parameters, clf_type, random_state)
            # The helper returns the row at which the next result block starts.
            init_row = save_excel_comb_results(worksheet, init_row, exp_results,
                                               str(optimal_parameters), random_state, str(comb))
            vectorizers_list.append(vectorizer)
            results_list.append(exp_results + (ws_name, str(comb)))
finally:
    # Always close the workbook so results written so far are saved even when
    # one of the (long-running) experiments fails.
    workbook.close()

Bayesiano_TF('stop_words',)
train time: 6.414s
Bayesiano_TF('stemming',)
train time: 6.520s
Bayesiano_TF('lemmatization',)
train time: 6.475s
Bayesiano_TF('urls',)
train time: 6.537s
Bayesiano_TF('norm_letters',)
train time: 8.871s
Bayesiano_TF('pruning10',)
train time: 9.168s
Bayesiano_TF('pruning5',)
train time: 9.516s
Bayesiano_TF('stop_words', 'stemming')
train time: 9.502s
Bayesiano_TF('stop_words', 'lemmatization')
train time: 9.318s
Bayesiano_TF('stop_words', 'urls')
train time: 10.280s
Bayesiano_TF('stop_words', 'norm_letters')
train time: 9.794s
Bayesiano_TF('stop_words', 'pruning10')
train time: 10.011s
Bayesiano_TF('stop_words', 'pruning5')
train time: 10.194s
Bayesiano_TF('stemming', 'urls')
train time: 10.242s
Bayesiano_TF('stemming', 'norm_letters')
train time: 10.025s
Bayesiano_TF('stemming', 'pruning10')
train time: 10.615s
Bayesiano_TF('stemming', 'pruning5')
train time: 10.274s
Bayesiano_TF('lemmatization', 'urls')
train time: 10.887s
Bayesiano_TF('lemmatization', 'no

train time: 20.884s
Bayesiano_TF-IDF('stop_words', 'stemming', 'urls', 'pruning5')
train time: 20.143s
Bayesiano_TF-IDF('stop_words', 'stemming', 'norm_letters', 'pruning10')
train time: 19.966s
Bayesiano_TF-IDF('stop_words', 'stemming', 'norm_letters', 'pruning5')
train time: 19.750s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'urls', 'norm_letters')
train time: 21.121s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'urls', 'pruning10')
train time: 20.862s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'urls', 'pruning5')
train time: 21.118s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'norm_letters', 'pruning10')
train time: 20.863s
Bayesiano_TF-IDF('stop_words', 'lemmatization', 'norm_letters', 'pruning5')
train time: 21.965s
Bayesiano_TF-IDF('stop_words', 'urls', 'norm_letters', 'pruning10')
train time: 20.532s
Bayesiano_TF-IDF('stop_words', 'urls', 'norm_letters', 'pruning5')
train time: 21.388s
Bayesiano_TF-IDF('stemming', 'urls', 'norm_letters', 'pruning10')
train time:

train time: 30.743s
SVM_lineal_TF('urls', 'norm_letters')
train time: 30.415s
SVM_lineal_TF('urls', 'pruning10')
train time: 29.312s
SVM_lineal_TF('urls', 'pruning5')
train time: 29.790s
SVM_lineal_TF('norm_letters', 'pruning10')
train time: 31.789s
SVM_lineal_TF('norm_letters', 'pruning5')
train time: 31.625s
SVM_lineal_TF('stop_words', 'stemming', 'urls')
train time: 30.761s
SVM_lineal_TF('stop_words', 'stemming', 'norm_letters')
train time: 30.692s
SVM_lineal_TF('stop_words', 'stemming', 'pruning10')
train time: 32.271s
SVM_lineal_TF('stop_words', 'stemming', 'pruning5')
train time: 31.541s
SVM_lineal_TF('stop_words', 'lemmatization', 'urls')
train time: 30.278s
SVM_lineal_TF('stop_words', 'lemmatization', 'norm_letters')
train time: 31.083s
SVM_lineal_TF('stop_words', 'lemmatization', 'pruning10')
train time: 31.731s
SVM_lineal_TF('stop_words', 'lemmatization', 'pruning5')
train time: 31.051s
SVM_lineal_TF('stop_words', 'urls', 'norm_letters')
train time: 31.573s
SVM_lineal_TF('sto

train time: 39.937s
SVM_lineal_TF-IDF('stop_words', 'lemmatization', 'norm_letters', 'pruning5')
train time: 41.045s
SVM_lineal_TF-IDF('stop_words', 'urls', 'norm_letters', 'pruning10')
train time: 41.506s
SVM_lineal_TF-IDF('stop_words', 'urls', 'norm_letters', 'pruning5')
train time: 41.816s
SVM_lineal_TF-IDF('stemming', 'urls', 'norm_letters', 'pruning10')
train time: 42.067s
SVM_lineal_TF-IDF('stemming', 'urls', 'norm_letters', 'pruning5')
train time: 41.990s
SVM_lineal_TF-IDF('lemmatization', 'urls', 'norm_letters', 'pruning10')
train time: 42.119s
SVM_lineal_TF-IDF('lemmatization', 'urls', 'norm_letters', 'pruning5')
train time: 40.614s
SVM_lineal_TF-IDF('stop_words', 'stemming', 'urls', 'norm_letters', 'pruning10')
train time: 40.146s
SVM_lineal_TF-IDF('stop_words', 'stemming', 'urls', 'norm_letters', 'pruning5')
train time: 42.016s
SVM_lineal_TF-IDF('stop_words', 'lemmatization', 'urls', 'norm_letters', 'pruning10')
train time: 43.065s
SVM_lineal_TF-IDF('stop_words', 'lemmatizat

train time: 51.279s
Reg_Log_TF('stop_words', 'lemmatization', 'norm_letters')
train time: 50.434s
Reg_Log_TF('stop_words', 'lemmatization', 'pruning10')
train time: 51.912s
Reg_Log_TF('stop_words', 'lemmatization', 'pruning5')
train time: 50.615s
Reg_Log_TF('stop_words', 'urls', 'norm_letters')
train time: 52.344s
Reg_Log_TF('stop_words', 'urls', 'pruning10')
train time: 50.424s
Reg_Log_TF('stop_words', 'urls', 'pruning5')
train time: 51.136s
Reg_Log_TF('stop_words', 'norm_letters', 'pruning10')
train time: 52.185s
Reg_Log_TF('stop_words', 'norm_letters', 'pruning5')
train time: 52.479s
Reg_Log_TF('stemming', 'urls', 'norm_letters')
train time: 51.878s
Reg_Log_TF('stemming', 'urls', 'pruning10')
train time: 51.545s
Reg_Log_TF('stemming', 'urls', 'pruning5')
train time: 53.329s
Reg_Log_TF('stemming', 'norm_letters', 'pruning10')
train time: 53.134s
Reg_Log_TF('stemming', 'norm_letters', 'pruning5')
train time: 51.813s
Reg_Log_TF('lemmatization', 'urls', 'norm_letters')
train time: 52.66

train time: 62.529s
Reg_Log_Binario('stop_words',)
train time: 60.879s
Reg_Log_Binario('stemming',)
train time: 64.157s
Reg_Log_Binario('lemmatization',)
train time: 63.935s
Reg_Log_Binario('urls',)
train time: 64.340s
Reg_Log_Binario('norm_letters',)
train time: 64.409s
Reg_Log_Binario('pruning10',)
train time: 62.135s
Reg_Log_Binario('pruning5',)
train time: 64.082s
Reg_Log_Binario('stop_words', 'stemming')
train time: 64.056s
Reg_Log_Binario('stop_words', 'lemmatization')
train time: 64.224s
Reg_Log_Binario('stop_words', 'urls')
train time: 64.242s
Reg_Log_Binario('stop_words', 'norm_letters')
train time: 64.315s
Reg_Log_Binario('stop_words', 'pruning10')
train time: 63.976s
Reg_Log_Binario('stop_words', 'pruning5')
train time: 63.888s
Reg_Log_Binario('stemming', 'urls')
train time: 63.495s
Reg_Log_Binario('stemming', 'norm_letters')
train time: 65.236s
Reg_Log_Binario('stemming', 'pruning10')
train time: 64.759s
Reg_Log_Binario('stemming', 'pruning5')
train time: 64.824s
Reg_Log_Bi

In [37]:
# Tuples compare element by element, so this returns the run whose first field
# is largest (that field is labelled 'accuracy' in the results table below).
max(results_list)

(0.9108018207282913,
 5987,
 0.9079317998182008,
 0.9088661072868451,
 0.9070698246507669,
 6211,
 4194,
 544,
 475,
 'SVM_lineal_TF',
 "('lemmatization', 'norm_letters', 'pruning5')")

In [45]:
# Total number of experiment runs collected.
len(results_list)

639

In [44]:
# Keep an independent copy of the collected results. A plain assignment would
# only alias the same list, so later in-place changes would hit the backup too.
# A shallow copy is enough because the entries are immutable tuples.
results_list_backup = list(results_list)

In [46]:
# One row per experiment run; columns are positional (0..10) at this point.
results_df = pd.DataFrame(results_list)

In [47]:
# Peek at the first rows of the raw results table.
results_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.868873,48099,0.864606,0.865588,0.863716,5977,3949,789,709,Bayesiano_TF,"('stop_words',)"
1,0.872286,32900,0.869419,0.867375,0.872382,5829,4136,602,857,Bayesiano_TF,"('stemming',)"
2,0.873162,36795,0.870209,0.868315,0.872822,5849,4126,612,837,Bayesiano_TF,"('lemmatization',)"
3,0.873687,40405,0.870782,0.86883,0.873517,5847,4134,604,839,Bayesiano_TF,"('urls',)"
4,0.874475,47550,0.871272,0.869888,0.87296,5896,4094,644,790,Bayesiano_TF,"('norm_letters',)"


In [48]:
# Put the identifying columns (worksheet name, combination) first, followed by
# the feature count and the metrics. The frame is rebuilt from results_list
# rather than re-sliced from itself, so the cell can be re-run after the
# columns have been renamed without raising KeyError on the positional labels.
results_df = pd.DataFrame(results_list)[[9, 10, 1, 0, 2, 3, 4, 5, 6, 7, 8]]

In [49]:
# Display the reordered results table.
results_df

Unnamed: 0,9,10,1,0,2,3,4,5,6,7,8
0,Bayesiano_TF,"('stop_words',)",48099,0.868873,0.864606,0.865588,0.863716,5977,3949,789,709
1,Bayesiano_TF,"('stemming',)",32900,0.872286,0.869419,0.867375,0.872382,5829,4136,602,857
2,Bayesiano_TF,"('lemmatization',)",36795,0.873162,0.870209,0.868315,0.872822,5849,4126,612,837
3,Bayesiano_TF,"('urls',)",40405,0.873687,0.870782,0.868830,0.873517,5847,4134,604,839
4,Bayesiano_TF,"('norm_letters',)",47550,0.874475,0.871272,0.869888,0.872960,5896,4094,644,790
5,Bayesiano_TF,"('pruning10',)",4348,0.863708,0.861047,0.858698,0.865268,5724,4143,595,962
6,Bayesiano_TF,"('pruning5',)",7822,0.868610,0.866004,0.863641,0.870133,5758,4165,573,928
7,Bayesiano_TF,"('stop_words', 'stemming')",32814,0.872024,0.868328,0.867958,0.868714,5938,4024,714,748
8,Bayesiano_TF,"('stop_words', 'lemmatization')",36731,0.871148,0.867427,0.867058,0.867812,5933,4019,719,753
9,Bayesiano_TF,"('stop_words', 'urls')",40155,0.871236,0.867608,0.867007,0.868256,5922,4031,707,764


In [50]:
# Give the reordered positional columns descriptive names (same order as the
# column selection in the previous step).
results_df.columns = ['clf_name','comb','features_number','accuracy','f1_score','precision','recall','true_positives','true_negatives','false_positives','false_negatives']

In [56]:
# Display the results table with its final column names.
results_df

Unnamed: 0,clf_name,comb,features_number,accuracy,f1_score,precision,recall,true_positives,true_negatives,false_positives,false_negatives
0,Bayesiano_TF,"('stop_words',)",48099,0.868873,0.864606,0.865588,0.863716,5977,3949,789,709
1,Bayesiano_TF,"('stemming',)",32900,0.872286,0.869419,0.867375,0.872382,5829,4136,602,857
2,Bayesiano_TF,"('lemmatization',)",36795,0.873162,0.870209,0.868315,0.872822,5849,4126,612,837
3,Bayesiano_TF,"('urls',)",40405,0.873687,0.870782,0.868830,0.873517,5847,4134,604,839
4,Bayesiano_TF,"('norm_letters',)",47550,0.874475,0.871272,0.869888,0.872960,5896,4094,644,790
5,Bayesiano_TF,"('pruning10',)",4348,0.863708,0.861047,0.858698,0.865268,5724,4143,595,962
6,Bayesiano_TF,"('pruning5',)",7822,0.868610,0.866004,0.863641,0.870133,5758,4165,573,928
7,Bayesiano_TF,"('stop_words', 'stemming')",32814,0.872024,0.868328,0.867958,0.868714,5938,4024,714,748
8,Bayesiano_TF,"('stop_words', 'lemmatization')",36731,0.871148,0.867427,0.867058,0.867812,5933,4019,719,753
9,Bayesiano_TF,"('stop_words', 'urls')",40155,0.871236,0.867608,0.867007,0.868256,5922,4031,707,764


In [65]:
# Row count of the results table.
len(results_df)

639

In [52]:
# Persist the full results table as a tab-separated file.
results_df.to_csv('result_twitter_sa_combinations.csv', sep='\t')

In [222]:
# Keep only the runs whose classifier/weighting name contains 'Reg_Log'.
is_reg_log_run = results_df['clf_name'].str.contains('Reg_Log')
results_tweets_df = results_df[is_reg_log_run]

In [223]:
# Number of runs in the filtered selection.
len(results_tweets_df)

213

In [215]:
# Pick the five runs with the LOWEST accuracy and keep only the columns
# reported in the table.
# NOTE(review): nsmallest selects the worst runs; use nlargest if the table is
# meant to show the best ones - confirm which is intended.
filter_result = results_tweets_df.nsmallest(5, columns='accuracy')[['clf_name','comb','features_number','accuracy','f1_score']]

In [216]:
# Display the selected runs.
filter_result

Unnamed: 0,clf_name,comb,features_number,accuracy,f1_score
508,Reg_Log_TF-IDF,"('stop_words', 'pruning10')",4163,0.881127,0.877392
532,Reg_Log_TF-IDF,"('stop_words', 'urls', 'pruning10')",4152,0.881127,0.877338
558,Reg_Log_TF-IDF,"('stop_words', 'urls', 'norm_letters', 'pruning10')",4155,0.881653,0.877849
534,Reg_Log_TF-IDF,"('stop_words', 'norm_letters', 'pruning10')",4165,0.882003,0.878272
437,Reg_Log_TF,"('stop_words', 'pruning10')",4163,0.884891,0.88105


In [217]:
# Rename the columns to the short Spanish headers used in the exported table.
filter_result.columns = ['Clf_Pon', 'Combinación', '# Car', 'Exact', 'V-F1']

In [198]:
# Raise pandas' maximum displayed column width.
# NOTE(review): presumably so the long combination strings are not truncated
# in the exported table - confirm; this option is global for the session.
pd.options.display.max_colwidth = 1000

In [218]:
# Print the table as LaTeX: values rounded to 4 decimals, no index column,
# comma as decimal separator and an explicit column layout.
print(filter_result.round(4).to_latex(index=False, column_format='|l|m{16em}|r|r|r|', decimal=','))

\begin{tabular}{|l|m{16em}|r|r|r|}
\toprule
        Clf\_Pon &                                          Combinación &  \# Car &   Exact &    V-F1 \\
\midrule
 Reg\_Log\_TF-IDF &                          ('stop\_words', 'pruning10') &   4163 &  0,8811 &  0,8774 \\
 Reg\_Log\_TF-IDF &                  ('stop\_words', 'urls', 'pruning10') &   4152 &  0,8811 &  0,8773 \\
 Reg\_Log\_TF-IDF &  ('stop\_words', 'urls', 'norm\_letters', 'pruning10') &   4155 &  0,8817 &  0,8778 \\
 Reg\_Log\_TF-IDF &          ('stop\_words', 'norm\_letters', 'pruning10') &   4165 &  0,8820 &  0,8783 \\
     Reg\_Log\_TF &                          ('stop\_words', 'pruning10') &   4163 &  0,8849 &  0,8810 \\
\bottomrule
\end{tabular}

