In [1]:
import pandas as pd
import itertools

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from experiments_core import run_one_comb_experiment
from experiments_core import norm_repeated_letters
from experiments_core import stem_tokens
from experiments_core import lemmatizer
from experiments_core import save_excel_comb_results
from experiments_core import RepeatReplacer

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

import xlsxwriter

import re

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import importlib
import sys

# All helpers used by this notebook are imported from `experiments_core`, so a
# single reload is enough; reloading once per helper re-executed the same
# module five times.
# NOTE(review): assumes the helpers are defined in `experiments_core` itself
# (the recorded cell output shows that module) and not re-exported from a
# sub-module -- confirm.
experiments_core = importlib.reload(sys.modules['experiments_core'])

# `from module import name` binds the object into this namespace and
# importlib.reload() does not update such bindings. Rebind them explicitly,
# otherwise the notebook keeps calling the pre-reload definitions.
run_one_comb_experiment = experiments_core.run_one_comb_experiment
norm_repeated_letters = experiments_core.norm_repeated_letters
stem_tokens = experiments_core.stem_tokens
lemmatizer = experiments_core.lemmatizer
save_excel_comb_results = experiments_core.save_excel_comb_results
RepeatReplacer = experiments_core.RepeatReplacer

# Keep the module as the cell's displayed value, as before.
experiments_core

<module 'experiments_core' from '/home/ctorres9/EAFIT/trabajogrado/experiments/experiments_core.py'>

In [3]:
# Read the tab-separated tweet file; its first row holds the column names.
file_path = 'data/TweetsPolaridadSEPLN.csv'
original_tweets_df = pd.read_csv(file_path, sep='\t', header=0, index_col=None)

# Plain-text dump of the summary statistics, then of the first rows.
for overview in (original_tweets_df.describe(), original_tweets_df.head()):
    print(overview)

                      tweetid          user               content  \
count                   60798         60798                 60798   
unique                  60798           158                 60405   
top      151651854219100160,0  mariviromero  Buenos días a todos!   
freq                        1          7138                   111   

                       date   lang polarity  topic  
count                 60798  60798    60798  60798  
unique                60510      1        6     10  
top     2012-01-01T00:00:16     es     NONE  otros  
freq                      4  60798    21416  28189  
                 tweetid            user  \
0   142378325086715904,0     jesusmarana   
1   142379080808013824,0       EvaORegan   
2   142379173120442368,0  LosadaPescador   
3   142379815708803072,0    mgilguerrero   
4   142381190123499520,0  pedroj_ramirez   

                                             content                 date  \
0  Portada 'Público', viernes. Fabra al banquil

In [4]:
# Only the tweet text and its label are needed for the experiments.
polarity_columns = ['content', 'polarity']
polarity_labels = original_tweets_df['polarity']

# One frame per original label of interest (N+, N, P+, P); rows carrying any
# other label are left out.
strong_negative_tweets = original_tweets_df.loc[polarity_labels == 'N+', polarity_columns]
standar_negative_tweets = original_tweets_df.loc[polarity_labels == 'N', polarity_columns]

strong_positive_tweets = original_tweets_df.loc[polarity_labels == 'P+', polarity_columns]
standar_positive_tweets = original_tweets_df.loc[polarity_labels == 'P', polarity_columns]

# Collapse the four labels into two classes: strong rows first, then standard.
negative_tweets = pd.concat([strong_negative_tweets, standar_negative_tweets],
                            ignore_index=True)
positive_tweets = pd.concat([strong_positive_tweets, standar_positive_tweets],
                            ignore_index=True)

negative_tweets['polarity'] = "negative"
positive_tweets['polarity'] = "positive"

# Final corpus: every negative row followed by every positive row.
total_tweets = pd.concat([negative_tweets, positive_tweets], ignore_index=True)
for overview in (total_tweets.describe(), total_tweets.head()):
    print(overview)

                                                  content  polarity
count                                               38077     38077
unique                                              37926         2
top     ¡Noticias descombacantes! está disponible! htt...  positive
freq                                                   63     22233
                                             content  polarity
0  Dado q la deuda privada es superior a la publi...  negative
1  TEPCO inyecta nitrógeno en los reactores de Fu...  negative
2  “@Declaracion: «Cualquier injusticia contra un...  negative
3  ¡Qué estrés!, la presidenta de la diputación d...  negative
4  Hoy entrego mi credencial en el Congreso. Una ...  negative


In [5]:
# Features (tweet text) and target (binary polarity label) as separate Series.
total_data_content, total_data_target = total_tweets['content'], total_tweets['polarity']

In [6]:
# Hold out 30% of the tweets for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    total_data_content,
    total_data_target,
    random_state=80,
    test_size=0.3,
)

In [7]:
# (caption, label series, class to count) for each line of the class-balance
# report, in the order the lines are printed.
split_counts = [
    ("Positivos entrenamiento:", y_train, 'positive'),
    ("Negativos entrenamiento:", y_train, 'negative'),
    ("Positivos pruebas:", y_test, 'positive'),
    ("Negativos pruebas:", y_test, 'negative'),
]
for caption, labels, wanted_class in split_counts:
    print(caption, len(labels[labels == wanted_class]))

Positivos entrenamiento: 15552
Negativos entrenamiento: 11101
Positivos pruebas: 6681
Negativos pruebas: 4743


In [8]:
# Shared resources for the experiment loop further down:
# - NLTK's Spanish stop-word list, handed to the vectorizer when 'stop_words' is active
spanish_stopwords = stopwords.words('spanish')
# - Spanish Snowball stemmer, passed to stem_tokens() when 'stemming' is active
stemmer = SnowballStemmer('spanish')
# - project helper passed to norm_repeated_letters() when 'norm_letters' is active
replacer = RepeatReplacer()

# Combinaciones de experimentos para tweets

Los siguientes son los resultados de las combinaciones entre los diferentes métodos de limpieza y ponderación en la representación.

In [9]:
# Names of the individual pre-processing steps that the experiments combine.
pre_processing_tasks = [
    'stop_words',
    'stemming',
    'lemmatization',
    'urls',
    'norm_letters',
    'pruning10',
    'pruning5',
]

In [10]:
# Every non-empty subset of the pre-processing steps, as tuples, ordered by
# subset size (all singles, then all pairs, and so on).
all_the_tasks = list(itertools.chain.from_iterable(
    itertools.combinations(pre_processing_tasks, subset_size)
    for subset_size in range(1, len(pre_processing_tasks) + 1)
))

In [11]:
# Number of candidate combinations before the incompatible ones are removed.
len(all_the_tasks)

127

In [12]:
# A combination is discarded when it contains both members of a mutually
# exclusive pair: stemming + lemmatization, or pruning10 + pruning5.
tasks_to_remove = [
    comb for comb in all_the_tasks
    if {'stemming', 'lemmatization'} <= set(comb)
    or {'pruning10', 'pruning5'} <= set(comb)
]
count = len(tasks_to_remove)
print(count)
print(len(tasks_to_remove))

56
56


In [13]:
# Drop the incompatible combinations by filtering instead of calling
# list.remove() once per entry. remove() raises ValueError when this cell is
# run a second time (the entries are already gone) and rescans the list on
# every call. The filter gives the same result on the first run and is a
# no-op on later runs.
excluded_tasks = set(tasks_to_remove)
all_the_tasks = [comb for comb in all_the_tasks if comb not in excluded_tasks]

In [15]:
# Display the combinations that remain after removing the incompatible ones.
all_the_tasks

[('stop_words',),
 ('stemming',),
 ('lemmatization',),
 ('urls',),
 ('norm_letters',),
 ('pruning10',),
 ('pruning5',),
 ('stop_words', 'stemming'),
 ('stop_words', 'lemmatization'),
 ('stop_words', 'urls'),
 ('stop_words', 'norm_letters'),
 ('stop_words', 'pruning10'),
 ('stop_words', 'pruning5'),
 ('stemming', 'urls'),
 ('stemming', 'norm_letters'),
 ('stemming', 'pruning10'),
 ('stemming', 'pruning5'),
 ('lemmatization', 'urls'),
 ('lemmatization', 'norm_letters'),
 ('lemmatization', 'pruning10'),
 ('lemmatization', 'pruning5'),
 ('urls', 'norm_letters'),
 ('urls', 'pruning10'),
 ('urls', 'pruning5'),
 ('norm_letters', 'pruning10'),
 ('norm_letters', 'pruning5'),
 ('stop_words', 'stemming', 'urls'),
 ('stop_words', 'stemming', 'norm_letters'),
 ('stop_words', 'stemming', 'pruning10'),
 ('stop_words', 'stemming', 'pruning5'),
 ('stop_words', 'lemmatization', 'urls'),
 ('stop_words', 'lemmatization', 'norm_letters'),
 ('stop_words', 'lemmatization', 'pruning10'),
 ('stop_words', 'lemm

In [16]:
# Reduced set of single-step combinations for quick trial runs.
# A one-element tuple needs a trailing comma: ('stop_words') is just the string
# 'stop_words'. Without the comma these entries were strings, unlike the tuples
# stored in all_the_tasks.
comb_test = [
    ('stop_words',),
    ('stemming',),
    ('lemmatization',),
    ('urls',),
    ('norm_letters',),
    ('pruning10',),
    ('pruning5',),
]

In [20]:
# One configuration per term-weighting scheme, each with its own SVC instance.
# Only the weighting name, the C value and the random_state differ between
# entries; the kernel and gamma are the same for all three.
clf_types_weighting_types = [
    {
        'clf_name': 'SVM_radial',
        'clf_type': SVC(),
        'weighting_type': weighting_name,
        'optimal_parameters': {'kernel': ['rbf'], 'gamma': [0.001], 'C': [penalty]},
        'random_state': seed,
    }
    for weighting_name, penalty, seed in (
        ('TF', 100, 30),
        ('TF-IDF', 1000, 10),
        ('Binario', 100, 30),
    )
]

In [22]:
# Display the classifier / weighting configurations defined above.
clf_types_weighting_types

[{'clf_name': 'SVM_radial',
  'clf_type': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False),
  'weighting_type': 'TF',
  'optimal_parameters': {'kernel': ['rbf'], 'gamma': [0.001], 'C': [100]},
  'random_state': 30},
 {'clf_name': 'SVM_radial',
  'clf_type': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False),
  'weighting_type': 'TF-IDF',
  'optimal_parameters': {'kernel': ['rbf'], 'gamma': [0.001], 'C': [1000]},
  'random_state': 10},
 {'clf_name': 'SVM_radial',
  'clf_type': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probabilit

In [23]:
import functools

# Built once for the whole run instead of once per tokenized tweet.
# Raw string: the pattern contains `\(` and `\)`, which are invalid escape
# sequences in a normal string literal. The resulting pattern is unchanged.
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    flags=re.MULTILINE,
)
WORD_TOKENIZER = RegexpTokenizer(u'(?u)\\b\\w\\w+\\b')


def tokenize_combinations(text, tasks=()):
    """Tokenize one document, applying the pre-processing steps named in `tasks`.

    Parameters
    ----------
    text : str
        Raw document text.
    tasks : iterable of str
        Active pre-processing step names. Recognised here: 'urls',
        'norm_letters', 'stemming', 'lemmatization'. Other names are ignored
        by this function (they are handled through vectorizer parameters).

    Returns
    -------
    The token sequence returned by the last step applied.
    """
    if 'urls' in tasks:
        # Strip URLs before tokenizing so their fragments never become tokens.
        text = URL_PATTERN.sub('', text)

    tokens = WORD_TOKENIZER.tokenize(text)

    if 'norm_letters' in tasks:
        tokens = norm_repeated_letters(tokens, replacer)

    if 'stemming' in tasks:
        tokens = stem_tokens(tokens, stemmer)

    if 'lemmatization' in tasks:
        tokens = lemmatizer(tokens)

    return tokens


def build_vectorizer(weighting_type):
    """Return a fresh vectorizer for the given weighting name.

    Raises
    ------
    ValueError
        If `weighting_type` is not 'TF', 'TF-IDF' or 'Binario'. Previously an
        unknown name silently reused the vectorizer left over from an earlier
        iteration.
    """
    if weighting_type == 'TF':
        return CountVectorizer()
    if weighting_type == 'TF-IDF':
        return TfidfVectorizer()
    if weighting_type == 'Binario':
        return CountVectorizer(binary=True)
    raise ValueError('Unknown weighting type: %r' % (weighting_type,))


vectorizers_list = []
results_list = []

# The context manager closes (and therefore writes) the workbook even if one of
# the experiments raises, so results gathered so far are not lost.
with xlsxwriter.Workbook('result_combinations_tweets_svm_rbf.xlsx') as workbook:
    for clf_type_weighting_type in clf_types_weighting_types:
        # One worksheet per classifier / weighting configuration.
        ws_name = clf_type_weighting_type['clf_name'] + '_' + clf_type_weighting_type['weighting_type']
        worksheet = workbook.add_worksheet(ws_name)

        # These do not change with the combination, so read them once per sheet.
        clf_type = clf_type_weighting_type['clf_type']
        optimal_parameters = clf_type_weighting_type['optimal_parameters']
        random_state = clf_type_weighting_type['random_state']

        init_row = 0
        for comb in all_the_tasks:
            vectorizer = build_vectorizer(clf_type_weighting_type['weighting_type'])

            # Bind the current combination into the tokenizer. A function that
            # reads the global `comb` at call time would make every vectorizer
            # kept in `vectorizers_list` use the last combination of the run.
            vectorizer.set_params(
                tokenizer=functools.partial(tokenize_combinations, tasks=comb))

            if 'stop_words' in comb:
                vectorizer.set_params(stop_words=spanish_stopwords)

            if 'pruning10' in comb:
                vectorizer.set_params(min_df=10)

            if 'pruning5' in comb:
                vectorizer.set_params(min_df=5)

            print(ws_name + str(comb))
            exp_results = run_one_comb_experiment(total_data_content, total_data_target,
                                                  vectorizer, optimal_parameters,
                                                  clf_type, random_state)
            # The helper returns the row at which the next result block starts.
            init_row = save_excel_comb_results(worksheet, init_row, exp_results,
                                               str(optimal_parameters), random_state,
                                               str(comb))
            vectorizers_list.append(vectorizer)
            # exp_results is a tuple; tag it with the sheet name and combination.
            results_list.append(exp_results + (ws_name, str(comb)))

SVM_radial_TF('stop_words',)
train time: 256.494s
SVM_radial_TF('stemming',)
train time: 275.685s
SVM_radial_TF('lemmatization',)
train time: 283.215s
SVM_radial_TF('urls',)
train time: 355.468s
SVM_radial_TF('norm_letters',)
train time: 402.208s
SVM_radial_TF('pruning10',)
train time: 197.587s
SVM_radial_TF('pruning5',)
train time: 269.428s
SVM_radial_TF('stop_words', 'stemming')
train time: 243.926s
SVM_radial_TF('stop_words', 'lemmatization')
train time: 242.433s
SVM_radial_TF('stop_words', 'urls')
train time: 247.232s
SVM_radial_TF('stop_words', 'norm_letters')
train time: 265.853s
SVM_radial_TF('stop_words', 'pruning10')
train time: 118.505s
SVM_radial_TF('stop_words', 'pruning5')
train time: 159.644s
SVM_radial_TF('stemming', 'urls')
train time: 251.837s
SVM_radial_TF('stemming', 'norm_letters')
train time: 294.671s
SVM_radial_TF('stemming', 'pruning10')
train time: 177.006s
SVM_radial_TF('stemming', 'pruning5')
train time: 205.871s
SVM_radial_TF('lemmatization', 'urls')
train ti

train time: 321.359s
SVM_radial_TF-IDF('lemmatization', 'urls', 'pruning10')
train time: 164.426s
SVM_radial_TF-IDF('lemmatization', 'urls', 'pruning5')
train time: 206.690s
SVM_radial_TF-IDF('lemmatization', 'norm_letters', 'pruning10')
train time: 180.150s
SVM_radial_TF-IDF('lemmatization', 'norm_letters', 'pruning5')
train time: 221.822s
SVM_radial_TF-IDF('urls', 'norm_letters', 'pruning10')
train time: 195.198s
SVM_radial_TF-IDF('urls', 'norm_letters', 'pruning5')
train time: 240.677s
SVM_radial_TF-IDF('stop_words', 'stemming', 'urls', 'norm_letters')
train time: 258.252s
SVM_radial_TF-IDF('stop_words', 'stemming', 'urls', 'pruning10')
train time: 140.852s
SVM_radial_TF-IDF('stop_words', 'stemming', 'urls', 'pruning5')
train time: 176.106s
SVM_radial_TF-IDF('stop_words', 'stemming', 'norm_letters', 'pruning10')
train time: 143.598s
SVM_radial_TF-IDF('stop_words', 'stemming', 'norm_letters', 'pruning5')
train time: 174.414s
SVM_radial_TF-IDF('stop_words', 'lemmatization', 'urls', 'n

In [24]:
# Tuples compare element by element, so this returns the result whose first
# field is largest (later cells name that field 'accuracy'); ties fall through
# to the following fields.
max(results_list)

(0.9072128851540616,
 32068,
 0.9046244339777281,
 0.9043875156512795,
 0.9048673367705735,
 6123,
 4241,
 520,
 540,
 'SVM_radial_TF-IDF',
 "('stemming', 'norm_letters')")

In [28]:
# Number of experiment results collected by the loop above.
len(results_list)

213

In [29]:
# Take a real (shallow) copy. Plain assignment only creates a second name for
# the same list, so later appends or removals on `results_list` would also
# change the "backup". A shallow copy is sufficient because the entries are
# tuples.
results_list_backup = list(results_list)

In [30]:
# One row per experiment; columns get the default integer labels 0..10 because
# the result tuples carry no field names.
results_df = pd.DataFrame(results_list)

In [31]:
# Preview the first rows of the raw results table.
results_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.892069,48118,0.888335,0.891067,0.886149,6140,4051,712,521,SVM_radial_TF,"('stop_words',)"
1,0.906688,32850,0.903828,0.904772,0.902962,6164,4194,569,497,SVM_radial_TF,"('stemming',)"
2,0.904762,36771,0.901873,0.902673,0.901131,6148,4188,575,513,SVM_radial_TF,"('lemmatization',)"
3,0.903274,40441,0.900356,0.901084,0.899676,6137,4182,581,524,SVM_radial_TF,"('urls',)"
4,0.902836,47582,0.899864,0.900771,0.899031,6141,4173,590,520,SVM_radial_TF,"('norm_letters',)"


In [32]:
# Move the two descriptive columns (9, 10) to the front and put column 1
# before column 0; the remaining columns keep their order.
reordered_positions = [9, 10, 1, 0, 2, 3, 4, 5, 6, 7, 8]
results_df = results_df.loc[:, reordered_positions]

In [33]:
# Display the table after reordering its columns.
results_df

Unnamed: 0,9,10,1,0,2,3,4,5,6,7,8
0,SVM_radial_TF,"('stop_words',)",48118,0.892069,0.888335,0.891067,0.886149,6140,4051,712,521
1,SVM_radial_TF,"('stemming',)",32850,0.906688,0.903828,0.904772,0.902962,6164,4194,569,497
2,SVM_radial_TF,"('lemmatization',)",36771,0.904762,0.901873,0.902673,0.901131,6148,4188,575,513
3,SVM_radial_TF,"('urls',)",40441,0.903274,0.900356,0.901084,0.899676,6137,4182,581,524
4,SVM_radial_TF,"('norm_letters',)",47582,0.902836,0.899864,0.900771,0.899031,6141,4173,590,520
5,SVM_radial_TF,"('pruning10',)",4348,0.895221,0.892039,0.892834,0.891304,6094,4133,630,567
6,SVM_radial_TF,"('pruning5',)",7854,0.897496,0.894332,0.895351,0.893405,6115,4138,625,546
7,SVM_radial_TF,"('stop_words', 'stemming')",32764,0.897759,0.894385,0.896377,0.892703,6149,4107,656,512
8,SVM_radial_TF,"('stop_words', 'lemmatization')",36707,0.898810,0.895443,0.897568,0.893664,6159,4109,654,502
9,SVM_radial_TF,"('stop_words', 'urls')",40187,0.891632,0.887872,0.890651,0.885654,6139,4047,716,522


In [34]:
# Descriptive labels for the reordered columns, in their current order.
result_column_names = [
    'clf_name',
    'comb',
    'features_number',
    'accuracy',
    'f1_score',
    'precision',
    'recall',
    'true_positives',
    'true_negatives',
    'false_positives',
    'false_negatives',
]
results_df.columns = result_column_names

In [35]:
# Display the table with its descriptive column names.
results_df

Unnamed: 0,clf_name,comb,features_number,accuracy,f1_score,precision,recall,true_positives,true_negatives,false_positives,false_negatives
0,SVM_radial_TF,"('stop_words',)",48118,0.892069,0.888335,0.891067,0.886149,6140,4051,712,521
1,SVM_radial_TF,"('stemming',)",32850,0.906688,0.903828,0.904772,0.902962,6164,4194,569,497
2,SVM_radial_TF,"('lemmatization',)",36771,0.904762,0.901873,0.902673,0.901131,6148,4188,575,513
3,SVM_radial_TF,"('urls',)",40441,0.903274,0.900356,0.901084,0.899676,6137,4182,581,524
4,SVM_radial_TF,"('norm_letters',)",47582,0.902836,0.899864,0.900771,0.899031,6141,4173,590,520
5,SVM_radial_TF,"('pruning10',)",4348,0.895221,0.892039,0.892834,0.891304,6094,4133,630,567
6,SVM_radial_TF,"('pruning5',)",7854,0.897496,0.894332,0.895351,0.893405,6115,4138,625,546
7,SVM_radial_TF,"('stop_words', 'stemming')",32764,0.897759,0.894385,0.896377,0.892703,6149,4107,656,512
8,SVM_radial_TF,"('stop_words', 'lemmatization')",36707,0.898810,0.895443,0.897568,0.893664,6159,4109,654,502
9,SVM_radial_TF,"('stop_words', 'urls')",40187,0.891632,0.887872,0.890651,0.885654,6139,4047,716,522


In [37]:
# Row count of the results table.
len(results_df)

213

In [36]:
# Persist the results as a tab-separated file; the row index is written too
# (the to_csv default).
results_df.to_csv('result_twitter_sa_combinations_svm_rbf.csv', sep='\t')

In [40]:
# Keep only the rows whose classifier name contains 'SVM_radial'.
is_svm_radial = results_df['clf_name'].str.contains('SVM_radial')
results_tweets_df = results_df.loc[is_svm_radial]

In [41]:
# Row count after filtering by classifier name.
len(results_tweets_df)

213

In [58]:
# The five rows with the lowest accuracy, reduced to the columns that go into
# the report table.
summary_columns = ['clf_name', 'comb', 'features_number', 'accuracy', 'f1_score']
lowest_accuracy_rows = results_tweets_df.nsmallest(5, columns='accuracy')
filter_result = lowest_accuracy_rows[summary_columns]

In [59]:
# Display the selected rows.
filter_result

Unnamed: 0,clf_name,comb,features_number,accuracy,f1_score
106,SVM_radial_TF-IDF,"('stop_words', 'urls', 'pruning10')",4165,0.877013,0.873224
108,SVM_radial_TF-IDF,"('stop_words', 'norm_letters', 'pruning10')",4181,0.878764,0.875206
132,SVM_radial_TF-IDF,"('stop_words', 'urls', 'norm_letters', 'pruning10')",4175,0.878852,0.87517
82,SVM_radial_TF-IDF,"('stop_words', 'pruning10')",4171,0.879814,0.876317
153,SVM_radial_Binario,"('stop_words', 'pruning10')",4169,0.881127,0.876978


In [60]:
# Short Spanish headers for the exported table, in the same order as the
# selected columns (clf_name, comb, features_number, accuracy, f1_score).
filter_result.columns = ['Clf_Pon', 'Combinación', '# Car', 'Exact', 'V-F1']

In [47]:
# Raise pandas' per-cell display width limit to 1000 characters.
# NOTE(review): presumably so the long combination strings are not truncated in
# the LaTeX export below -- confirm. The execution count (47) is out of sequence
# with the surrounding cells, so a fresh top-to-bottom run may apply this at a
# different point than in the recorded session.
pd.options.display.max_colwidth = 1000

In [61]:
# Round to four decimals and render as a LaTeX tabular: no index column,
# comma as decimal separator, fixed-width second column.
rounded_result = filter_result.round(4)
latex_table = rounded_result.to_latex(
    index=False,
    column_format='|l|m{16em}|r|r|r|',
    decimal=',',
)
print(latex_table)

\begin{tabular}{|l|m{16em}|r|r|r|}
\toprule
            Clf\_Pon &                                          Combinación &  \# Car &   Exact &    V-F1 \\
\midrule
  SVM\_radial\_TF-IDF &                  ('stop\_words', 'urls', 'pruning10') &   4165 &  0,8770 &  0,8732 \\
  SVM\_radial\_TF-IDF &          ('stop\_words', 'norm\_letters', 'pruning10') &   4181 &  0,8788 &  0,8752 \\
  SVM\_radial\_TF-IDF &  ('stop\_words', 'urls', 'norm\_letters', 'pruning10') &   4175 &  0,8789 &  0,8752 \\
  SVM\_radial\_TF-IDF &                          ('stop\_words', 'pruning10') &   4171 &  0,8798 &  0,8763 \\
 SVM\_radial\_Binario &                          ('stop\_words', 'pruning10') &   4169 &  0,8811 &  0,8770 \\
\bottomrule
\end{tabular}

