In [1]:
!pip install sklearn



In [2]:
!pip install matplotlib



In [17]:
import pandas as pd
#import csvnltk.download('stopwords')
import re
import unicodedata
import numpy as np
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem import SnowballStemmer
import time
import scipy.sparse
import warnings

#classifiers
import pickle
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

#graphs
%matplotlib inline
import matplotlib.pyplot as plt

#parallel
from sklearn.externals.joblib import Parallel, delayed

In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apula\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading data into memory
In the following section, we load the training and test data

In [12]:
df_train = pd.read_csv('./data/train', sep='\t', index_col=0)
df_test = pd.read_csv('./data/test', sep='\t', index_col=0)

Xtrain=df_train['text']
Ytrain=df_train['label']

Xtest=df_test['text']
Ytest=df_test['label']
print('datasets were loaded', len(df_train), len(df_test))

datasets were loaded 131940 54976


In [None]:
print(df_train.head())

Next, we define "split_into_tokens", function to process the text giving as result a list of tokens where the following steps have been made:
<li> Accents are removed
<li> Non-alphanumeric characters are filtered
<li> Shift to lower case and split text in tokens
<li> Deleted stopwords and replacement of the remaining words by their root (stemming)

In [None]:
def split_into_tokens(text):
    #stemmer = SnowballStemmer('spanish')
    stemmer = None
    min_length = 3
    # 1. Remove accent marks
    review_text = ''.join((c for c in unicodedata.normalize('NFD',str(text)) if unicodedata.category(c) != 'Mn'))
    #
    # 2. Remove non-alphanumeric
    #letters_only = re.sub("[^A-Za-z0-9]", " ", review_text) 
    letters_only = re.sub("[^\w\d]", " ", review_text) 
    #
    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 3. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("spanish"))                  
    # 
    # 4. Remove stop words and apply or not stemming
    if stemmer:
        filtered_tokens = [stemmer.stem(w) for w in words if not w in stops and len(w)>=min_length]
    else:
        filtered_tokens = [w for w in words if not w in stops and len(w)>=min_length]
    #
    # 5. return the result
    return filtered_tokens

The "trainModel" function receives the following arguments: the name of the classification algorithm, the class, its parameters, and the data sets. This function trains the model and returns a tuple with the name of the algorithm and the model already trained.

In [10]:
def trainModel(name, clazz, params, Xtrain, Ytrain):
    print("training ", name)
    model = clazz(**params)
    start = time.time() # Start time
    model.fit(Xtrain, Ytrain)
    end = time.time()
    elapsed = end - start
    print("-> done ", name, " - Time taken for training:", elapsed, "seconds")
    return (name, model)

## "Classical" algorithms

In this section, we create a dictionary that contains all the necessary data of the classification algorithms that are going to be used, as well as the parameters of each one of them. (In case the parameters are not indicated, those configured by default in scikit-learn are used).

In [26]:
"""estimators = {"KNeighbors": (KNeighborsClassifier, {}),
              "MultinomialNB" : (MultinomialNB, {}),
              "RandomForest" : (RandomForestClassifier, {"n_estimators":100}),
              "LogisticRegression" : (LogisticRegression, {}),
              "MLP" : (MLPClassifier, {"hidden_layer_sizes":100}),
              "SVM" : (SVC, {"cache_size":1000}),
              "LinearSVC" : (LinearSVC, {})
             }"""

In [92]:
estimators1 = {"KNeighbors": (KNeighborsClassifier, {}),
              "MultinomialNB" : (MultinomialNB, {}),
              "RandomForest" : (RandomForestClassifier, {"n_estimators":100}),
              "LogisticRegression" : (LogisticRegression, {})
             }

In [5]:
estimators2 = {"MLP" : (MLPClassifier, {"hidden_layer_sizes":100}),
              "SVM" : (SVC, {"cache_size":1000}),
              "LinearSVC" : (LinearSVC, {})
             }

## Bag of words  (unbalanced, with all "classic" algorithms)


In this section, training and test data are transformed into a bag of words, going from a set of tokens to a set of occurrences per token.

In [6]:
from nltk.corpus import stopwords
import nltk
#from nltk import nltk.download('stopwords')

In [25]:
"""
bow = CountVectorizer(analyzer=split_into_tokens)

print("Creando matriz de bolsa de palabras...")

%time bow.fit(Xtrain, Ytrain)
%time Xtrain_bow = bow.transform(Xtrain)
%time Xtest_bow = bow.transform(Xtest)

scipy.sparse.save_npz('data/Xtrain_bow.npz', Xtrain_bow)
scipy.sparse.save_npz('data/Xtest_bow.npz', Xtest_bow) """

Creando matriz de bolsa de palabras...
Wall time: 1min 56s
Wall time: 1min 55s
Wall time: 46.6 s


This section, loads in memory the bags of words. Use only if you have previously obtained the bags of words and do not have them loaded into memory.

In [7]:
Xtest_bow = scipy.sparse.load_npz('data/Xtest_bow.npz').astype(np.int16, casting='same_kind')
Xtrain_bow = scipy.sparse.load_npz('data/Xtrain_bow.npz').astype(np.int16, casting='same_kind')

Next, we create as many processes as we have estimators. These processes will be in charge of training the different models in parallel.<br> 
As a result, we get a list of tuples formed by: (the name of the algorithm, the model already trained).

In [35]:
#models = Parallel(n_jobs=len(estimators))(delayed(trainModel)(name, clazz, params, Xtrain_bow, Ytrain) for (name, (clazz, params)) in estimators.items())

In [None]:
print(model)

In [94]:
models1 = Parallel(n_jobs=len(estimators1))(delayed(trainModel)(name, clazz, params, Xtrain_bow, Ytrain) for (name, (clazz, params)) in estimators1.items())

In [101]:
print(models1)

[('KNeighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')), ('MultinomialNB', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)), ('RandomForest', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)), ('LogisticRegression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=N

In [32]:
models2 = Parallel(n_jobs=len(estimators2))(delayed(trainModel)(name, clazz, params, Xtrain_bow, Ytrain) for (name, (clazz, params)) in estimators2.items())

In [42]:
print(models2)

[('MLP', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=100, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)), ('SVM', SVC(C=1.0, break_ties=False, cache_size=1000, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)), ('LinearSVC', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=

Once the models are trained, we obtain the labels from the test set and evaluate the results of each of them. 

In [None]:
print("Obteniendo resultados:")
for (name, model) in models:
    start = time.time() # Star t time
    if name == "KNeighbors":
        result = [y for x in [Xtest_bow[i:i+5000,:] for i in range(0,Xtest_bow.shape[0],5000)] for y in model.predict(x)]
    else:
        result = model.predict(Xtest_bow)
    end = time.time()
    elapsed = end - start
    print("---------- Modelo: ", name, " ---------- Time taken for prediction:", elapsed,"seconds\n", classification_report(Ytest, result, digits=4), "\n")
    

In [105]:
import pandas as pd
print("Obteniendo resultados:")
for (name, model) in models1:
    start = time.time() # Star t time
    if name =="KNeighbors": 
        result = [y for x in [Xtest_bow[i:i+5000,:] for i in range(0,Xtest_bow.shape[0],5000)] for y in model.predict(x)]
    else:
        result = model.predict(Xtest_bow)
    end = time.time()
    elapsed = end - start   
    print("---------- Modelo: ", name, " ---------- Time taken for prediction:", elapsed,"seconds\n", classification_report(Ytest, result, digits=4), "\n")
    
    #Save classification report and trained model to BoW folder
    if name =="KNeighbors": 
        report_KNeighbors = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_KNeighbors)
        dataframe = pd.DataFrame.from_dict(report_KNeighbors)
        dataframe.to_csv('BoW/KNeighbors_clasification_report.csv', index=True)
        save_classifier = open('BoW/KNeighbors_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    elif name =="MultinomialNB":
        report_MultinomialNB = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_MultinomialNB)
        dataframe = pd.DataFrame.from_dict(report_MultinomialNB)
        dataframe.to_csv('BoW/MultinomialNB_clasification_report.csv', index=True)
        save_classifier = open('BoW/MultinomialNB_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    elif name =="RandomForest":
        report_RandomForest = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_RandomForest)
        dataframe = pd.DataFrame.from_dict(report_RandomForest)
        dataframe.to_csv('BoW/RandomForest_clasification_report.csv', index=True)
        save_classifier = open('BoW/RandomForest_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    else:
        report_LogisticRegression = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_LogisticRegression)
        dataframe = pd.DataFrame.from_dict(report_LogisticRegression)
        dataframe.to_csv('BoW/LogisticRegression_clasification_report.csv', index=True)
        save_classifier = open('BoW/LogisticRegression_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    

Obteniendo resultados:
---------- Modelo:  KNeighbors  ---------- Time taken for prediction: 601.23597240448 seconds
               precision    recall  f1-score   support

       False     0.9982    1.0000    0.9991     54473
        True     1.0000    0.8072    0.8933       503

    accuracy                         0.9982     54976
   macro avg     0.9991    0.9036    0.9462     54976
weighted avg     0.9982    0.9982    0.9981     54976
 

{'False': {'precision': 0.9982224665567161, 'recall': 1.0, 'f1-score': 0.9991104426694056, 'support': 54473}, 'True': {'precision': 1.0, 'recall': 0.8071570576540755, 'f1-score': 0.8932893289328934, 'support': 503}, 'accuracy': 0.9982355937136205, 'macro avg': {'precision': 0.999111233278358, 'recall': 0.9035785288270377, 'f1-score': 0.9461998858011496, 'support': 54976}, 'weighted avg': {'precision': 0.998238730004802, 'recall': 0.9982355937136205, 'f1-score': 0.9981422379944663, 'support': 54976}}
---------- Modelo:  MultinomialNB  ---------- Ti

In [90]:
import pandas as pd
print("Obteniendo resultados:")
for (name, model) in models2:
    start = time.time() # Star t time
    result = model.predict(Xtest_bow)
    end = time.time()
    elapsed = end - start   
    print("---------- Modelo: ", name, " ---------- Time taken for prediction:", elapsed,"seconds\n", classification_report(Ytest, result, digits=4), "\n")
    
    #Save classification report and trained model to BoW folder
    if name =="MLP":
        report_MLP = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_MLP)
        dataframe = pd.DataFrame.from_dict(report_MLP)
        dataframe.to_csv('BoW/MLP_clasification_report.csv', index=True)
        save_classifier = open('BoW/MLP_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    elif name =="SVM":
        report_SVM = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_SVM)
        dataframe = pd.DataFrame.from_dict(report_SVM)
        dataframe.to_csv('BoW/SVM_clasification_report.csv', index=True)
        save_classifier = open('BoW/SVM_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    else:
        report_LinearSVC = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_LinearSVC)
        dataframe = pd.DataFrame.from_dict(report_LinearSVC)
        dataframe.to_csv('BoW/LinearSVC_clasification_report.csv', index=True)
        save_classifier = open('BoW/LinearSVC_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()

Obteniendo resultados:
---------- Modelo:  MLP  ---------- Time taken for prediction: 0.6775062084197998 seconds
               precision    recall  f1-score   support

       False     0.9995    0.9995    0.9995     54473
        True     0.9501    0.9463    0.9482       503

    accuracy                         0.9991     54976
   macro avg     0.9748    0.9729    0.9739     54976
weighted avg     0.9991    0.9991    0.9991     54976
 

{'False': {'precision': 0.9995043597980725, 'recall': 0.9995410570374313, 'f1-score': 0.9995227080809193, 'support': 54473}, 'True': {'precision': 0.9500998003992016, 'recall': 0.9463220675944334, 'f1-score': 0.9482071713147411, 'support': 503}, 'accuracy': 0.9990541327124564, 'macro avg': {'precision': 0.974802080098637, 'recall': 0.9729315623159324, 'f1-score': 0.9738649396978302, 'support': 54976}, 'weighted avg': {'precision': 0.9990523353987413, 'recall': 0.9990541327124564, 'f1-score': 0.9990531992953876, 'support': 54976}}
---------- Modelo:  S

In [None]:
import pickle
#for (name, model) in models:

save_classifier = open('saved_classifier_BoW1.pickle', 'wb')  #write in bytes
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [19]:
import pickle
open_model = open('saved_classifier_BoW.pickle', 'rb')

In [20]:
show_model = pickle.load(open_model)
open_model.close()
#print(show_model)

EOFError: Ran out of input

## Tf-idf model (unbalanced, with all "classic" algorithms)

To compare with the results obtained, we will use the representation of the training and test data in Tf-idf format. To achieve this, the bag of words is transformed into the frequency of occurrence of terms in the collection of documents.<br>
The process that is carried out, from this point on, is the same as the employee with the bag of words previously.

In [95]:
"""tfidf = TfidfTransformer()

print("Creando matriz de tf-idf...")

%time tfidf.fit(Xtrain_bow, Ytrain)
%time Xtrain_tfidf = tfidf.transform(Xtrain_bow)
%time Xtest_tfidf = tfidf.transform(Xtest_bow)

scipy.sparse.save_npz('data/Xtrain_tfidf.npz', Xtrain_tfidf)
scipy.sparse.save_npz('data/Xtest_tfidf.npz', Xtest_tfidf)"""

Creando matriz de tf-idf...
Wall time: 109 ms
Wall time: 627 ms
Wall time: 237 ms


This section, loads in memory the bags of words. Use only if you have previously obtained the bags of words and do not have them loaded into memory.

In [8]:
Xtest_tfidf = scipy.sparse.load_npz('data/Xtest_tfidf.npz').astype(np.float32)
Xtrain_tfidf = scipy.sparse.load_npz('data/Xtrain_tfidf.npz').astype(np.float32)

Next, we create as many processes as we have estimators. These processes will be in charge of training the different models in parallel.<br> 
As a result, we get a list of tuples formed by: (the name of the algorithm, the model already trained).

In [96]:
models_tfidf1 = Parallel(n_jobs=len(estimators1))(delayed(trainModel)(name, clazz, params, Xtrain_tfidf, Ytrain) for (name, (clazz, params)) in estimators1.items())

In [13]:
models_tfidf2 = Parallel(n_jobs=len(estimators2))(delayed(trainModel)(name, clazz, params, Xtrain_tfidf, Ytrain) for (name, (clazz, params)) in estimators2.items())

In [14]:
print(models_tfidf2)

[('MLP', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=100, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)), ('SVM', SVC(C=1.0, break_ties=False, cache_size=1000, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)), ('LinearSVC', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=

Once the models are trained, we obtain the labels from the test set and evaluate the results of each of them. 

In [None]:
print("Obteniendo resultados:")
for (name, model) in models_tfidf:
    start = time.time() # Start time
    if name == "KNeighbors":
        result = [y for x in [Xtest_tfidf[i:i+5000,:] for i in range(0,Xtest_tfidf.shape[0],5000)] for y in model.predict(x)]
    else:
        result = model.predict(Xtest_tfidf)
    end = time.time()
    elapsed = end - start
    print("---------- Modelo: ", name, " ---------- Time taken for prediction:", elapsed,"seconds\n", classification_report(Ytest, result, digits=4), "\n")

In [104]:
import pandas as pd
print("Obteniendo resultados:")
for (name, model) in models_tfidf1:
    start = time.time() # Star t time
    if name =="KNeighbors": 
        result = [y for x in [Xtest_bow[i:i+5000,:] for i in range(0,Xtest_bow.shape[0],5000)] for y in model.predict(x)]
    else:
        result = model.predict(Xtest_bow)
    end = time.time()
    elapsed = end - start   
    print("---------- Modelo: ", name, " ---------- Time taken for prediction:", elapsed,"seconds\n", classification_report(Ytest, result, digits=4), "\n")
    
    #Save classification report and trained model to BoW folder
    if name =="KNeighbors": 
        report_KNeighbors = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_KNeighbors)
        dataframe = pd.DataFrame.from_dict(report_KNeighbors)
        dataframe.to_csv('BoW_tfidf/KNeighbors_clasification_report.csv', index=True)
        save_classifier = open('BoW_tfidf/KNeighbors_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    elif name =="MultinomialNB":
        report_MultinomialNB = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_MultinomialNB)
        dataframe = pd.DataFrame.from_dict(report_MultinomialNB)
        dataframe.to_csv('BoW_tfidf/MultinomialNB_clasification_report.csv', index=True)
        save_classifier = open('BoW_tfidf/MultinomialNB_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    elif name =="RandomForest":
        report_RandomForest = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_RandomForest)
        dataframe = pd.DataFrame.from_dict(report_RandomForest)
        dataframe.to_csv('BoW_tfidf/RandomForest_clasification_report.csv', index=True)
        save_classifier = open('BoW_tfidf/RandomForest_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    else:
        report_LogisticRegression = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_LogisticRegression)
        dataframe = pd.DataFrame.from_dict(report_LogisticRegression)
        dataframe.to_csv('BoW_tfidf/LogisticRegression_clasification_report.csv', index=True)
        save_classifier = open('BoW_tfidf/LogisticRegression_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    

Obteniendo resultados:
---------- Modelo:  KNeighbors  ---------- Time taken for prediction: 555.8283228874207 seconds
               precision    recall  f1-score   support

       False     0.9992    1.0000    0.9996     54473
        True     0.9978    0.9085    0.9511       503

    accuracy                         0.9991     54976
   macro avg     0.9985    0.9543    0.9753     54976
weighted avg     0.9991    0.9991    0.9991     54976
 

{'False': {'precision': 0.9991562419751275, 'recall': 0.9999816422814972, 'f1-score': 0.9995687717334458, 'support': 54473}, 'True': {'precision': 0.9978165938864629, 'recall': 0.9085487077534792, 'f1-score': 0.9510926118626433, 'support': 503}, 'accuracy': 0.9991450814901047, 'macro avg': {'precision': 0.9984864179307953, 'recall': 0.9542651750174882, 'f1-score': 0.9753306917980445, 'support': 54976}, 'weighted avg': {'precision': 0.9991439849358996, 'recall': 0.9991450814901047, 'f1-score': 0.9991252416764206, 'support': 54976}}
---------- Mod

In [19]:
import pandas as pd
print("Obteniendo resultados:")
for (name, model) in models_tfidf2:
    start = time.time() # Star t time
    result = model.predict(Xtest_bow)
    end = time.time()
    elapsed = end - start   
    print("---------- Modelo: ", name, " ---------- Time taken for prediction:", elapsed,"seconds\n", classification_report(Ytest, result, digits=4), "\n")
    
    #Save classification report and trained model to BoW folder
    if name =="MLP":
        report_MLP = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_MLP)
        dataframe = pd.DataFrame.from_dict(report_MLP)
        dataframe.to_csv('BoW_tfidf/MLP_clasification_report.csv', index=True)
        save_classifier = open('BoW_tfidf/MLP_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    elif name =="SVM":
        report_SVM = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_SVM)
        dataframe = pd.DataFrame.from_dict(report_SVM)
        dataframe.to_csv('BoW_tfidf/SVM_clasification_report.csv', index=True)
        save_classifier = open('BoW_tfidf/SVM_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()
    else:
        report_LinearSVC = classification_report(Ytest, result, digits=4, output_dict=True)
        print(report_LinearSVC)
        dataframe = pd.DataFrame.from_dict(report_LinearSVC)
        dataframe.to_csv('BoW_tfidf/LinearSVC_clasification_report.csv', index=True)
        save_classifier = open('BoW_tfidf/LinearSVC_classifier.pickle', 'wb')  #write in bytes
        pickle.dump(result, save_classifier)
        save_classifier.close()    
    

Obteniendo resultados:
---------- Modelo:  MLP  ---------- Time taken for prediction: 0.6115198135375977 seconds
               precision    recall  f1-score   support

       False     0.9998    0.9287    0.9630     54473
        True     0.1131    0.9841    0.2028       503

    accuracy                         0.9292     54976
   macro avg     0.5565    0.9564    0.5829     54976
weighted avg     0.9917    0.9292    0.9560     54976
 

{'False': {'precision': 0.9998418909838334, 'recall': 0.9287169790538432, 'f1-score': 0.9629678978976121, 'support': 54473}, 'True': {'precision': 0.11306532663316583, 'recall': 0.9840954274353877, 'f1-score': 0.20282728948985865, 'support': 503}, 'accuracy': 0.929223661233993, 'macro avg': {'precision': 0.5564536088084996, 'recall': 0.9564062032446154, 'f1-score': 0.5828975936937354, 'support': 54976}, 'weighted avg': {'precision': 0.9917283757795917, 'recall': 0.929223661233993, 'f1-score': 0.9560130316645449, 'support': 54976}}


  _warn_prf(average, modifier, msg_start, len(result))


---------- Modelo:  SVM  ---------- Time taken for prediction: 128.29665064811707 seconds
               precision    recall  f1-score   support

       False     0.9909    1.0000    0.9954     54473
        True     0.0000    0.0000    0.0000       503

    accuracy                         0.9909     54976
   macro avg     0.4954    0.5000    0.4977     54976
weighted avg     0.9818    0.9909    0.9863     54976
 

{'False': {'precision': 0.990850552968568, 'recall': 1.0, 'f1-score': 0.9954042522087913, 'support': 54473}, 'True': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 503}, 'accuracy': 0.990850552968568, 'macro avg': {'precision': 0.495425276484284, 'recall': 0.5, 'f1-score': 0.4977021261043956, 'support': 54976}, 'weighted avg': {'precision': 0.9817848183181171, 'recall': 0.990850552968568, 'f1-score': 0.9862968537283449, 'support': 54976}}
---------- Modelo:  LinearSVC  ---------- Time taken for prediction: 0.03124260902404785 seconds
               precision 

## Voting (Bag of Words)

In [45]:
ensemble = VotingClassifier(models, n_jobs=-1)
start = time.time() # Start time
voting_model_bow=ensemble.fit(Xtrain_bow,Ytrain)
end = time.time()
elapsed = end - start
print("VotingClassifier - Time taken for training:", elapsed, "seconds")

VotingClassifier - Time taken for training: 10572.805789470673 seconds


In [None]:
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
start = time.time() # Start time
predictions1 = [y for x in [Xtest_bow[i:i+2000,:] for i in range(0,Xtest_bow.shape[0],2000)] for y in voting_model_bow.predict(x)]
end = time.time()
elapsed = end - start
print("VotingClassifier - Time taken for prediction:", elapsed, "seconds")
cr1=classification_report(Ytest, predictions1, digits=4)
print(cr1)

## Voting (Tf-idf)

In [None]:
ensemble = VotingClassifier(models_tfidf, n_jobs=-1)
start = time.time() # Start time
voting_model_tfidf=ensemble.fit(Xtrain_tfidf,Ytrain)
end = time.time()
elapsed = end - start
print("VotingClassifier - Time taken for training:", elapsed, "seconds")

In [None]:
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
start = time.time() # Start time
predictions2 = [y for x in [Xtest_tfidf[i:i+2000,:] for i in range(0,Xtest_tfidf.shape[0],2000)] for y in voting_model_tfidf.predict(x)]
end = time.time()
elapsed = end - start
print("VotingClassifier - Time taken for prediction:", elapsed, "seconds")
cr2=classification_report(Ytest, predictions2, digits=4)
print(cr2)