In [1]:
from platform import python_version
print(python_version())

3.8.5


In [2]:
!jupyter --version

jupyter core     : 4.6.3
jupyter-notebook : 6.1.4
qtconsole        : 4.7.7
ipython          : 7.19.0
ipykernel        : 5.3.4
jupyter client   : 6.1.7
jupyter lab      : 2.2.6
nbconvert        : 6.0.7
ipywidgets       : 7.5.1
nbformat         : 5.0.8
traitlets        : 5.0.5


### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split 
import numpy as np
import pandas as pd
import time
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from scipy import stats
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from bert_serving.client import BertClient
from sklearn.multiclass import OneVsOneClassifier
import gensim
from gensim.utils import simple_preprocess
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim.corpora as corpora
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.sparse import hstack
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.models import TfidfModel
from gensim.models import Word2Vec

### Methods

In [2]:
import warnings
warnings.filterwarnings('ignore')

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def vectorization_TF_IDF(DATASET, text_field_name, label_name):
    
    X_train, X_test, y_train, y_test = train_test_split( DATASET[text_field_name] ,
                                                    DATASET[label_name], 
                                                    train_size=0.80, random_state=28)

    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)

    vectorizer = TfidfVectorizer()
    X_train_v = vectorizer.fit_transform(X_train.apply(str))
    X_test_v = vectorizer.transform(X_test.apply(str))
    
    return X_train_v, X_test_v, y_train, y_test

def saving_results(results, path, file_name):

    results.to_csv(path + file_name,  index = False, sep=";")
    
def train_model(classifier, X_train_v, X_test_v, y_train, y_test):
    ini = time.time()
    classifier.fit(X_train_v, y_train)
    predictions = classifier.predict(X_test_v)
    fim = time.time()
    
    return fim-ini, metrics.accuracy_score(predictions, y_test), predictions

def get_tests_result(X_train_v, X_test_v, y_train, y_test):
    
    all_res = []
    ### MLPClassifier
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, ), random_state=1, verbose=True)
    train_test_time, accuracy, Y_Previsto_MLP = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "MLPClassifier: " + str( accuracy) ) 
    all_res.append(["MLPClassifier: ", train_test_time, accuracy])
    
    ### AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators= 50, learning_rate=1)
    train_test_time, accuracy, Y_Previsto_ada = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "AdaBoostClassifier: " + str( accuracy) ) 
    all_res.append(["AdaBoostClassifier: ", train_test_time, accuracy])
    
    ### Voting_LR3_SVC1_ETC2
    ini = time.time()
    clf2 = LogisticRegression(random_state=0)
    clf5 = SVC(kernel='rbf', probability=True)
    clf6 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    eclf = VotingClassifier(estimators=[ ('lr', clf2), ('svc', clf5), ('etc', clf6)], voting='soft', weights=[3, 1, 2])
    clf2 = clf2.fit(X_train_v, y_train)
    clf5 = clf5.fit(X_train_v, y_train)
    clf6 = clf6.fit(X_train_v, y_train)
    eclf = eclf.fit(X_train_v, y_train)
    Y_previsto_vc1 = eclf.predict(X_test_v.toarray())
    fim = time.time()
    train_test_time = fim - ini
    accuracy = metrics.accuracy_score(y_test, Y_previsto_vc1)
    #print( "Voting_LR3_SVC1_ETC2: " + str( accuracy) ) 
    all_res.append(["Voting_LR3_SVC1_ETC2: ", train_test_time, accuracy])
    
    ### Voting_LR1_SVC1_ETC1
    ini = time.time()
    clf2 = LogisticRegression(random_state=0)
    clf5 = SVC(kernel='rbf', probability=True)
    clf6 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    eclf = VotingClassifier(estimators=[ ('lr', clf2), ('svc', clf5), ('etc', clf6)], voting='soft', weights=[1, 1, 1])
    clf2 = clf2.fit(X_train_v, y_train)
    clf5 = clf5.fit(X_train_v, y_train)
    clf6 = clf6.fit(X_train_v, y_train)
    eclf = eclf.fit(X_train_v, y_train)
    Y_previsto_vc2 = eclf.predict(X_test_v.toarray())
    fim = time.time()
    train_test_time = fim - ini
    accuracy = metrics.accuracy_score(y_test, Y_previsto_vc2)
    #print( "Voting_LR1_SVC1_ETC1: " + str( accuracy) ) 
    all_res.append(["Voting_LR1_SVC1_ETC1: ", train_test_time, accuracy])
    
    ### DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=0)
    train_test_time, accuracy, Y_previsto_DTC = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "DecisionTreeClassifier: " + str( accuracy) ) 
    all_res.append(["DecisionTreeClassifier: ", train_test_time, accuracy])
    
    ### GaussianNB
    clf = GaussianNB()
    train_test_time, accuracy, Y_previsto_NB_G = train_model(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "GaussianNB: " + str( accuracy) ) 
    all_res.append(["GaussianNB: ", train_test_time, accuracy])
    
    ### BernoulliNB
    clf = BernoulliNB()
    train_test_time, accuracy, Y_previsto_NB_B = train_model(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "BernoulliNB: " + str( accuracy) ) 
    all_res.append(["BernoulliNB: ", train_test_time, accuracy])
    
    ### MultinomialNB
    clf = MultinomialNB()
    train_test_time, accuracy, Y_previsto_NB_M = train_model(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "MultinomialNB: " + str( accuracy) ) 
    all_res.append(["MultinomialNB: ", train_test_time, accuracy])
    
    ### RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    train_test_time, accuracy, Y_previsto_RF = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "RandomForestClassifier: " + str( accuracy) ) 
    all_res.append(["RandomForestClassifier: ", train_test_time, accuracy])
    
    ### ExtraTreesClassifier
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    train_test_time, accuracy, Y_previsto_ERT = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "ExtraTreesClassifier: " + str( accuracy) ) 
    all_res.append(["ExtraTreesClassifier: ", train_test_time, accuracy])
    
    ### LogisticRegression
    clf = LogisticRegression(random_state=0)
    train_test_time, accuracy, Y_previsto_LR = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "LogisticRegression: " + str( accuracy) ) 
    all_res.append(["LogisticRegression: ", train_test_time, accuracy])
    
    ### svm
    clf = svm.SVC()
    train_test_time, accuracy, Y_previsto_SVM = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm: " + str( accuracy) ) 
    all_res.append(["svm: ", train_test_time, accuracy])
    
    ### svm_rbf
    clf = svm.SVC(kernel='rbf', probability=True)
    train_test_time, accuracy, Y_previsto_svc = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm_rbf: " + str( accuracy) ) 
    all_res.append(["svm_rbf: ", train_test_time, accuracy])
        
    ### Stacking 1
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_vc1[a] ) )
        elements.append( np.floor( Y_previsto_vc2[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        elements.append( np.floor( Y_previsto_LR[a] ) )
        elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_1: ", 0 , accuracy])
    
    ### Stacking 2
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        #elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_vc1[a] ) )
        elements.append( np.floor( Y_previsto_vc2[a] ) )
        #elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        #elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_2: ", 0 , accuracy])
    
    ### Stacking 3
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        #elements.append( np.floor( Y_previsto_vc1[a] ) )
        elements.append( np.floor( Y_previsto_vc2[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        #elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_3: ", 0 , accuracy]) 
    
    ### Stacking_scikit
    clf1 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf2 = LogisticRegression(random_state=0)
    clf3 = SVC()
    clf4 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3), ('etc', clf4)]
    ini = time.time()
    clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(), n_jobs=None)
    clf.fit(X_train_v, y_train)
    Y_Previsto_stacking = clf.predict(X_test_v)
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(y_test, Y_Previsto_stacking)
    all_res.append(["Stacking_scikit: ", train_test_time , accuracy])
    
    ### XGBClassifier
    clf = XGBClassifier(eval_metric='mlogloss')
    train_test_time, accuracy, Y_previsto_svc = train_model(clf, X_train_v.tocsc(), X_test_v.tocsc(), y_train, y_test)
    #print( "XGBClassifier: " + str( accuracy) ) 
    all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    
    ### OneVsRestClassifier_RF
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    ovr = OneVsRestClassifier(clf)
    ini = time.time()
    ovr.fit(X_train_v, y_train)
    Y_Previsto_OVR_RF = ovr.predict(X_test_v)
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(Y_Previsto_OVR_RF, y_test)
    all_res.append(["OvR_RF: ", train_test_time , accuracy])
    
    return pd.DataFrame( all_res, columns=["ALGORITHM","TRAIN_TEST_TIME", "ACCURACY"]  )

def train_model_ovo(classifier, X_train_v, X_test_v, y_train, y_test):
    
    ini = time.time()
    ovo = OneVsOneClassifier(classifier)
    ovo.fit(X_train_v, y_train)
    predictions = ovo.predict(X_test_v)
    fim = time.time()
    
    return fim-ini, metrics.accuracy_score(predictions, y_test), predictions

def get_tests_result_ovo(X_train_v, X_test_v, y_train, y_test):
    
    all_res = []
    ### MLPClassifier
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, ), random_state=1, verbose=True)
    train_test_time, accuracy, Y_Previsto_MLP = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "MLPClassifier: " + str( accuracy) ) 
    all_res.append(["MLPClassifier: ", train_test_time, accuracy])
    
    ### AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators= 50, learning_rate=1)
    train_test_time, accuracy, Y_Previsto_ada = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "AdaBoostClassifier: " + str( accuracy) ) 
    all_res.append(["AdaBoostClassifier: ", train_test_time, accuracy])
    
    ### Voting_LR3_SVC1_ETC2
    all_res.append(["Voting_LR3_SVC1_ETC2: ", 0 , 0 ])
    
    ### Voting_LR1_SVC1_ETC1
    all_res.append(["Voting_LR1_SVC1_ETC1: ", 0, 0])
    
    ### DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=0)
    train_test_time, accuracy, Y_previsto_DTC = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "DecisionTreeClassifier: " + str( accuracy) ) 
    all_res.append(["DecisionTreeClassifier: ", train_test_time, accuracy])
    
    ### GaussianNB
    clf = GaussianNB()
    train_test_time, accuracy, Y_previsto_NB_G = train_model_ovo(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "GaussianNB: " + str( accuracy) ) 
    all_res.append(["GaussianNB: ", train_test_time, accuracy])
    
    ### BernoulliNB
    clf = BernoulliNB()
    train_test_time, accuracy, Y_previsto_NB_B = train_model_ovo(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "BernoulliNB: " + str( accuracy) ) 
    all_res.append(["BernoulliNB: ", train_test_time, accuracy])
    
    ### MultinomialNB
    clf = MultinomialNB()
    train_test_time, accuracy, Y_previsto_NB_M = train_model_ovo(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "MultinomialNB: " + str( accuracy) ) 
    all_res.append(["MultinomialNB: ", train_test_time, accuracy])
    
    ### RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    train_test_time, accuracy, Y_previsto_RF = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "RandomForestClassifier: " + str( accuracy) ) 
    all_res.append(["RandomForestClassifier: ", train_test_time, accuracy])
    
    ### ExtraTreesClassifier
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    train_test_time, accuracy, Y_previsto_ERT = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "ExtraTreesClassifier: " + str( accuracy) ) 
    all_res.append(["ExtraTreesClassifier: ", train_test_time, accuracy])
    
    ### LogisticRegression
    clf = LogisticRegression(random_state=0)
    train_test_time, accuracy, Y_previsto_LR = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "LogisticRegression: " + str( accuracy) ) 
    all_res.append(["LogisticRegression: ", train_test_time, accuracy])
    
    ### svm
    clf = svm.SVC()
    train_test_time, accuracy, Y_previsto_SVM = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm: " + str( accuracy) ) 
    all_res.append(["svm: ", train_test_time, accuracy])
    
    ### svm_rbf
    clf = svm.SVC(kernel='rbf', probability=True)
    train_test_time, accuracy, Y_previsto_svc = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm_rbf: " + str( accuracy) ) 
    all_res.append(["svm_rbf: ", train_test_time, accuracy])
        
    ### Stacking 1
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        elements.append( np.floor( Y_previsto_LR[a] ) )
        elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_1: ", 0 , accuracy])
    
    ### Stacking 2
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        #elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        #elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        #elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_2: ", 0 , accuracy])
    
    ### Stacking 3
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        #elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_3: ", 0 , accuracy]) 
    
    ### Stacking_scikit
    clf1 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf2 = LogisticRegression(random_state=0)
    clf3 = SVC()
    clf4 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3), ('etc', clf4)]
    ini = time.time()
    clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(), n_jobs=None)
    
    ovo = OneVsOneClassifier(clf)
    ovo.fit(X_train_v, y_train)
    Y_Previsto_stacking = ovo.predict(X_test_v)
    
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(y_test, Y_Previsto_stacking)
    all_res.append(["Stacking_scikit_ovo: ", train_test_time , accuracy])
    
    ### XGBClassifier
    clf = XGBClassifier(eval_metric='mlogloss')
    train_test_time, accuracy, Y_previsto_svc = train_model_ovo(clf, X_train_v.tocsc(), X_test_v.tocsc(), y_train, y_test)
    #print( "XGBClassifier: " + str( accuracy) ) 
    all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    
    return pd.DataFrame( all_res, columns=["ALGORITHM","TRAIN_TEST_TIME", "ACCURACY"]  )
    
    
def train_model_ovr(classifier, X_train_v, X_test_v, y_train, y_test):
    
    ini = time.time()
    ovr = OneVsRestClassifier(classifier)
    try:
        ovr.fit(X_train_v, y_train)
    except:
        ovr.fit(X_train_v.toarray(), y_train)
    predictions = ovr.predict(X_test_v)
    fim = time.time()
    
    return fim-ini, metrics.accuracy_score(predictions, y_test), predictions

def get_tests_result_ovr(X_train_v, X_test_v, y_train, y_test):
    
    all_res = []    
    ### MLPClassifier
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, ), random_state=1, verbose=True)
    train_test_time, accuracy, Y_Previsto_MLP = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "MLPClassifier: " + str( accuracy) ) 
    all_res.append(["MLPClassifier: ", train_test_time, accuracy])
    
    ### AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators= 50, learning_rate=1)
    train_test_time, accuracy, Y_Previsto_ada = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "AdaBoostClassifier: " + str( accuracy) ) 
    all_res.append(["AdaBoostClassifier: ", train_test_time, accuracy])
    
    ### Voting_LR3_SVC1_ETC2
    all_res.append(["Voting_LR3_SVC1_ETC2: ", 0 , 0 ])
    
    ### Voting_LR1_SVC1_ETC1
    all_res.append(["Voting_LR1_SVC1_ETC1: ", 0, 0])
    
    ### DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=0)
    train_test_time, accuracy, Y_previsto_DTC = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "DecisionTreeClassifier: " + str( accuracy) ) 
    all_res.append(["DecisionTreeClassifier: ", train_test_time, accuracy])
    
    ### GaussianNB
    Y_previsto_NB_G = 1
    try:
        clf = GaussianNB()
        train_test_time, accuracy, Y_previsto_NB_G = train_model_ovr(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
        #print( "GaussianNB: " + str( accuracy) ) 
        all_res.append(["GaussianNB: ", train_test_time, accuracy])
    except:
        clf = GaussianNB()
        train_test_time, accuracy, Y_previsto_NB_G = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "GaussianNB: " + str( accuracy) ) 
        all_res.append(["GaussianNB: ", train_test_time, accuracy])
      
    
    ### BernoulliNB
    Y_previsto_NB_B = 1
    try:
        clf = BernoulliNB()
        train_test_time, accuracy, Y_previsto_NB_B = train_model_ovr(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
        #print( "BernoulliNB: " + str( accuracy) ) 
        all_res.append(["BernoulliNB: ", train_test_time, accuracy])
    except:
        clf = BernoulliNB()
        train_test_time, accuracy, Y_previsto_NB_B = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "BernoulliNB: " + str( accuracy) ) 
        all_res.append(["BernoulliNB: ", train_test_time, accuracy])
    
    ### MultinomialNB
    Y_previsto_NB_M = 1
    try:
        clf = MultinomialNB()
        train_test_time, accuracy, Y_previsto_NB_M = train_model_ovr(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
        #print( "MultinomialNB: " + str( accuracy) ) 
        all_res.append(["MultinomialNB: ", train_test_time, accuracy])
    except:
        clf = MultinomialNB()
        train_test_time, accuracy, Y_previsto_NB_M = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "MultinomialNB: " + str( accuracy) ) 
        all_res.append(["MultinomialNB: ", train_test_time, accuracy])
    
    ### RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    train_test_time, accuracy, Y_previsto_RF = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "RandomForestClassifier: " + str( accuracy) ) 
    all_res.append(["RandomForestClassifier: ", train_test_time, accuracy])
    
    ### ExtraTreesClassifier
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    train_test_time, accuracy, Y_previsto_ERT = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "ExtraTreesClassifier: " + str( accuracy) ) 
    all_res.append(["ExtraTreesClassifier: ", train_test_time, accuracy])
    
    ### LogisticRegression
    clf = LogisticRegression(random_state=0)
    train_test_time, accuracy, Y_previsto_LR = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "LogisticRegression: " + str( accuracy) ) 
    all_res.append(["LogisticRegression: ", train_test_time, accuracy])
    
    ### svm
    clf = svm.SVC()
    train_test_time, accuracy, Y_previsto_SVM = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm: " + str( accuracy) ) 
    all_res.append(["svm: ", train_test_time, accuracy])
    
    ### svm_rbf
    clf = svm.SVC(kernel='rbf', probability=True)
    train_test_time, accuracy, Y_previsto_svc = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm_rbf: " + str( accuracy) ) 
    all_res.append(["svm_rbf: ", train_test_time, accuracy])
        
    ### Stacking 1
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        elements.append( np.floor( Y_previsto_LR[a] ) )
        elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_1: ", 0 , accuracy])
    
    ### Stacking 2
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        #elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        #elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        #elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_2: ", 0 , accuracy])
    
    ### Stacking 3
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        #elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_3: ", 0 , accuracy]) 
    
    ### Stacking_scikit
    clf1 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf2 = LogisticRegression(random_state=0)
    clf3 = SVC()
    clf4 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3), ('etc', clf4)]
    ini = time.time()
    clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(), n_jobs=None)
    
    ovr = OneVsRestClassifier(clf)
    ovr.fit(X_train_v, y_train)
    Y_Previsto_stacking = ovr.predict(X_test_v)
    
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(y_test, Y_Previsto_stacking)
    all_res.append(["Stacking_scikit_ovr: ", train_test_time , accuracy])
    
    ### XGBClassifier
    Y_previsto_xgbc = 1
    try:
        clf = XGBClassifier(eval_metric='mlogloss')
        train_test_time, accuracy, Y_previsto_xgbc = train_model_ovr(clf, X_train_v.tocsc(), X_test_v.tocsc(), y_train, y_test)
        #print( "XGBClassifier: " + str( accuracy) ) 
        all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    except:
        clf = XGBClassifier(eval_metric='mlogloss')
        train_test_time, accuracy, Y_previsto_xgbc = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "XGBClassifier: " + str( accuracy) ) 
        all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    
    return pd.DataFrame( all_res, columns=["ALGORITHM","TRAIN_TEST_TIME", "ACCURACY"]  )

### Opening exemple Dataset

In [4]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
DATASET.head(4)

Unnamed: 0,RELATOCLIENTE,PROBLEMA,RELATOCLIENTE_CLEAN
0,cliente entrou em contato informando que está ...,Queda / Intermitência,cliente entrou contato informando esta sem sin...
1,CLIENTE COM QUEDAS REALIZEI OS TESTE E ENCAMIN...,Queda / Intermitência,cliente com quedas realizei teste encaminhei s...
2,"Cliente reclama de quedas e intermitência , pr...",Queda / Intermitência,cliente reclama quedas intermitencia procedime...
3,CLIENTE INFORMA QUE INTERNET ESTA COM QUEDAS H...,Queda / Intermitência,cliente informa internet esta com quedas mais ...


### Test 0 with the exemple dataset

In [80]:
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.234905,0.616667
1,AdaBoostClassifier:,0.078104,0.533333
2,Voting_LR3_SVC1_ETC2:,0.374932,0.65
3,Voting_LR1_SVC1_ETC1:,0.374116,0.65
4,DecisionTreeClassifier:,0.015619,0.583333
5,GaussianNB:,0.0,0.5
6,BernoulliNB:,0.0,0.616667
7,MultinomialNB:,0.0,0.633333
8,RandomForestClassifier:,0.062485,0.666667
9,ExtraTreesClassifier:,0.124585,0.633333


### Test 1 with the complete dataset
* Todas as palavras da base;
* Removendo apenas stopwords;
* 3000 registros para cada classe de problema;

In [4]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,22.525683,0.8075
1,AdaBoostClassifier:,1.187195,0.729444
2,Voting_LR3_SVC1_ETC2:,232.752228,0.845833
3,Voting_LR1_SVC1_ETC1:,229.672287,0.844722
4,DecisionTreeClassifier:,1.187206,0.7925
5,GaussianNB:,4.31149,0.522222
6,BernoulliNB:,1.765204,0.755278
7,MultinomialNB:,0.687306,0.766667
8,RandomForestClassifier:,9.506714,0.833056
9,ExtraTreesClassifier:,27.601219,0.837222


### Test 2 with the complete dataset
* Todas as palavras da base;
* Removendo apenas stopwords;
* 7000 registros para cada classe de problema;

In [5]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,44.014814,0.820476
1,AdaBoostClassifier:,2.796191,0.728095
2,Voting_LR3_SVC1_ETC2:,1023.333115,0.84619
3,Voting_LR1_SVC1_ETC1:,1021.191428,0.847857
4,DecisionTreeClassifier:,4.787648,0.803095
5,GaussianNB:,48.597214,0.489167
6,BernoulliNB:,9.263345,0.764881
7,MultinomialNB:,2.827443,0.774524
8,RandomForestClassifier:,37.561203,0.841071
9,ExtraTreesClassifier:,109.020018,0.840595


### Test 3 with the complete dataset
* Removendo as 6 palavras mais frequentes da base;
* Removendo stopwords;
* 3000 registros para cada classe de problema;


In [6]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,21.032017,0.760278
1,AdaBoostClassifier:,0.968526,0.628611
2,Voting_LR3_SVC1_ETC2:,217.203632,0.792222
3,Voting_LR1_SVC1_ETC1:,217.516733,0.789722
4,DecisionTreeClassifier:,1.124703,0.735833
5,GaussianNB:,4.022254,0.511667
6,BernoulliNB:,1.702696,0.697778
7,MultinomialNB:,0.6717,0.711944
8,RandomForestClassifier:,9.653968,0.779167
9,ExtraTreesClassifier:,28.250865,0.778333


### Test 4 with the complete dataset
* Base composta pelas 700 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 3000 registros para cada classe de problema;


In [7]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,7.648513,0.8025
1,AdaBoostClassifier:,0.906034,0.714722
2,Voting_LR3_SVC1_ETC2:,133.119194,0.826111
3,Voting_LR1_SVC1_ETC1:,133.209067,0.826389
4,DecisionTreeClassifier:,0.718576,0.769167
5,GaussianNB:,0.343667,0.615833
6,BernoulliNB:,0.140561,0.750556
7,MultinomialNB:,0.046831,0.7475
8,RandomForestClassifier:,4.545769,0.818333
9,ExtraTreesClassifier:,11.325487,0.821111


### Test 5 with the complete dataset
* Base composta pelas 700 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 7000 registros para cada classe de problema;


In [8]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,16.311365,0.809167
1,AdaBoostClassifier:,2.253941,0.736905
2,Voting_LR3_SVC1_ETC2:,567.599483,0.835833
3,Voting_LR1_SVC1_ETC1:,569.039031,0.839286
4,DecisionTreeClassifier:,2.874319,0.792976
5,GaussianNB:,1.187207,0.551429
6,BernoulliNB:,0.531105,0.752976
7,MultinomialNB:,0.140596,0.755
8,RandomForestClassifier:,16.049357,0.83131
9,ExtraTreesClassifier:,42.435655,0.83381


### Test 6 with the complete dataset
* Base composta pelas 4000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 3000 registros para cada classe de problema;


In [9]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,10.767277,0.797778
1,AdaBoostClassifier:,1.031028,0.719444
2,Voting_LR3_SVC1_ETC2:,180.855142,0.831944
3,Voting_LR1_SVC1_ETC1:,180.751031,0.8325
4,DecisionTreeClassifier:,0.921629,0.7775
5,GaussianNB:,1.218483,0.579722
6,BernoulliNB:,0.437374,0.754722
7,MultinomialNB:,0.187454,0.761944
8,RandomForestClassifier:,5.93609,0.822778
9,ExtraTreesClassifier:,15.566128,0.826667


### Test 7 with the complete dataset
* Base composta pelas 4000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 7000 registros para cada classe de problema;

In [10]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,20.798002,0.824881
1,AdaBoostClassifier:,2.53065,0.735119
2,Voting_LR3_SVC1_ETC2:,775.397416,0.847024
3,Voting_LR1_SVC1_ETC1:,774.824264,0.848929
4,DecisionTreeClassifier:,3.811598,0.797738
5,GaussianNB:,3.296098,0.590714
6,BernoulliNB:,1.218468,0.764881
7,MultinomialNB:,0.499856,0.770714
8,RandomForestClassifier:,20.413749,0.838214
9,ExtraTreesClassifier:,53.21635,0.842143


### Test 8 with the complete dataset
* Todas as palavras da base;
* Removendo stopwords;
* BERT as service para português;
* 3000 registros para cada classe de problema;

In [None]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train, X_test, y_train, y_test = train_test_split( DATASET["RELATOCLIENTE_CLEAN"] ,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.8, random_state=42)

bc = BertClient()
X_train_bert = bc.encode(X_train.tolist())
X_test_bert = bc.encode(X_test.tolist())
X_train_v = X_train_bert.copy()
X_test_v = X_test_bert.copy()

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

### Test 9 with the complete dataset
* Base composta pelas 5000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 7000 registros para cada classe de problema;
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;


In [7]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,21.357614,0.841608
1,AdaBoostClassifier:,2.614887,0.748437
2,Voting_LR3_SVC1_ETC2:,827.047537,0.868824
3,Voting_LR1_SVC1_ETC1:,851.638438,0.871644
4,DecisionTreeClassifier:,3.955573,0.82371
5,GaussianNB:,4.040643,0.611867
6,BernoulliNB:,1.586694,0.785093
7,MultinomialNB:,0.581613,0.797352
8,RandomForestClassifier:,21.848946,0.862082
9,ExtraTreesClassifier:,57.878238,0.86674


### Test 10 with the complete dataset
* Base composta pelas 10.000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 7000 registros para cada classe de problema;
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [8]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,25.286812,0.847125
1,AdaBoostClassifier:,2.717698,0.752973
2,Voting_LR3_SVC1_ETC2:,895.120084,0.873973
3,Voting_LR1_SVC1_ETC1:,852.675155,0.877283
4,DecisionTreeClassifier:,4.014678,0.828736
5,GaussianNB:,6.021779,0.584038
6,BernoulliNB:,2.452552,0.787912
7,MultinomialNB:,0.874766,0.800049
8,RandomForestClassifier:,23.470762,0.870541
9,ExtraTreesClassifier:,64.24999,0.871889


### Test 11 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: TF-IDF;
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;


In [9]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,43.982335,0.820476
1,AdaBoostClassifier:,2.780555,0.728095
2,Voting_LR3_SVC1_ETC2:,1019.636085,0.84631
3,Voting_LR1_SVC1_ETC1:,1020.984297,0.847619
4,DecisionTreeClassifier:,4.826985,0.803095
5,GaussianNB:,18.308182,0.489167
6,BernoulliNB:,9.130423,0.764881
7,MultinomialNB:,2.88992,0.774524
8,RandomForestClassifier:,37.389365,0.841071
9,ExtraTreesClassifier:,108.988802,0.840595


### Test 12 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: CountVectorizer;
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [10]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')

def cv(data):
    count_vectorizer = CountVectorizer()

    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

list_corpus = DATASET["RELATOCLIENTE_CLEAN2"].tolist()
list_labels = DATASET["PROBLEMA_N"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, train_size=0.80, 
                                                                                random_state=28)
X_train_v, count_vectorizer = cv(X_train)
X_test_v = count_vectorizer.transform(X_test)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,44.799707,0.854113
1,AdaBoostClassifier:,1.568557,0.741326
2,Voting_LR3_SVC1_ETC2:,979.443391,0.871767
3,Voting_LR1_SVC1_ETC1:,979.797481,0.87336
4,DecisionTreeClassifier:,3.844656,0.831678
5,GaussianNB:,17.186179,0.457153
6,BernoulliNB:,44.483205,0.78166
7,MultinomialNB:,40.640288,0.779331
8,RandomForestClassifier:,43.309963,0.874954
9,ExtraTreesClassifier:,112.096646,0.87569


### Test 13 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: CountVectorizer;
* Stemmed_RSLP ( radicais das palavras)
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [11]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')

def cv(data):
    count_vectorizer = CountVectorizer()

    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

list_corpus = DATASET["RELATOCLIENTE_CLEAN"].tolist()
list_labels = DATASET["PROBLEMA"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, train_size=0.80, 
                                                                                random_state=28)
X_train_v, count_vectorizer = cv(X_train)
X_test_v = count_vectorizer.transform(X_test)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,44.671921,0.854113
1,AdaBoostClassifier:,1.568852,0.741326
2,Voting_LR3_SVC1_ETC2:,980.240971,0.871644
3,Voting_LR1_SVC1_ETC1:,977.95193,0.87336
4,DecisionTreeClassifier:,3.857208,0.831678
5,GaussianNB:,17.01232,0.457153
6,BernoulliNB:,44.407019,0.78166
7,MultinomialNB:,40.612171,0.779331
8,RandomForestClassifier:,43.229818,0.874954
9,ExtraTreesClassifier:,112.225228,0.87569


### Test 14 with the complete dataset
* Base composta pelas 10.000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: TF-IDF
* Abordagem One vs One
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [3]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")

results = get_tests_result_ovo(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,161.366183,0.841486
1,AdaBoostClassifier:,35.083257,0.81758
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,9.281981,0.832904
5,GaussianNB:,47.473135,0.58931
6,BernoulliNB:,25.576598,0.787912
7,MultinomialNB:,8.850272,0.800049
8,RandomForestClassifier:,60.533612,0.870173
9,ExtraTreesClassifier:,230.284074,0.87005


### Test 15 with the complete dataset
* Base composta pelas 10.000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: TF-IDF
* Abordagem One vs Rest
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [6]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")

results = get_tests_result_ovr(X_train_v, X_test_v, y_train, y_test)
results
#### ... pasta 7

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,128.284923,0.853868
1,AdaBoostClassifier:,14.277206,0.799068
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,34.466916,0.802869
5,GaussianNB:,24.768145,0.377835
6,BernoulliNB:,13.213651,0.796371
7,MultinomialNB:,3.695135,0.800172
8,RandomForestClassifier:,64.141342,0.87238
9,ExtraTreesClassifier:,196.13537,0.873238


### Test 16 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: CountVectorizer
* Abordagem One vs Rest
* Staking Scikit-Learn

In [17]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')


def cv(data):
    count_vectorizer = CountVectorizer()

    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

list_corpus = DATASET["RELATOCLIENTE_CLEAN"].tolist()
list_labels = DATASET["PROBLEMA"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, train_size=0.80, 
                                                                                random_state=28)
X_train_v, count_vectorizer = cv(X_train)
X_test_v = count_vectorizer.transform(X_test)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result_ovr(X_train_v, X_test_v, y_train, y_test)
results
#### ... pasta 7

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,207.553618,0.868334
1,AdaBoostClassifier:,8.248807,0.802501
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,15.962435,0.820767
5,GaussianNB:,92.246801,0.397205
6,BernoulliNB:,144.855197,0.791467
7,MultinomialNB:,86.603116,0.787912
8,RandomForestClassifier:,107.395313,0.880348
9,ExtraTreesClassifier:,283.766477,0.881819


### Test 17 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
* gensim.corpora.Dictionary e bag of words;
* técnica para topic modelling: Latent Dirichlet Allocation (LDA)
* 30 tópicos 
* Usando somente as colunas dos 30 tópicos criados


In [18]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
DATASET["RELATOCLIENTE_CLEAN_T"] = list(sent_to_words(DATASET.RELATOCLIENTE_CLEAN.values.tolist()))
# Create Dictionary
id2word = corpora.Dictionary(DATASET.RELATOCLIENTE_CLEAN_T.values.tolist())
# Create Corpus
texts = DATASET.RELATOCLIENTE_CLEAN_T.values.tolist()
corpus = [id2word.doc2bow(text) for text in texts]
# number of topics
num_topics = 30
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       workers=2)

# DATASET with topics
for a in range( num_topics ):
    column = "TP" + str(a + 1)
    DATASET[column] = " "

for i in range(len(DATASET)):
    top_topics = (
        lda_model.get_document_topics(corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[j][1] for j in range(num_topics)]
    
    for a in range(num_topics):
        column = "TP" + str(a + 1)
        DATASET[column].iloc[i] = topic_vec[a]

column = []
for a in range(num_topics):
    column.append("TP" + str(a + 1))

X = np.array(DATASET[column])
y = np.array(DATASET.PROBLEMA)        
        
kf = KFold(50, shuffle=True, random_state=42)

for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logisitic Regression
    lr = LogisticRegression(
        class_weight= 'balanced',
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train)

    y_pred = lr.predict(X_val_scale)
    
    p_ac = metrics.accuracy_score(y_val,  y_pred)
    print("Acurácia: " + str(p_ac))


Acurácia: 0.5428921568627451
Acurácia: 0.5416666666666666
Acurácia: 0.5379901960784313
Acurácia: 0.5281862745098039
Acurácia: 0.5306372549019608
Acurácia: 0.5453431372549019
Acurácia: 0.5379901960784313
Acurácia: 0.5294117647058824
Acurácia: 0.5232843137254902
Acurácia: 0.5208333333333334
Acurácia: 0.553921568627451
Acurácia: 0.5294117647058824
Acurácia: 0.5220588235294118
Acurácia: 0.5294117647058824
Acurácia: 0.5343137254901961
Acurácia: 0.5490196078431373
Acurácia: 0.5306372549019608
Acurácia: 0.5232843137254902
Acurácia: 0.5416666666666666
Acurácia: 0.5232843137254902
Acurácia: 0.5490196078431373
Acurácia: 0.5171568627450981
Acurácia: 0.5686274509803921
Acurácia: 0.5392156862745098
Acurácia: 0.5220588235294118
Acurácia: 0.5453431372549019
Acurácia: 0.5416666666666666
Acurácia: 0.5134803921568627
Acurácia: 0.5318627450980392
Acurácia: 0.5281862745098039
Acurácia: 0.5122549019607843
Acurácia: 0.5257352941176471
Acurácia: 0.5349693251533743
Acurácia: 0.5730061349693252
Acurácia: 0.534

### Test 18 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* técnica para topic modelling: Latent Dirichlet Allocation (LDA)
* Usando 
    * 30 tópicos
    * texto(TF-IDF) variando max_features


In [20]:
for a in range(10, 1000 , 10):
    vectorizer = TfidfVectorizer( max_features= a)
    XX = vectorizer.fit_transform(DATASET["RELATOCLIENTE_CLEAN"])
    yy = DATASET["PROBLEMA"]
    XXX = hstack((XX, X.astype(float)))

    X_train, X_test, y_train, y_test = train_test_split( XXX,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)

    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)
    
    lr = LogisticRegression(random_state=0).fit(X_train, y_train)
    Y_previsto_LR = lr.predict(X_test)

    # Análise da previsão
    p_ac = metrics.accuracy_score(y_test, np.floor( Y_previsto_LR) )
    print("max_features: "+ str(a) + " " + "#Acurácia: {:.4f}".format(p_ac) )

max_features: 10 #Acurácia: 0.6388
max_features: 20 #Acurácia: 0.6935
max_features: 30 #Acurácia: 0.7036
max_features: 40 #Acurácia: 0.7334
max_features: 50 #Acurácia: 0.7369
max_features: 60 #Acurácia: 0.7549
max_features: 70 #Acurácia: 0.7585
max_features: 80 #Acurácia: 0.7676
max_features: 90 #Acurácia: 0.7690
max_features: 100 #Acurácia: 0.7738
max_features: 110 #Acurácia: 0.7787
max_features: 120 #Acurácia: 0.7802
max_features: 130 #Acurácia: 0.7826
max_features: 140 #Acurácia: 0.7873
max_features: 150 #Acurácia: 0.7878
max_features: 160 #Acurácia: 0.7912
max_features: 170 #Acurácia: 0.7956
max_features: 180 #Acurácia: 0.7976
max_features: 190 #Acurácia: 0.8009
max_features: 200 #Acurácia: 0.8038
max_features: 210 #Acurácia: 0.8064
max_features: 220 #Acurácia: 0.8074
max_features: 230 #Acurácia: 0.8118
max_features: 240 #Acurácia: 0.8132
max_features: 250 #Acurácia: 0.8160
max_features: 260 #Acurácia: 0.8195
max_features: 270 #Acurácia: 0.8209
max_features: 280 #Acurácia: 0.8213
m

### Test 19 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* técnica para topic modelling: Latent Dirichlet Allocation (LDA)
* Usando 
    * max_features  = 870
    * Variando número de tópicos


In [21]:
for num_topics in range(2, 60 , 4):

# Build LDA model
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       workers=2)
    rev_train = DATASET
    lda_train = lda_model


    for a in range( num_topics ):
        column = "TP" + str(a + 1)
        rev_train[column] = " "

    for i in range(len(rev_train)):
        top_topics = (
            lda_train.get_document_topics(corpus[i],
                                      minimum_probability=0.0)
        )
        topic_vec = [top_topics[j][1] for j in range(num_topics)]
    
        for a in range(num_topics):
            column = "TP" + str(a + 1)
            rev_train[column].iloc[i] = topic_vec[a]
         
    column = []
    for a in range(num_topics):
        column.append("TP" + str(a + 1))

    X = np.array(rev_train[column])
    y = np.array(rev_train.PROBLEMA)
    
    vectorizer = TfidfVectorizer( max_features= 930)
    XX = vectorizer.fit_transform(DATASET["RELATOCLIENTE_CLEAN"])
    yy = DATASET["PROBLEMA"]
    XXX = hstack((XX, X.astype(float)))

    X_train, X_test, y_train, y_test = train_test_split( XXX,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)
    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)
    
    lr = LogisticRegression(random_state=0).fit(X_train, y_train)
    Y_previsto_LR = lr.predict(X_test)

    # Análise da previsão
    p_ac = metrics.accuracy_score(y_test, np.floor( Y_previsto_LR) )
    print("num_topics: "+ str(num_topics) + " " + "#Acurácia: {:.4f}".format(p_ac) ) 

num_topics: 2 #Acurácia: 0.8453
num_topics: 6 #Acurácia: 0.8458
num_topics: 10 #Acurácia: 0.8454
num_topics: 14 #Acurácia: 0.8459
num_topics: 18 #Acurácia: 0.8474
num_topics: 22 #Acurácia: 0.8464
num_topics: 26 #Acurácia: 0.8465
num_topics: 30 #Acurácia: 0.8465
num_topics: 34 #Acurácia: 0.8471
num_topics: 38 #Acurácia: 0.8468
num_topics: 42 #Acurácia: 0.8492
num_topics: 46 #Acurácia: 0.8461
num_topics: 50 #Acurácia: 0.8468
num_topics: 54 #Acurácia: 0.8482
num_topics: 58 #Acurácia: 0.8479


### Test 20 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* gensim.similarities	
    * Word Embedding Similarity Index
* Similaridade entre as string escolhidas para representar as classes de problemas


In [35]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
DATASET["RELATOCLIENTE_CLEAN_T"] = list(sent_to_words(DATASET.RELATOCLIENTE_CLEAN.values.tolist()))
# Create Dictionary
id2word = corpora.Dictionary(DATASET.RELATOCLIENTE_CLEAN_T.values.tolist())
# Create Corpus
texts = DATASET.RELATOCLIENTE_CLEAN_T.values.tolist()
corpus = [id2word.doc2bow(text) for text in texts]
dictionary = Dictionary(DATASET.RELATOCLIENTE_CLEAN_T.values.tolist() )
tfidf = TfidfModel(dictionary=dictionary)
w2v_model = Word2Vec( DATASET.RELATOCLIENTE_CLEAN_T.values.tolist(), workers=2, min_count=5, seed=12345)
similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf, nonzero_limit=100)

s_modem_sem_sincronismo = "conexao internet massiva sincronismo modem status sucesso testes procedimento telefone".lower().split()
s_massiva = "massiva rede interrupcao ntt aberto primaria prazo gpon rftth".lower().split()
s_modem_sincronizado_e_autenticado = "conexao internet modem procedimento massiva sucesso sincronizado status acesso testes".lower().split()
s_parametros_ruins = "status attenuation margin parametros noise ont indicator conexao velocidade ruins".lower().split()
s_baixa_velocidade = "ping upload download velocidade teste lentidao testes baixa cabo reclama".lower().split()
s_queda_intermitencia = "quedas status conexao reinit internet attenuation ont ngasp power procedimentos".lower().split()

s_modem_sem_sincronismo = id2word.doc2bow(s_modem_sem_sincronismo)
s_massiva = id2word.doc2bow(s_massiva)
s_modem_sincronizado_e_autenticado = id2word.doc2bow(s_modem_sincronizado_e_autenticado)
s_parametros_ruins = id2word.doc2bow(s_parametros_ruins)
s_baixa_velocidade = id2word.doc2bow(s_baixa_velocidade)
s_queda_intermitencia = id2word.doc2bow(s_queda_intermitencia)

s0 = s_modem_sem_sincronismo
s1 = s_massiva
s2 = s_modem_sincronizado_e_autenticado
s3 = s_parametros_ruins
s4 = s_baixa_velocidade
s5 = s_queda_intermitencia

ss = [s0, s1, s2, s3, s4, s5]

DATASET["S0"] = 0.0
DATASET["S1"] = 0.0
DATASET["S2"] = 0.0
DATASET["S3"] = 0.0
DATASET["S4"] = 0.0
DATASET["S5"] = 0.0

for a in range(len(DATASET)):
    doc_vec = DATASET.iloc(0)[a][3]
    doc_bow = id2word.doc2bow( doc_vec)
        
    DATASET["S0"].iloc[a] = similarity_matrix.inner_product( doc_bow , s0, normalized=(True, True))
    DATASET["S1"].iloc[a] = similarity_matrix.inner_product( doc_bow , s1, normalized=(True, True))
    DATASET["S2"].iloc[a] = similarity_matrix.inner_product( doc_bow , s2, normalized=(True, True))
    DATASET["S3"].iloc[a] = similarity_matrix.inner_product( doc_bow , s3, normalized=(True, True))
    DATASET["S4"].iloc[a] = similarity_matrix.inner_product( doc_bow , s4, normalized=(True, True))
    DATASET["S5"].iloc[a] = similarity_matrix.inner_product( doc_bow , s5, normalized=(True, True))

X = np.array(DATASET[["S0", "S1", "S2", "S3", "S4", "S5" ]])
y = np.array(DATASET.PROBLEMA)

X_train, X_test, y_train, y_test = train_test_split( X,
                                                     y, 
                                                     train_size=0.80, random_state=28)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

100%|██████████████████████████████████████████████████████████████████████████| 22774/22774 [00:04<00:00, 5080.51it/s]


Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,44.011303,0.855584
1,AdaBoostClassifier:,1.551394,0.748192
2,Voting_LR3_SVC1_ETC2:,980.843357,0.870663
3,Voting_LR1_SVC1_ETC1:,1019.340667,0.872747
4,DecisionTreeClassifier:,3.955566,0.833885
5,GaussianNB:,23.041739,0.455437
6,BernoulliNB:,52.897381,0.78117
7,MultinomialNB:,43.558101,0.778718
8,RandomForestClassifier:,45.540468,0.871889
9,ExtraTreesClassifier:,117.374906,0.875812


### Test 21 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* gensim.similarities	
    * Word Embedding Similarity Index
* teste usando
    * Colunas de similaridade
    * Texto (TF-IDF) 


In [36]:
for a in range(100, 2000, 50):
    vectorizer = TfidfVectorizer( max_features= a)
    XX = vectorizer.fit_transform(DATASET["RELATOCLIENTE_CLEAN"])
    XXX = hstack((XX, X.astype(float)))
    y = np.array(DATASET.PROBLEMA)

    X_train, X_test, y_train, y_test = train_test_split( XXX,
                                                     y, 
                                                    train_size=0.80, random_state=28)
    
    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)

    lr = LogisticRegression(random_state=0).fit(X_train, y_train)
    Y_previsto_LR = lr.predict(X_test)

    # Análise da previsão
    p_ac = metrics.accuracy_score(y_test, np.floor( Y_previsto_LR) )
    print("max_features: "+ str(a) + " " + "#Acurácia: {:.4f}".format(p_ac) ) 

max_features: 100 #Acurácia: 0.7737
max_features: 150 #Acurácia: 0.7858
max_features: 200 #Acurácia: 0.8026
max_features: 250 #Acurácia: 0.8156
max_features: 300 #Acurácia: 0.8232
max_features: 350 #Acurácia: 0.8290
max_features: 400 #Acurácia: 0.8323
max_features: 450 #Acurácia: 0.8366
max_features: 500 #Acurácia: 0.8383
max_features: 550 #Acurácia: 0.8423
max_features: 600 #Acurácia: 0.8430
max_features: 650 #Acurácia: 0.8442
max_features: 700 #Acurácia: 0.8446
max_features: 750 #Acurácia: 0.8458
max_features: 800 #Acurácia: 0.8460
max_features: 850 #Acurácia: 0.8453
max_features: 900 #Acurácia: 0.8464
max_features: 950 #Acurácia: 0.8474
max_features: 1000 #Acurácia: 0.8459
max_features: 1050 #Acurácia: 0.8471
max_features: 1100 #Acurácia: 0.8476
max_features: 1150 #Acurácia: 0.8487
max_features: 1200 #Acurácia: 0.8492
max_features: 1250 #Acurácia: 0.8480
max_features: 1300 #Acurácia: 0.8482
max_features: 1350 #Acurácia: 0.8486
max_features: 1400 #Acurácia: 0.8491
max_features: 1450 

### Test 22 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* gensim.similarities	
    * Word Embedding Similarity Index
* teste usando
    * 6 colunas de similaridade
    * 6 tópicos ( topic modelling LDA )


In [48]:
num_topics = 6
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics= 6,
                                       workers=2)

rev_train = DATASET
lda_train = lda_model

for a in range( num_topics ):
    column = "TP" + str(a + 1)
    rev_train[column] = " "

for i in range(len(rev_train)):
    top_topics = (
        lda_train.get_document_topics(corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[j][1] for j in range(num_topics)]
    
    for a in range(num_topics):
        column = "TP" + str(a + 1)
        rev_train[column].iloc[i] = topic_vec[a]

column = []
for a in range(num_topics):
    column.append("TP" + str(a + 1))

X_t = np.array(rev_train[column])
y = np.array(rev_train.PROBLEMA)

XXX = np.append(X_t.astype(float), X, axis=1).astype(float)
y = np.array(DATASET.PROBLEMA)

X_train, X_test, y_train, y_test = train_test_split( XXX,
                                                     y, 
                                                    train_size=0.80, random_state=28)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result_ovr(X_train, X_test, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,51.072125,0.710433
1,AdaBoostClassifier:,12.541309,0.672796
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,2.416579,0.69658
5,GaussianNB:,0.062499,0.513547
6,BernoulliNB:,0.046858,0.206939
7,MultinomialNB:,0.03124,0.476768
8,RandomForestClassifier:,19.020178,0.785828
9,ExtraTreesClassifier:,11.446131,0.78877


### Test 23 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Comparação
    * CountVectorizer(nível word, char e ngrams)
    * Tf-IDF (nível word, char e ngrams)


In [49]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
DATASET["RELATOCLIENTE_CLEAN_T"] = list(sent_to_words(DATASET.RELATOCLIENTE_CLEAN.values.tolist()))

X_train, X_test, y_train, y_test = train_test_split( DATASET["RELATOCLIENTE_CLEAN"] ,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)


encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

# CountVectorizer word level
# X_train_count_vect_w, X_test_count_vect_w, y_train, y_test
count_vect_w = CountVectorizer(analyzer='word')
count_vect_w.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_w =  count_vect_w.transform(X_train)
X_test_count_vect_w =  count_vect_w.transform(X_test)

# CountVectorizer word ngram level
# X_train_count_vect_w_ngram, X_test_count_vect_w_ngram, y_train, y_test
count_vect_w_ngram = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
count_vect_w_ngram.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_w_ngram =  count_vect_w_ngram.transform(X_train)
X_test_count_vect_w_ngram =  count_vect_w_ngram.transform(X_test)

# CountVectorizer char level
# X_train_count_vect_char, X_test_count_vect_char, y_train, y_test
count_vect_char = CountVectorizer(analyzer='char', max_features=5000)
count_vect_char.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_char =  count_vect_char.transform(X_train)
X_test_count_vect_char =  count_vect_char.transform(X_test)

# CountVectorizer char ngram level
# X_train_count_vect_char_ngram, X_test_count_vect_char_ngram, y_train, y_test
count_vect_char_ngram = CountVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
count_vect_char_ngram.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_char_ngram =  count_vect_char_ngram.transform(X_train)
X_test_count_vect_char_ngram =  count_vect_char_ngram.transform(X_test)

#########################################

# tf-idf word level
# X_train_tfidf_w, X_test_tfidf_w, y_train, y_test
tfidf_w = TfidfVectorizer(analyzer='word', max_features=500)
tfidf_w.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_w =  tfidf_w.transform(X_train)
X_test_tfidf_w =  tfidf_w.transform(X_test)

# tf-idf word ngram level  
# X_train_tfidf_w_ngram, X_test_tfidf_w_ngram, y_train, y_test
tfidf_w_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_w_ngram.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_w_ngram =  tfidf_w_ngram.transform(X_train)
X_test_tfidf_w_ngram =  tfidf_w_ngram.transform(X_test)

# tf-idf char level 
# X_train_tfidf_char, X_test_tfidf_char, y_train, y_test
tfidf_char = TfidfVectorizer(analyzer='char', max_features=5000)
tfidf_char.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_char =  tfidf_char.transform(X_train) 
X_test_tfidf_char =  tfidf_char.transform(X_test)

# tf-idf char ngram level
# X_train_tfidf_char_ngram, X_test_tfidf_char_ngram, y_train, y_test
tfidf_char_ngram = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_char_ngram.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_char_ngram =  tfidf_char_ngram.transform(X_train) 
X_test_tfidf_char_ngram =  tfidf_char_ngram.transform(X_test)

### 23-1 CountVectorizer word level

In [51]:
results = get_tests_result_ovr(X_train_count_vect_w, X_test_count_vect_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,239.597515,0.86625
1,AdaBoostClassifier:,196.052419,0.802501
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,20.010902,0.822484
5,GaussianNB:,114.567339,0.397205
6,BernoulliNB:,169.913872,0.79159
7,MultinomialNB:,103.386148,0.786073
8,RandomForestClassifier:,201.124133,0.878509
9,ExtraTreesClassifier:,754.16403,0.878877


### 23-2 CountVectorizer word ngram level

In [52]:
results = get_tests_result_ovr(X_train_count_vect_w_ngram, X_test_count_vect_w_ngram, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,129.513765,0.852274
1,AdaBoostClassifier:,9.001002,0.746843
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,15.046026,0.807405
5,GaussianNB:,15.153743,0.471374
6,BernoulliNB:,26.385891,0.758122
7,MultinomialNB:,13.496485,0.779453
8,RandomForestClassifier:,111.225543,0.85019
9,ExtraTreesClassifier:,273.319567,0.84688


### 23-3 CountVectorizer char level 

In [53]:
results = get_tests_result_ovr(X_train_count_vect_char, X_test_count_vect_char, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,83.008878,0.673287
1,AdaBoostClassifier:,10.459291,0.592375
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,8.451981,0.645335
5,GaussianNB:,0.156213,0.265294
6,BernoulliNB:,0.182806,0.422214
7,MultinomialNB:,0.066439,0.553022
8,RandomForestClassifier:,68.381751,0.760942
9,ExtraTreesClassifier:,214.710917,0.768052


### 23-4 CountVectorizer char ngram level

In [54]:
results = get_tests_result_ovr(X_train_count_vect_char_ngram, X_test_count_vect_char_ngram, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,525.134481,0.862817
1,AdaBoostClassifier:,69.412013,0.81378
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,103.053493,0.784357
5,GaussianNB:,14.678028,0.343018
6,BernoulliNB:,27.131777,0.708103
7,MultinomialNB:,13.486414,0.706019
8,RandomForestClassifier:,206.212021,0.875812
9,ExtraTreesClassifier:,592.058095,0.877896


### 23-5 tf-idf word level

In [55]:
results = get_tests_result_ovr(X_train_tfidf_w, X_test_tfidf_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,79.921324,0.858281
1,AdaBoostClassifier:,14.990985,0.800294
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,20.666564,0.804217
5,GaussianNB:,1.521257,0.421846
6,BernoulliNB:,0.939404,0.77173
7,MultinomialNB:,0.234411,0.772956
8,RandomForestClassifier:,52.655717,0.868579
9,ExtraTreesClassifier:,139.869122,0.875935


### 23-6 tf-idf word ngram level

In [56]:
results = get_tests_result_ovr(X_train_tfidf_w_ngram, X_test_tfidf_w_ngram, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,137.921298,0.845409
1,AdaBoostClassifier:,19.710509,0.749785
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,60.369111,0.801888
5,GaussianNB:,14.992407,0.493319
6,BernoulliNB:,7.904554,0.758122
7,MultinomialNB:,2.288663,0.792448
8,RandomForestClassifier:,110.19999,0.849454
9,ExtraTreesClassifier:,298.020694,0.853132


### 23-7 tf-idf char level

In [57]:
results = get_tests_result_ovr(X_train_tfidf_char, X_test_tfidf_char, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,79.892709,0.67108
1,AdaBoostClassifier:,23.039277,0.630011
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,12.131545,0.627559
5,GaussianNB:,0.141744,0.319603
6,BernoulliNB:,0.140565,0.422214
7,MultinomialNB:,0.046896,0.540517
8,RandomForestClassifier:,77.553649,0.75898
9,ExtraTreesClassifier:,219.745392,0.769891


### 23-8 tf-idf char ngram level

In [58]:
results = get_tests_result_ovr(X_train_tfidf_char_ngram, X_test_tfidf_char_ngram, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,558.069123,0.858649
1,AdaBoostClassifier:,205.661643,0.812676
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,404.250487,0.777124
5,GaussianNB:,15.814322,0.357607
6,BernoulliNB:,9.746896,0.708103
7,MultinomialNB:,2.095361,0.767562
8,RandomForestClassifier:,235.754543,0.867721
9,ExtraTreesClassifier:,653.506603,0.875077


### Test 24 with the complete dataset 
* Other kind of problem (Motivo 3)
* Balanced

In [3]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')

X_train, X_test, y_train, y_test = train_test_split( DATASET["RELATOCLIENTE_CLEAN"] ,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

# CountVectorizer word level
# X_train_count_vect_w, X_test_count_vect_w, y_train, y_test
count_vect_w = CountVectorizer(analyzer='word')
count_vect_w.fit(DATASET["RELATOCLIENTE_CLEAN"].astype('U').values)
X_train_count_vect_w =  count_vect_w.transform(X_train.astype('U').values)
X_test_count_vect_w =  count_vect_w.transform(X_test.astype('U').values)

# tf-idf word level
# X_train_tfidf_w, X_test_tfidf_w, y_train, y_test
tfidf_w = TfidfVectorizer(analyzer='word')
tfidf_w.fit(DATASET["RELATOCLIENTE_CLEAN"].astype('U').values)
X_train_tfidf_w =  tfidf_w.transform(X_train.astype('U').values)
X_test_tfidf_w =  tfidf_w.transform(X_test.astype('U').values)


In [4]:
DATASET["MOTIVO3"].value_counts()[0:15]

Sem Sincronismo     36400
Exige Técnico       36400
Parâmetros Ruins    36400
Name: MOTIVO3, dtype: int64

### 24 Count vectorizer word level
* MemoryError: Unable to allocate 34.7 GiB for an array with shape (124080, 37545) and data type int64
* GaussianNB removed

In [8]:
def get_tests_result_ovr_24(X_train_v, X_test_v, y_train, y_test):
    all_res = []
    
    ### RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50, random_state=1, n_jobs=5)
    train_test_time, accuracy, Y_previsto_RF = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "RandomForestClassifier: " + str( accuracy) ) 
    all_res.append(["RandomForestClassifier: ", train_test_time, accuracy])
    
    ### ExtraTreesClassifier
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0, n_jobs=5 )
    train_test_time, accuracy, Y_previsto_ERT = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "ExtraTreesClassifier: " + str( accuracy) ) 
    all_res.append(["ExtraTreesClassifier: ", train_test_time, accuracy])
    
    ### LogisticRegression
    clf = LogisticRegression(random_state=0, n_jobs=5)
    train_test_time, accuracy, Y_previsto_LR = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "LogisticRegression: " + str( accuracy) ) 
    all_res.append(["LogisticRegression: ", train_test_time, accuracy])
    
    ### svm
    clf = svm.SVC()
    train_test_time, accuracy, Y_previsto_SVM = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm: " + str( accuracy) ) 
    all_res.append(["svm: ", train_test_time, accuracy])
        
    ### Stacking 1
    stack_result = []
    for a in range(len(Y_previsto_RF)):
        elements = []
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        elements.append( np.floor( Y_previsto_LR[a] ) )
        elements.append( np.floor( Y_previsto_SVM[a] ) )
        #elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_1: ", 0 , accuracy])
       
    ### Stacking_scikit
    clf1 = RandomForestClassifier(n_estimators=50, random_state=1, n_jobs=5)
    clf2 = LogisticRegression(random_state=0, n_jobs=5)
    clf3 = SVC()
    clf4 = ExtraTreesClassifier(n_estimators=100, random_state=0, n_jobs=5)
    estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3), ('etc', clf4)]
    ini = time.time()
    clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(), n_jobs=5)
    
    ovr = OneVsRestClassifier(clf)
    ovr.fit(X_train_v, y_train)
    Y_Previsto_stacking = ovr.predict(X_test_v)
    
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(y_test, Y_Previsto_stacking)
    all_res.append(["Stacking_scikit_ovr: ", train_test_time , accuracy])
    
    ### XGBClassifier
    Y_previsto_xgbc = 1
    try:
        clf = XGBClassifier(eval_metric='mlogloss')
        train_test_time, accuracy, Y_previsto_xgbc = train_model_ovr(clf, X_train_v.tocsc(), X_test_v.tocsc(), y_train, y_test)
        #print( "XGBClassifier: " + str( accuracy) ) 
        all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    except:
        clf = XGBClassifier(eval_metric='mlogloss')
        train_test_time, accuracy, Y_previsto_xgbc = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "XGBClassifier: " + str( accuracy) ) 
        all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    
    return pd.DataFrame( all_res, columns=["ALGORITHM","TRAIN_TEST_TIME", "ACCURACY"]  )



### 24-1 Count vectorizer word level

In [8]:
results = get_tests_result_ovr_24(X_train_count_vect_w, X_test_count_vect_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,RandomForestClassifier:,540.429988,0.892582
1,ExtraTreesClassifier:,1741.336369,0.891896
2,LogisticRegression:,3.920946,0.85261
3,svm:,3132.753809,0.869963
4,stacking_1:,0.0,0.893727
5,Stacking_scikit_ovr:,23994.915546,0.896566
6,XGBClassifier:,9.313779,0.864423


### 24-2 TF-IDF word level

In [9]:
results = get_tests_result_ovr_24(X_train_tfidf_w, X_test_tfidf_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,RandomForestClassifier:,474.769183,0.885256
1,ExtraTreesClassifier:,1729.176712,0.886401
2,LogisticRegression:,3.903657,0.855266
3,svm:,3997.345918,0.887134
4,stacking_1:,0.0,0.888187
5,Stacking_scikit_ovr:,27784.868036,0.892811
6,XGBClassifier:,29.563,0.86685


### Test 25 with the complete dataset
* Other kind of problem (Motivo 3)
* Unbalanced

In [11]:
path = "C:/PMON2021-NLP"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')

X_train, X_test, y_train, y_test = train_test_split( DATASET["RELATOCLIENTE_CLEAN"] ,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)


encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

# CountVectorizer word level
# X_train_count_vect_w, X_test_count_vect_w, y_train, y_test
count_vect_w = CountVectorizer(analyzer='word')
count_vect_w.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_w =  count_vect_w.transform(X_train)
X_test_count_vect_w =  count_vect_w.transform(X_test)

# tf-idf word level
# X_train_tfidf_w, X_test_tfidf_w, y_train, y_test
tfidf_w = TfidfVectorizer(analyzer='word')
tfidf_w.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_w =  tfidf_w.transform(X_train)
X_test_tfidf_w =  tfidf_w.transform(X_test)

In [12]:
DATASET["MOTIVO3"].value_counts()[0:15]

Sem Sincronismo     192825
Parâmetros Ruins     48775
Exige Técnico        36456
Name: MOTIVO3, dtype: int64

### 25-1 Count vectorizer word level

In [13]:
results = get_tests_result_ovr_24(X_train_count_vect_w, X_test_count_vect_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,RandomForestClassifier:,693.860521,0.920629
1,ExtraTreesClassifier:,2123.462413,0.921042
2,LogisticRegression:,9.091674,0.894681
3,svm:,14774.70838,0.911728
4,stacking_1:,0.0,0.922571
5,Stacking_scikit_ovr:,36084.228401,0.924639
6,XGBClassifier:,22.439721,0.906027


### 25-2 TF-IDF word level

In [14]:
results = get_tests_result_ovr_24(X_train_tfidf_w, X_test_tfidf_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,RandomForestClassifier:,551.986248,0.916565
1,ExtraTreesClassifier:,1999.468531,0.917338
2,LogisticRegression:,9.668607,0.891067
3,svm:,21025.366353,0.917734
4,stacking_1:,0.0,0.919837
5,Stacking_scikit_ovr:,46404.273352,0.923236
6,XGBClassifier:,64.996882,0.90556
