In [2]:
from platform import python_version
print(python_version())

3.8.5


In [3]:
!jupyter --version

jupyter core     : 4.6.3
jupyter-notebook : 6.1.4
qtconsole        : 4.7.7
ipython          : 7.19.0
ipykernel        : 5.3.4
jupyter client   : 6.1.7
jupyter lab      : 2.2.6
nbconvert        : 6.0.7
ipywidgets       : 7.5.1
nbformat         : 5.0.8
traitlets        : 5.0.5


### Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split 
import numpy as np
import pandas as pd
import time
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from scipy import stats
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from bert_serving.client import BertClient
from sklearn.multiclass import OneVsOneClassifier
import gensim
from gensim.utils import simple_preprocess
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim.corpora as corpora
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.sparse import hstack
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.models import TfidfModel
from gensim.models import Word2Vec

### Methods

In [2]:
import warnings
warnings.filterwarnings('ignore')

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def vectorization_TF_IDF(DATASET, text_field_name, label_name):
    
    X_train, X_test, y_train, y_test = train_test_split( DATASET[text_field_name] ,
                                                    DATASET[label_name], 
                                                    train_size=0.80, random_state=28)

    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)

    vectorizer = TfidfVectorizer()
    X_train_v = vectorizer.fit_transform(X_train.apply(str))
    X_test_v = vectorizer.transform(X_test.apply(str))
    
    return X_train_v, X_test_v, y_train, y_test

def saving_results(results, path, file_name):

    results.to_csv(path + file_name,  index = False, sep=";")
    
def train_model(classifier, X_train_v, X_test_v, y_train, y_test):
    ini = time.time()
    classifier.fit(X_train_v, y_train)
    predictions = classifier.predict(X_test_v)
    fim = time.time()
    
    return fim-ini, metrics.accuracy_score(predictions, y_test), predictions

def get_tests_result(X_train_v, X_test_v, y_train, y_test):
    
    all_res = []
    ### MLPClassifier
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, ), random_state=1, verbose=True)
    train_test_time, accuracy, Y_Previsto_MLP = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "MLPClassifier: " + str( accuracy) ) 
    all_res.append(["MLPClassifier: ", train_test_time, accuracy])
    
    ### AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators= 50, learning_rate=1)
    train_test_time, accuracy, Y_Previsto_ada = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "AdaBoostClassifier: " + str( accuracy) ) 
    all_res.append(["AdaBoostClassifier: ", train_test_time, accuracy])
    
    ### Voting_LR3_SVC1_ETC2
    ini = time.time()
    clf2 = LogisticRegression(random_state=0)
    clf5 = SVC(kernel='rbf', probability=True)
    clf6 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    eclf = VotingClassifier(estimators=[ ('lr', clf2), ('svc', clf5), ('etc', clf6)], voting='soft', weights=[3, 1, 2])
    clf2 = clf2.fit(X_train_v, y_train)
    clf5 = clf5.fit(X_train_v, y_train)
    clf6 = clf6.fit(X_train_v, y_train)
    eclf = eclf.fit(X_train_v, y_train)
    Y_previsto_vc1 = eclf.predict(X_test_v.toarray())
    fim = time.time()
    train_test_time = fim - ini
    accuracy = metrics.accuracy_score(y_test, Y_previsto_vc1)
    #print( "Voting_LR3_SVC1_ETC2: " + str( accuracy) ) 
    all_res.append(["Voting_LR3_SVC1_ETC2: ", train_test_time, accuracy])
    
    ### Voting_LR1_SVC1_ETC1
    ini = time.time()
    clf2 = LogisticRegression(random_state=0)
    clf5 = SVC(kernel='rbf', probability=True)
    clf6 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    eclf = VotingClassifier(estimators=[ ('lr', clf2), ('svc', clf5), ('etc', clf6)], voting='soft', weights=[1, 1, 1])
    clf2 = clf2.fit(X_train_v, y_train)
    clf5 = clf5.fit(X_train_v, y_train)
    clf6 = clf6.fit(X_train_v, y_train)
    eclf = eclf.fit(X_train_v, y_train)
    Y_previsto_vc2 = eclf.predict(X_test_v.toarray())
    fim = time.time()
    train_test_time = fim - ini
    accuracy = metrics.accuracy_score(y_test, Y_previsto_vc2)
    #print( "Voting_LR1_SVC1_ETC1: " + str( accuracy) ) 
    all_res.append(["Voting_LR1_SVC1_ETC1: ", train_test_time, accuracy])
    
    ### DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=0)
    train_test_time, accuracy, Y_previsto_DTC = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "DecisionTreeClassifier: " + str( accuracy) ) 
    all_res.append(["DecisionTreeClassifier: ", train_test_time, accuracy])
    
    ### GaussianNB
    clf = GaussianNB()
    train_test_time, accuracy, Y_previsto_NB_G = train_model(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "GaussianNB: " + str( accuracy) ) 
    all_res.append(["GaussianNB: ", train_test_time, accuracy])
    
    ### BernoulliNB
    clf = BernoulliNB()
    train_test_time, accuracy, Y_previsto_NB_B = train_model(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "BernoulliNB: " + str( accuracy) ) 
    all_res.append(["BernoulliNB: ", train_test_time, accuracy])
    
    ### MultinomialNB
    clf = MultinomialNB()
    train_test_time, accuracy, Y_previsto_NB_M = train_model(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "MultinomialNB: " + str( accuracy) ) 
    all_res.append(["MultinomialNB: ", train_test_time, accuracy])
    
    ### RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    train_test_time, accuracy, Y_previsto_RF = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "RandomForestClassifier: " + str( accuracy) ) 
    all_res.append(["RandomForestClassifier: ", train_test_time, accuracy])
    
    ### ExtraTreesClassifier
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    train_test_time, accuracy, Y_previsto_ERT = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "ExtraTreesClassifier: " + str( accuracy) ) 
    all_res.append(["ExtraTreesClassifier: ", train_test_time, accuracy])
    
    ### LogisticRegression
    clf = LogisticRegression(random_state=0)
    train_test_time, accuracy, Y_previsto_LR = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "LogisticRegression: " + str( accuracy) ) 
    all_res.append(["LogisticRegression: ", train_test_time, accuracy])
    
    ### svm
    clf = svm.SVC()
    train_test_time, accuracy, Y_previsto_SVM = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm: " + str( accuracy) ) 
    all_res.append(["svm: ", train_test_time, accuracy])
    
    ### svm_rbf
    clf = svm.SVC(kernel='rbf', probability=True)
    train_test_time, accuracy, Y_previsto_svc = train_model(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm_rbf: " + str( accuracy) ) 
    all_res.append(["svm_rbf: ", train_test_time, accuracy])
        
    ### Stacking 1
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_vc1[a] ) )
        elements.append( np.floor( Y_previsto_vc2[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        elements.append( np.floor( Y_previsto_LR[a] ) )
        elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_1: ", 0 , accuracy])
    
    ### Stacking 2
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        #elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_vc1[a] ) )
        elements.append( np.floor( Y_previsto_vc2[a] ) )
        #elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        #elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_2: ", 0 , accuracy])
    
    ### Stacking 3
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        #elements.append( np.floor( Y_previsto_vc1[a] ) )
        elements.append( np.floor( Y_previsto_vc2[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        #elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_3: ", 0 , accuracy]) 
    
    ### Stacking_scikit
    clf1 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf2 = LogisticRegression(random_state=0)
    clf3 = SVC()
    clf4 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3), ('etc', clf4)]
    ini = time.time()
    clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(), n_jobs=None)
    clf.fit(X_train_v, y_train)
    Y_Previsto_stacking = clf.predict(X_test_v)
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(y_test, Y_Previsto_stacking)
    all_res.append(["Stacking_scikit: ", train_test_time , accuracy])
    
    ### XGBClassifier
    clf = XGBClassifier(eval_metric='mlogloss')
    train_test_time, accuracy, Y_previsto_svc = train_model(clf, X_train_v.tocsc(), X_test_v.tocsc(), y_train, y_test)
    #print( "XGBClassifier: " + str( accuracy) ) 
    all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    
    ### OneVsRestClassifier_RF
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    ovr = OneVsRestClassifier(clf)
    ini = time.time()
    ovr.fit(X_train_v, y_train)
    Y_Previsto_OVR_RF = ovr.predict(X_test_v)
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(Y_Previsto_OVR_RF, y_test)
    all_res.append(["OvR_RF: ", train_test_time , accuracy])
    
    return pd.DataFrame( all_res, columns=["ALGORITHM","TRAIN_TEST_TIME", "ACCURACY"]  )

def train_model_ovo(classifier, X_train_v, X_test_v, y_train, y_test):
    
    ini = time.time()
    ovo = OneVsOneClassifier(classifier)
    ovo.fit(X_train_v, y_train)
    predictions = ovo.predict(X_test_v)
    fim = time.time()
    
    return fim-ini, metrics.accuracy_score(predictions, y_test), predictions

def get_tests_result_ovo(X_train_v, X_test_v, y_train, y_test):
    
    all_res = []
    ### MLPClassifier
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, ), random_state=1, verbose=True)
    train_test_time, accuracy, Y_Previsto_MLP = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "MLPClassifier: " + str( accuracy) ) 
    all_res.append(["MLPClassifier: ", train_test_time, accuracy])
    
    ### AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators= 50, learning_rate=1)
    train_test_time, accuracy, Y_Previsto_ada = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "AdaBoostClassifier: " + str( accuracy) ) 
    all_res.append(["AdaBoostClassifier: ", train_test_time, accuracy])
    
    ### Voting_LR3_SVC1_ETC2
    all_res.append(["Voting_LR3_SVC1_ETC2: ", 0 , 0 ])
    
    ### Voting_LR1_SVC1_ETC1
    all_res.append(["Voting_LR1_SVC1_ETC1: ", 0, 0])
    
    ### DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=0)
    train_test_time, accuracy, Y_previsto_DTC = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "DecisionTreeClassifier: " + str( accuracy) ) 
    all_res.append(["DecisionTreeClassifier: ", train_test_time, accuracy])
    
    ### GaussianNB
    clf = GaussianNB()
    train_test_time, accuracy, Y_previsto_NB_G = train_model_ovo(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "GaussianNB: " + str( accuracy) ) 
    all_res.append(["GaussianNB: ", train_test_time, accuracy])
    
    ### BernoulliNB
    clf = BernoulliNB()
    train_test_time, accuracy, Y_previsto_NB_B = train_model_ovo(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "BernoulliNB: " + str( accuracy) ) 
    all_res.append(["BernoulliNB: ", train_test_time, accuracy])
    
    ### MultinomialNB
    clf = MultinomialNB()
    train_test_time, accuracy, Y_previsto_NB_M = train_model_ovo(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
    #print( "MultinomialNB: " + str( accuracy) ) 
    all_res.append(["MultinomialNB: ", train_test_time, accuracy])
    
    ### RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    train_test_time, accuracy, Y_previsto_RF = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "RandomForestClassifier: " + str( accuracy) ) 
    all_res.append(["RandomForestClassifier: ", train_test_time, accuracy])
    
    ### ExtraTreesClassifier
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    train_test_time, accuracy, Y_previsto_ERT = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "ExtraTreesClassifier: " + str( accuracy) ) 
    all_res.append(["ExtraTreesClassifier: ", train_test_time, accuracy])
    
    ### LogisticRegression
    clf = LogisticRegression(random_state=0)
    train_test_time, accuracy, Y_previsto_LR = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "LogisticRegression: " + str( accuracy) ) 
    all_res.append(["LogisticRegression: ", train_test_time, accuracy])
    
    ### svm
    clf = svm.SVC()
    train_test_time, accuracy, Y_previsto_SVM = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm: " + str( accuracy) ) 
    all_res.append(["svm: ", train_test_time, accuracy])
    
    ### svm_rbf
    clf = svm.SVC(kernel='rbf', probability=True)
    train_test_time, accuracy, Y_previsto_svc = train_model_ovo(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm_rbf: " + str( accuracy) ) 
    all_res.append(["svm_rbf: ", train_test_time, accuracy])
        
    ### Stacking 1
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        elements.append( np.floor( Y_previsto_LR[a] ) )
        elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_1: ", 0 , accuracy])
    
    ### Stacking 2
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        #elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        #elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        #elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_2: ", 0 , accuracy])
    
    ### Stacking 3
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        #elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_3: ", 0 , accuracy]) 
    
    ### Stacking_scikit
    clf1 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf2 = LogisticRegression(random_state=0)
    clf3 = SVC()
    clf4 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3), ('etc', clf4)]
    ini = time.time()
    clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(), n_jobs=None)
    
    ovo = OneVsOneClassifier(clf)
    ovo.fit(X_train_v, y_train)
    Y_Previsto_stacking = ovo.predict(X_test_v)
    
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(y_test, Y_Previsto_stacking)
    all_res.append(["Stacking_scikit_ovo: ", train_test_time , accuracy])
    
    ### XGBClassifier
    clf = XGBClassifier(eval_metric='mlogloss')
    train_test_time, accuracy, Y_previsto_svc = train_model_ovo(clf, X_train_v.tocsc(), X_test_v.tocsc(), y_train, y_test)
    #print( "XGBClassifier: " + str( accuracy) ) 
    all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    
    return pd.DataFrame( all_res, columns=["ALGORITHM","TRAIN_TEST_TIME", "ACCURACY"]  )
    
    
def train_model_ovr(classifier, X_train_v, X_test_v, y_train, y_test):
    
    ini = time.time()
    ovr = OneVsRestClassifier(classifier)
    try:
        ovr.fit(X_train_v, y_train)
    except:
        ovr.fit(X_train_v.toarray(), y_train)
    predictions = ovr.predict(X_test_v)
    fim = time.time()
    
    return fim-ini, metrics.accuracy_score(predictions, y_test), predictions

def get_tests_result_ovr(X_train_v, X_test_v, y_train, y_test):
    
    all_res = []    
    ### MLPClassifier
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, ), random_state=1, verbose=True)
    train_test_time, accuracy, Y_Previsto_MLP = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "MLPClassifier: " + str( accuracy) ) 
    all_res.append(["MLPClassifier: ", train_test_time, accuracy])
    
    ### AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators= 50, learning_rate=1)
    train_test_time, accuracy, Y_Previsto_ada = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "AdaBoostClassifier: " + str( accuracy) ) 
    all_res.append(["AdaBoostClassifier: ", train_test_time, accuracy])
    
    ### Voting_LR3_SVC1_ETC2
    all_res.append(["Voting_LR3_SVC1_ETC2: ", 0 , 0 ])
    
    ### Voting_LR1_SVC1_ETC1
    all_res.append(["Voting_LR1_SVC1_ETC1: ", 0, 0])
    
    ### DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=0)
    train_test_time, accuracy, Y_previsto_DTC = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "DecisionTreeClassifier: " + str( accuracy) ) 
    all_res.append(["DecisionTreeClassifier: ", train_test_time, accuracy])
    
    ### GaussianNB
    Y_previsto_NB_G = 1
    try:
        clf = GaussianNB()
        train_test_time, accuracy, Y_previsto_NB_G = train_model_ovr(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
        #print( "GaussianNB: " + str( accuracy) ) 
        all_res.append(["GaussianNB: ", train_test_time, accuracy])
    except:
        clf = GaussianNB()
        train_test_time, accuracy, Y_previsto_NB_G = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "GaussianNB: " + str( accuracy) ) 
        all_res.append(["GaussianNB: ", train_test_time, accuracy])
      
    
    ### BernoulliNB
    Y_previsto_NB_B = 1
    try:
        clf = BernoulliNB()
        train_test_time, accuracy, Y_previsto_NB_B = train_model_ovr(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
        #print( "BernoulliNB: " + str( accuracy) ) 
        all_res.append(["BernoulliNB: ", train_test_time, accuracy])
    except:
        clf = BernoulliNB()
        train_test_time, accuracy, Y_previsto_NB_B = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "BernoulliNB: " + str( accuracy) ) 
        all_res.append(["BernoulliNB: ", train_test_time, accuracy])
    
    ### MultinomialNB
    Y_previsto_NB_M = 1
    try:
        clf = MultinomialNB()
        train_test_time, accuracy, Y_previsto_NB_M = train_model_ovr(clf, X_train_v.toarray(), X_test_v.toarray(), y_train, y_test)
        #print( "MultinomialNB: " + str( accuracy) ) 
        all_res.append(["MultinomialNB: ", train_test_time, accuracy])
    except:
        clf = MultinomialNB()
        train_test_time, accuracy, Y_previsto_NB_M = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "MultinomialNB: " + str( accuracy) ) 
        all_res.append(["MultinomialNB: ", train_test_time, accuracy])
    
    ### RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    train_test_time, accuracy, Y_previsto_RF = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "RandomForestClassifier: " + str( accuracy) ) 
    all_res.append(["RandomForestClassifier: ", train_test_time, accuracy])
    
    ### ExtraTreesClassifier
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    train_test_time, accuracy, Y_previsto_ERT = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "ExtraTreesClassifier: " + str( accuracy) ) 
    all_res.append(["ExtraTreesClassifier: ", train_test_time, accuracy])
    
    ### LogisticRegression
    clf = LogisticRegression(random_state=0)
    train_test_time, accuracy, Y_previsto_LR = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "LogisticRegression: " + str( accuracy) ) 
    all_res.append(["LogisticRegression: ", train_test_time, accuracy])
    
    ### svm
    clf = svm.SVC()
    train_test_time, accuracy, Y_previsto_SVM = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm: " + str( accuracy) ) 
    all_res.append(["svm: ", train_test_time, accuracy])
    
    ### svm_rbf
    clf = svm.SVC(kernel='rbf', probability=True)
    train_test_time, accuracy, Y_previsto_svc = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm_rbf: " + str( accuracy) ) 
    all_res.append(["svm_rbf: ", train_test_time, accuracy])
        
    ### Stacking 1
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        elements.append( np.floor( Y_previsto_LR[a] ) )
        elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_1: ", 0 , accuracy])
    
    ### Stacking 2
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        #elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        #elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        #elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_2: ", 0 , accuracy])
    
    ### Stacking 3
    stack_result = []
    for a in range(len(Y_Previsto_MLP)):
        elements = []
        elements.append( np.floor( Y_Previsto_MLP[a] ) )
        #elements.append( np.floor( Y_Previsto_ada[a] ) )
        elements.append( np.floor( Y_previsto_DTC[a] ) )
        #elements.append( np.floor( Y_previsto_NB_G[a] ) )
        #elements.append( np.floor( Y_previsto_NB_B[a] ) )
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        #elements.append( np.floor( Y_previsto_LR[a] ) )
        #elements.append( np.floor( Y_previsto_SVM[a] ) )
        elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_3: ", 0 , accuracy]) 
    
    ### Stacking_scikit
    clf1 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf2 = LogisticRegression(random_state=0)
    clf3 = SVC()
    clf4 = ExtraTreesClassifier(n_estimators=100, random_state=0)
    estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3), ('etc', clf4)]
    ini = time.time()
    clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(), n_jobs=None)
    
    ovr = OneVsRestClassifier(clf)
    ovr.fit(X_train_v, y_train)
    Y_Previsto_stacking = ovr.predict(X_test_v)
    
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(y_test, Y_Previsto_stacking)
    all_res.append(["Stacking_scikit_ovr: ", train_test_time , accuracy])
    
    ### XGBClassifier
    Y_previsto_xgbc = 1
    try:
        clf = XGBClassifier(eval_metric='mlogloss')
        train_test_time, accuracy, Y_previsto_xgbc = train_model_ovr(clf, X_train_v.tocsc(), X_test_v.tocsc(), y_train, y_test)
        #print( "XGBClassifier: " + str( accuracy) ) 
        all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    except:
        clf = XGBClassifier(eval_metric='mlogloss')
        train_test_time, accuracy, Y_previsto_xgbc = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "XGBClassifier: " + str( accuracy) ) 
        all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    
    return pd.DataFrame( all_res, columns=["ALGORITHM","TRAIN_TEST_TIME", "ACCURACY"]  )

### Opening exemple Dataset

In [6]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
DATASET.head(4)

Unnamed: 0,RELATOCLIENTE,PROBLEMA,RELATOCLIENTE_CLEAN
0,cliente entrou em contato informando que está ...,Queda / Intermitência,cliente entrou contato informando esta sem sin...
1,CLIENTE COM QUEDAS REALIZEI OS TESTE E ENCAMIN...,Queda / Intermitência,cliente com quedas realizei teste encaminhei s...
2,"Cliente reclama de quedas e intermitência , pr...",Queda / Intermitência,cliente reclama quedas intermitencia procedime...
3,CLIENTE INFORMA QUE INTERNET ESTA COM QUEDAS H...,Queda / Intermitência,cliente informa internet esta com quedas mais ...


### Test 0 with the exemple dataset

In [7]:
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.236364,0.616667
1,AdaBoostClassifier:,0.07981,0.533333
2,Voting_LR3_SVC1_ETC2:,0.391004,0.666667
3,Voting_LR1_SVC1_ETC1:,0.388965,0.633333
4,DecisionTreeClassifier:,0.004959,0.583333
5,GaussianNB:,0.003989,0.5
6,BernoulliNB:,0.003027,0.616667
7,MultinomialNB:,0.00097,0.633333
8,RandomForestClassifier:,0.068843,0.666667
9,ExtraTreesClassifier:,0.123668,0.633333


### Test 1 with the complete dataset
* Todas as palavras da base;
* Removendo apenas stopwords;
* 3000 registros para cada classe de problema;

In [7]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.235467,0.616667
1,AdaBoostClassifier:,0.078812,0.533333
2,Voting_LR3_SVC1_ETC2:,0.385972,0.65
3,Voting_LR1_SVC1_ETC1:,0.385939,0.633333
4,DecisionTreeClassifier:,0.003989,0.583333
5,GaussianNB:,0.002992,0.5
6,BernoulliNB:,0.003015,0.616667
7,MultinomialNB:,0.001,0.633333
8,RandomForestClassifier:,0.06685,0.666667
9,ExtraTreesClassifier:,0.125663,0.633333


### Test 2 with the complete dataset
* Todas as palavras da base;
* Removendo apenas stopwords;
* 7000 registros para cada classe de problema;

In [8]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.227183,0.616667
1,AdaBoostClassifier:,0.078812,0.533333
2,Voting_LR3_SVC1_ETC2:,0.383978,0.65
3,Voting_LR1_SVC1_ETC1:,0.385962,0.633333
4,DecisionTreeClassifier:,0.003991,0.583333
5,GaussianNB:,0.002993,0.5
6,BernoulliNB:,0.002995,0.616667
7,MultinomialNB:,0.001967,0.633333
8,RandomForestClassifier:,0.06685,0.666667
9,ExtraTreesClassifier:,0.120676,0.633333


### Test 3 with the complete dataset
* Removendo as 6 palavras mais frequentes da base;
* Removendo stopwords;
* 3000 registros para cada classe de problema;


In [9]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.228389,0.616667
1,AdaBoostClassifier:,0.077815,0.533333
2,Voting_LR3_SVC1_ETC2:,0.384975,0.65
3,Voting_LR1_SVC1_ETC1:,0.385967,0.633333
4,DecisionTreeClassifier:,0.00399,0.583333
5,GaussianNB:,0.002993,0.5
6,BernoulliNB:,0.00199,0.616667
7,MultinomialNB:,0.002006,0.633333
8,RandomForestClassifier:,0.068808,0.666667
9,ExtraTreesClassifier:,0.123672,0.633333


### Test 4 with the complete dataset
* Base composta pelas 700 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 3000 registros para cada classe de problema;


In [10]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.233511,0.616667
1,AdaBoostClassifier:,0.078811,0.533333
2,Voting_LR3_SVC1_ETC2:,0.385972,0.666667
3,Voting_LR1_SVC1_ETC1:,0.389954,0.633333
4,DecisionTreeClassifier:,0.003992,0.583333
5,GaussianNB:,0.002992,0.5
6,BernoulliNB:,0.002992,0.616667
7,MultinomialNB:,0.001967,0.633333
8,RandomForestClassifier:,0.066853,0.666667
9,ExtraTreesClassifier:,0.121671,0.633333


### Test 5 with the complete dataset
* Base composta pelas 700 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 7000 registros para cada classe de problema;


In [11]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.238506,0.616667
1,AdaBoostClassifier:,0.07782,0.533333
2,Voting_LR3_SVC1_ETC2:,0.385972,0.633333
3,Voting_LR1_SVC1_ETC1:,0.393946,0.65
4,DecisionTreeClassifier:,0.003989,0.583333
5,GaussianNB:,0.003962,0.5
6,BernoulliNB:,0.001994,0.616667
7,MultinomialNB:,0.001025,0.633333
8,RandomForestClassifier:,0.068787,0.666667
9,ExtraTreesClassifier:,0.128687,0.633333


### Test 6 with the complete dataset
* Base composta pelas 4000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 3000 registros para cada classe de problema;


In [12]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.23365,0.616667
1,AdaBoostClassifier:,0.077815,0.533333
2,Voting_LR3_SVC1_ETC2:,0.386969,0.65
3,Voting_LR1_SVC1_ETC1:,0.39195,0.633333
4,DecisionTreeClassifier:,0.003962,0.583333
5,GaussianNB:,0.003018,0.5
6,BernoulliNB:,0.001996,0.616667
7,MultinomialNB:,0.001999,0.633333
8,RandomForestClassifier:,0.068842,0.666667
9,ExtraTreesClassifier:,0.122676,0.633333


### Test 7 with the complete dataset
* Base composta pelas 4000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 7000 registros para cada classe de problema;

In [13]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.226495,0.616667
1,AdaBoostClassifier:,0.077815,0.533333
2,Voting_LR3_SVC1_ETC2:,0.386079,0.65
3,Voting_LR1_SVC1_ETC1:,0.387963,0.633333
4,DecisionTreeClassifier:,0.00399,0.583333
5,GaussianNB:,0.002987,0.5
6,BernoulliNB:,0.002999,0.616667
7,MultinomialNB:,0.00198,0.633333
8,RandomForestClassifier:,0.065808,0.666667
9,ExtraTreesClassifier:,0.124685,0.633333


### Test 8 with the complete dataset
* Todas as palavras da base;
* Removendo stopwords;
* BERT as service para português;
* 3000 registros para cada classe de problema;

In [None]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train, X_test, y_train, y_test = train_test_split( DATASET["RELATOCLIENTE_CLEAN"] ,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.8, random_state=42)

bc = BertClient()
X_train_bert = bc.encode(X_train.tolist())
X_test_bert = bc.encode(X_test.tolist())
X_train_v = X_train_bert.copy()
X_test_v = X_test_bert.copy()

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

### Test 9 with the complete dataset
* Base composta pelas 5000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 7000 registros para cada classe de problema;
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;


In [14]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.231558,0.616667
1,AdaBoostClassifier:,0.079845,0.533333
2,Voting_LR3_SVC1_ETC2:,0.387992,0.65
3,Voting_LR1_SVC1_ETC1:,0.386963,0.65
4,DecisionTreeClassifier:,0.003989,0.583333
5,GaussianNB:,0.002992,0.5
6,BernoulliNB:,0.001989,0.616667
7,MultinomialNB:,0.000998,0.633333
8,RandomForestClassifier:,0.066849,0.666667
9,ExtraTreesClassifier:,0.121677,0.633333


### Test 10 with the complete dataset
* Base composta pelas 10.000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords;
* 7000 registros para cada classe de problema;
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [15]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.233443,0.616667
1,AdaBoostClassifier:,0.078812,0.533333
2,Voting_LR3_SVC1_ETC2:,0.385944,0.65
3,Voting_LR1_SVC1_ETC1:,0.387962,0.633333
4,DecisionTreeClassifier:,0.003989,0.583333
5,GaussianNB:,0.002992,0.5
6,BernoulliNB:,0.001995,0.616667
7,MultinomialNB:,0.000997,0.633333
8,RandomForestClassifier:,0.066844,0.666667
9,ExtraTreesClassifier:,0.12265,0.633333


### Test 11 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: TF-IDF;
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;


In [16]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.229497,0.616667
1,AdaBoostClassifier:,0.077815,0.533333
2,Voting_LR3_SVC1_ETC2:,0.385973,0.65
3,Voting_LR1_SVC1_ETC1:,0.388931,0.633333
4,DecisionTreeClassifier:,0.003989,0.583333
5,GaussianNB:,0.002992,0.5
6,BernoulliNB:,0.002996,0.616667
7,MultinomialNB:,0.000993,0.633333
8,RandomForestClassifier:,0.065855,0.666667
9,ExtraTreesClassifier:,0.12165,0.633333


### Test 12 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: CountVectorizer;
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [17]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')

def cv(data):
    count_vectorizer = CountVectorizer()

    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

list_corpus = DATASET["RELATOCLIENTE_CLEAN"].tolist()
list_labels = DATASET["PROBLEMA"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, train_size=0.80, 
                                                                                random_state=28)
X_train_v, count_vectorizer = cv(X_train)
X_test_v = count_vectorizer.transform(X_test)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.165574,0.65
1,AdaBoostClassifier:,0.071836,0.516667
2,Voting_LR3_SVC1_ETC2:,0.413893,0.666667
3,Voting_LR1_SVC1_ETC1:,0.410902,0.666667
4,DecisionTreeClassifier:,0.002992,0.6
5,GaussianNB:,0.002989,0.516667
6,BernoulliNB:,0.002995,0.616667
7,MultinomialNB:,0.002993,0.633333
8,RandomForestClassifier:,0.06582,0.6
9,ExtraTreesClassifier:,0.124638,0.65


### Test 13 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: CountVectorizer;
* Stemmed_RSLP ( radicais das palavras)
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [20]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')

def cv(data):
    count_vectorizer = CountVectorizer()

    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

list_corpus = DATASET["RELATOCLIENTE_CLEAN"].tolist()
list_labels = DATASET["PROBLEMA"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, train_size=0.80, 
                                                                                random_state=28)
X_train_v, count_vectorizer = cv(X_train)
X_test_v = count_vectorizer.transform(X_test)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.159654,0.65
1,AdaBoostClassifier:,0.072805,0.516667
2,Voting_LR3_SVC1_ETC2:,0.415915,0.666667
3,Voting_LR1_SVC1_ETC1:,0.427855,0.666667
4,DecisionTreeClassifier:,0.002991,0.6
5,GaussianNB:,0.002992,0.516667
6,BernoulliNB:,0.003989,0.616667
7,MultinomialNB:,0.002993,0.633333
8,RandomForestClassifier:,0.068815,0.6
9,ExtraTreesClassifier:,0.124666,0.65


### Test 14 with the complete dataset
* Base composta pelas 10.000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: TF-IDF
* Abordagem One vs One
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [22]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result_ovo(X_train_v, X_test_v, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,1.300293,0.633333
1,AdaBoostClassifier:,1.154915,0.65
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.025934,0.55
5,GaussianNB:,0.025927,0.5
6,BernoulliNB:,0.025926,0.616667
7,MultinomialNB:,0.015959,0.633333
8,RandomForestClassifier:,0.878645,0.633333
9,ExtraTreesClassifier:,1.483089,0.666667


### Test 15 with the complete dataset
* Base composta pelas 10.000 palavras mais frequentes, removendo todas as palavras restantes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: TF-IDF
* Abordagem One vs Rest
* Staking personalizado: a moda das classificações de todos os algoritmos foi tomada como resultado da classificação;
* Staking 1: todos os algoritmos;
* Staking 2: 4 algoritmos com maior acurácia;

In [24]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
X_train_v, X_test_v, y_train, y_test = vectorization_TF_IDF(DATASET,
                                                            "RELATOCLIENTE_CLEAN",
                                                           "PROBLEMA")

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result_ovr(X_train_v, X_test_v, y_train, y_test)
results
#### ... pasta 7

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.788175,0.683333
1,AdaBoostClassifier:,0.44982,0.65
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.019951,0.55
5,GaussianNB:,0.016956,0.383333
6,BernoulliNB:,0.013958,0.666667
7,MultinomialNB:,0.006984,0.65
8,RandomForestClassifier:,0.346076,0.633333
9,ExtraTreesClassifier:,0.592418,0.633333


### Test 16 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Vetorização: CountVectorizer
* Abordagem One vs Rest
* Staking Scikit-Learn

In [25]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')


def cv(data):
    count_vectorizer = CountVectorizer()

    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

list_corpus = DATASET["RELATOCLIENTE_CLEAN"].tolist()
list_labels = DATASET["PROBLEMA"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, train_size=0.80, 
                                                                                random_state=28)
X_train_v, count_vectorizer = cv(X_train)
X_test_v = count_vectorizer.transform(X_test)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result_ovr(X_train_v, X_test_v, y_train, y_test)
results
#### ... pasta 7

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.655514,0.666667
1,AdaBoostClassifier:,0.42788,0.633333
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.016952,0.5
5,GaussianNB:,0.017944,0.416667
6,BernoulliNB:,0.014961,0.666667
7,MultinomialNB:,0.008975,0.65
8,RandomForestClassifier:,0.355026,0.683333
9,ExtraTreesClassifier:,0.606378,0.65


### Test 17 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
* gensim.corpora.Dictionary e bag of words;
* técnica para topic modelling: Latent Dirichlet Allocation (LDA)
* 30 tópicos 
* Usando somente as colunas dos 30 tópicos criados


In [27]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
DATASET["RELATOCLIENTE_CLEAN_T"] = list(sent_to_words(DATASET.RELATOCLIENTE_CLEAN.values.tolist()))
# Create Dictionary
id2word = corpora.Dictionary(DATASET.RELATOCLIENTE_CLEAN_T.values.tolist())
# Create Corpus
texts = DATASET.RELATOCLIENTE_CLEAN_T.values.tolist()
corpus = [id2word.doc2bow(text) for text in texts]
# number of topics
num_topics = 30
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       workers=2)

# DATASET with topics
for a in range( num_topics ):
    column = "TP" + str(a + 1)
    DATASET[column] = " "

for i in range(len(DATASET)):
    top_topics = (
        lda_model.get_document_topics(corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[j][1] for j in range(num_topics)]
    
    for a in range(num_topics):
        column = "TP" + str(a + 1)
        DATASET[column].iloc[i] = topic_vec[a]

column = []
for a in range(num_topics):
    column.append("TP" + str(a + 1))

X = np.array(DATASET[column])
y = np.array(DATASET.PROBLEMA)        
        
kf = KFold(50, shuffle=True, random_state=42)

for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logisitic Regression
    lr = LogisticRegression(
        class_weight= 'balanced',
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train)

    y_pred = lr.predict(X_val_scale)
    
    p_ac = metrics.accuracy_score(y_val,  y_pred)
    print("Acurácia: " + str(p_ac))


Acurácia: 0.16666666666666666
Acurácia: 0.6666666666666666
Acurácia: 0.16666666666666666
Acurácia: 0.16666666666666666
Acurácia: 0.3333333333333333
Acurácia: 0.16666666666666666
Acurácia: 0.5
Acurácia: 0.3333333333333333
Acurácia: 0.5
Acurácia: 0.3333333333333333
Acurácia: 0.5
Acurácia: 0.16666666666666666
Acurácia: 0.16666666666666666
Acurácia: 0.3333333333333333
Acurácia: 0.3333333333333333
Acurácia: 0.16666666666666666
Acurácia: 0.16666666666666666
Acurácia: 0.3333333333333333
Acurácia: 0.3333333333333333
Acurácia: 0.5
Acurácia: 0.3333333333333333
Acurácia: 0.6666666666666666
Acurácia: 0.6666666666666666
Acurácia: 0.5
Acurácia: 0.5
Acurácia: 0.6666666666666666
Acurácia: 0.5
Acurácia: 0.3333333333333333
Acurácia: 0.6666666666666666
Acurácia: 0.0
Acurácia: 0.3333333333333333
Acurácia: 0.16666666666666666
Acurácia: 0.5
Acurácia: 0.5
Acurácia: 0.6666666666666666
Acurácia: 0.3333333333333333
Acurácia: 0.3333333333333333
Acurácia: 0.16666666666666666
Acurácia: 0.6666666666666666
Acurácia:

### Test 18 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* técnica para topic modelling: Latent Dirichlet Allocation (LDA)
* Usando 
    * 30 tópicos
    * texto(TF-IDF) variando max_features


In [28]:
for a in range(10, 1000 , 10):
    vectorizer = TfidfVectorizer( max_features= a)
    XX = vectorizer.fit_transform(DATASET["RELATOCLIENTE_CLEAN"])
    yy = DATASET["PROBLEMA"]
    XXX = hstack((XX, X.astype(float)))

    X_train, X_test, y_train, y_test = train_test_split( XXX,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)

    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)
    
    lr = LogisticRegression(random_state=0).fit(X_train, y_train)
    Y_previsto_LR = lr.predict(X_test)

    # Análise da previsão
    p_ac = metrics.accuracy_score(y_test, np.floor( Y_previsto_LR) )
    print("max_features: "+ str(a) + " " + "#Acurácia: {:.4f}".format(p_ac) )

max_features: 10 #Acurácia: 0.4500
max_features: 20 #Acurácia: 0.4667
max_features: 30 #Acurácia: 0.6167
max_features: 40 #Acurácia: 0.6833
max_features: 50 #Acurácia: 0.6333
max_features: 60 #Acurácia: 0.6333
max_features: 70 #Acurácia: 0.6333
max_features: 80 #Acurácia: 0.6333
max_features: 90 #Acurácia: 0.6167
max_features: 100 #Acurácia: 0.6167
max_features: 110 #Acurácia: 0.6333
max_features: 120 #Acurácia: 0.6667
max_features: 130 #Acurácia: 0.6667
max_features: 140 #Acurácia: 0.6667
max_features: 150 #Acurácia: 0.6500
max_features: 160 #Acurácia: 0.6667
max_features: 170 #Acurácia: 0.6833
max_features: 180 #Acurácia: 0.6833
max_features: 190 #Acurácia: 0.6833
max_features: 200 #Acurácia: 0.6833
max_features: 210 #Acurácia: 0.6833
max_features: 220 #Acurácia: 0.6833
max_features: 230 #Acurácia: 0.6667
max_features: 240 #Acurácia: 0.6833
max_features: 250 #Acurácia: 0.6833
max_features: 260 #Acurácia: 0.6833
max_features: 270 #Acurácia: 0.6667
max_features: 280 #Acurácia: 0.6833
m

### Test 19 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* técnica para topic modelling: Latent Dirichlet Allocation (LDA)
* Usando 
    * max_features  = 870
    * Variando número de tópicos


In [29]:
for num_topics in range(2, 60 , 4):
# number of topics
#num_topics = 30

# Build LDA model
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       workers=2)
    rev_train = DATASET
    lda_train = lda_model


    for a in range( num_topics ):
        column = "TP" + str(a + 1)
        rev_train[column] = " "

    for i in range(len(rev_train)):
        top_topics = (
            lda_train.get_document_topics(corpus[i],
                                      minimum_probability=0.0)
        )
        topic_vec = [top_topics[j][1] for j in range(num_topics)]
    
        for a in range(num_topics):
            column = "TP" + str(a + 1)
            rev_train[column].iloc[i] = topic_vec[a]
         
    column = []
    for a in range(num_topics):
        column.append("TP" + str(a + 1))

    X = np.array(rev_train[column])
    y = np.array(rev_train.PROBLEMA)
    
    vectorizer = TfidfVectorizer( max_features= 930)
    XX = vectorizer.fit_transform(DATASET["RELATOCLIENTE_CLEAN"])
    yy = DATASET["PROBLEMA"]
    XXX = hstack((XX, X.astype(float)))

    X_train, X_test, y_train, y_test = train_test_split( XXX,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)
    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)
    
    lr = LogisticRegression(random_state=0).fit(X_train, y_train)
    Y_previsto_LR = lr.predict(X_test)

    # Análise da previsão
    p_ac = metrics.accuracy_score(y_test, np.floor( Y_previsto_LR) )
    print("num_topics: "+ str(num_topics) + " " + "#Acurácia: {:.4f}".format(p_ac) ) 

num_topics: 2 #Acurácia: 0.7000
num_topics: 6 #Acurácia: 0.6333
num_topics: 10 #Acurácia: 0.6167
num_topics: 14 #Acurácia: 0.6000
num_topics: 18 #Acurácia: 0.6500
num_topics: 22 #Acurácia: 0.6333
num_topics: 26 #Acurácia: 0.6667
num_topics: 30 #Acurácia: 0.6333
num_topics: 34 #Acurácia: 0.6500
num_topics: 38 #Acurácia: 0.6167
num_topics: 42 #Acurácia: 0.5833
num_topics: 46 #Acurácia: 0.5833
num_topics: 50 #Acurácia: 0.5833
num_topics: 54 #Acurácia: 0.6167
num_topics: 58 #Acurácia: 0.5667


### Test 20 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* gensim.similarities	
    * Word Embedding Similarity Index
* Similaridade entre as string escolhidas para representar as classes de problemas


In [30]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
DATASET["RELATOCLIENTE_CLEAN_T"] = list(sent_to_words(DATASET.RELATOCLIENTE_CLEAN.values.tolist()))
# Create Dictionary
id2word = corpora.Dictionary(DATASET.RELATOCLIENTE_CLEAN_T.values.tolist())
# Create Corpus
texts = DATASET.RELATOCLIENTE_CLEAN_T.values.tolist()
corpus = [id2word.doc2bow(text) for text in texts]
dictionary = Dictionary(DATASET.RELATOCLIENTE_CLEAN_T.values.tolist() )
tfidf = TfidfModel(dictionary=dictionary)
w2v_model = Word2Vec( DATASET.RELATOCLIENTE_CLEAN_T.values.tolist(), workers=2, min_count=5, seed=12345)
similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf, nonzero_limit=100)

s_modem_sem_sincronismo = "conexao internet massiva sincronismo modem status sucesso testes procedimento telefone".lower().split()
s_massiva = "massiva rede interrupcao ntt aberto primaria prazo gpon rftth".lower().split()
s_modem_sincronizado_e_autenticado = "conexao internet modem procedimento massiva sucesso sincronizado status acesso testes".lower().split()
s_parametros_ruins = "status attenuation margin parametros noise ont indicator conexao velocidade ruins".lower().split()
s_baixa_velocidade = "ping upload download velocidade teste lentidao testes baixa cabo reclama".lower().split()
s_queda_intermitencia = "quedas status conexao reinit internet attenuation ont ngasp power procedimentos".lower().split()

s_modem_sem_sincronismo = id2word.doc2bow(s_modem_sem_sincronismo)
s_massiva = id2word.doc2bow(s_massiva)
s_modem_sincronizado_e_autenticado = id2word.doc2bow(s_modem_sincronizado_e_autenticado)
s_parametros_ruins = id2word.doc2bow(s_parametros_ruins)
s_baixa_velocidade = id2word.doc2bow(s_baixa_velocidade)
s_queda_intermitencia = id2word.doc2bow(s_queda_intermitencia)

s0 = s_modem_sem_sincronismo
s1 = s_massiva
s2 = s_modem_sincronizado_e_autenticado
s3 = s_parametros_ruins
s4 = s_baixa_velocidade
s5 = s_queda_intermitencia

ss = [s0, s1, s2, s3, s4, s5]

DATASET["S0"] = 0.0
DATASET["S1"] = 0.0
DATASET["S2"] = 0.0
DATASET["S3"] = 0.0
DATASET["S4"] = 0.0
DATASET["S5"] = 0.0

for a in range(len(DATASET)):
    doc_vec = DATASET.iloc(0)[a][3]
    doc_bow = id2word.doc2bow( doc_vec)
        
    DATASET["S0"].iloc[a] = similarity_matrix.inner_product( doc_bow , s0, normalized=(True, True))
    DATASET["S1"].iloc[a] = similarity_matrix.inner_product( doc_bow , s1, normalized=(True, True))
    DATASET["S2"].iloc[a] = similarity_matrix.inner_product( doc_bow , s2, normalized=(True, True))
    DATASET["S3"].iloc[a] = similarity_matrix.inner_product( doc_bow , s3, normalized=(True, True))
    DATASET["S4"].iloc[a] = similarity_matrix.inner_product( doc_bow , s4, normalized=(True, True))
    DATASET["S5"].iloc[a] = similarity_matrix.inner_product( doc_bow , s5, normalized=(True, True))

X = np.array(DATASET[["S0", "S1", "S2", "S3", "S4", "S5" ]])
y = np.array(DATASET.PROBLEMA)

X_train, X_test, y_train, y_test = train_test_split( X,
                                                     y, 
                                                     train_size=0.80, random_state=28)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result(X_train_v, X_test_v, y_train, y_test)
results

100%|██████████████████████████████████████████████████████████████████████████████| 796/796 [00:00<00:00, 4723.22it/s]


Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.153721,0.65
1,AdaBoostClassifier:,0.071831,0.516667
2,Voting_LR3_SVC1_ETC2:,0.40991,0.666667
3,Voting_LR1_SVC1_ETC1:,0.410899,0.666667
4,DecisionTreeClassifier:,0.002991,0.6
5,GaussianNB:,0.002965,0.516667
6,BernoulliNB:,0.002998,0.616667
7,MultinomialNB:,0.002989,0.633333
8,RandomForestClassifier:,0.067814,0.6
9,ExtraTreesClassifier:,0.121671,0.65


### Test 21 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* gensim.similarities	
    * Word Embedding Similarity Index
* teste usando
    * Colunas de similaridade
    * Texto (TF-IDF) 


In [31]:
for a in range(100, 2000, 50):
    vectorizer = TfidfVectorizer( max_features= a)
    XX = vectorizer.fit_transform(DATASET["RELATOCLIENTE_CLEAN"])
    XXX = hstack((XX, X.astype(float)))
    y = np.array(DATASET.PROBLEMA)

    X_train, X_test, y_train, y_test = train_test_split( XXX,
                                                     y, 
                                                    train_size=0.80, random_state=28)
    
    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)

    lr = LogisticRegression(random_state=0).fit(X_train, y_train)
    Y_previsto_LR = lr.predict(X_test)

    # Análise da previsão
    p_ac = metrics.accuracy_score(y_test, np.floor( Y_previsto_LR) )
    print("max_features: "+ str(a) + " " + "#Acurácia: {:.4f}".format(p_ac) ) 

max_features: 100 #Acurácia: 0.7000
max_features: 150 #Acurácia: 0.7000
max_features: 200 #Acurácia: 0.6667
max_features: 250 #Acurácia: 0.7333
max_features: 300 #Acurácia: 0.7167
max_features: 350 #Acurácia: 0.7167
max_features: 400 #Acurácia: 0.7333
max_features: 450 #Acurácia: 0.7500
max_features: 500 #Acurácia: 0.7333
max_features: 550 #Acurácia: 0.7333
max_features: 600 #Acurácia: 0.7333
max_features: 650 #Acurácia: 0.7333
max_features: 700 #Acurácia: 0.7333
max_features: 750 #Acurácia: 0.7333
max_features: 800 #Acurácia: 0.7333
max_features: 850 #Acurácia: 0.7333
max_features: 900 #Acurácia: 0.7333
max_features: 950 #Acurácia: 0.7333
max_features: 1000 #Acurácia: 0.7333
max_features: 1050 #Acurácia: 0.7333
max_features: 1100 #Acurácia: 0.7333
max_features: 1150 #Acurácia: 0.7333
max_features: 1200 #Acurácia: 0.7333
max_features: 1250 #Acurácia: 0.7333
max_features: 1300 #Acurácia: 0.7333
max_features: 1350 #Acurácia: 0.7333
max_features: 1400 #Acurácia: 0.7333
max_features: 1450 

### Test 22 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Passagem para forma numérica:
    * gensim.corpora.Dictionary e bag of words;
* gensim.similarities	
    * Word Embedding Similarity Index
* teste usando
    * 6 colunas de similaridade
    * 6 tópicos ( topic modelling LDA )


In [32]:
num_topics = 6
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics= 6,
                                       workers=2)

rev_train = DATASET
lda_train = lda_model

for a in range( num_topics ):
    column = "TP" + str(a + 1)
    rev_train[column] = " "

for i in range(len(rev_train)):
    top_topics = (
        lda_train.get_document_topics(corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[j][1] for j in range(num_topics)]
    
    for a in range(num_topics):
        column = "TP" + str(a + 1)
        rev_train[column].iloc[i] = topic_vec[a]

column = []
for a in range(num_topics):
    column.append("TP" + str(a + 1))

X_t = np.array(rev_train[column])
y = np.array(rev_train.PROBLEMA)

XXX = np.append(X_t.astype(float), X, axis=1).astype(float)
y = np.array(DATASET.PROBLEMA)

X_train, X_test, y_train, y_test = train_test_split( XXX,
                                                     y, 
                                                    train_size=0.80, random_state=28)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

results = get_tests_result_ovr(X_train, X_test, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.997371,0.533333
1,AdaBoostClassifier:,0.360032,0.45
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.010973,0.35
5,GaussianNB:,0.004984,0.35
6,BernoulliNB:,0.004987,0.083333
7,MultinomialNB:,0.005961,0.233333
8,RandomForestClassifier:,0.342084,0.416667
9,ExtraTreesClassifier:,0.505674,0.416667


### Test 23 with the complete dataset
* Sem remover palavras mais frequentes;
* Removendo stopwords de forma personalizada;
* 7000 registros para cada classe de problema;
* Comparação
    * CountVectorizer(nível word, char e ngrams)
    * Tf-IDF (nível word, char e ngrams)


In [33]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')
DATASET["RELATOCLIENTE_CLEAN_T"] = list(sent_to_words(DATASET.RELATOCLIENTE_CLEAN.values.tolist()))

X_train, X_test, y_train, y_test = train_test_split( DATASET["RELATOCLIENTE_CLEAN"] ,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)


encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

# CountVectorizer word level
# X_train_count_vect_w, X_test_count_vect_w, y_train, y_test
count_vect_w = CountVectorizer(analyzer='word')
count_vect_w.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_w =  count_vect_w.transform(X_train)
X_test_count_vect_w =  count_vect_w.transform(X_test)

# CountVectorizer word ngram level
# X_train_count_vect_w_ngram, X_test_count_vect_w_ngram, y_train, y_test
count_vect_w_ngram = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
count_vect_w_ngram.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_w_ngram =  count_vect_w_ngram.transform(X_train)
X_test_count_vect_w_ngram =  count_vect_w_ngram.transform(X_test)

# CountVectorizer char level
# X_train_count_vect_char, X_test_count_vect_char, y_train, y_test
count_vect_char = CountVectorizer(analyzer='char', max_features=5000)
count_vect_char.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_char =  count_vect_char.transform(X_train)
X_test_count_vect_char =  count_vect_char.transform(X_test)

# CountVectorizer char ngram level
# X_train_count_vect_char_ngram, X_test_count_vect_char_ngram, y_train, y_test
count_vect_char_ngram = CountVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
count_vect_char_ngram.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_char_ngram =  count_vect_char_ngram.transform(X_train)
X_test_count_vect_char_ngram =  count_vect_char_ngram.transform(X_test)

#########################################

# tf-idf word level
# X_train_tfidf_w, X_test_tfidf_w, y_train, y_test
tfidf_w = TfidfVectorizer(analyzer='word', max_features=500)
tfidf_w.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_w =  tfidf_w.transform(X_train)
X_test_tfidf_w =  tfidf_w.transform(X_test)

# tf-idf word ngram level  
# X_train_tfidf_w_ngram, X_test_tfidf_w_ngram, y_train, y_test
tfidf_w_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_w_ngram.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_w_ngram =  tfidf_w_ngram.transform(X_train)
X_test_tfidf_w_ngram =  tfidf_w_ngram.transform(X_test)

# tf-idf char level 
# X_train_tfidf_char, X_test_tfidf_char, y_train, y_test
tfidf_char = TfidfVectorizer(analyzer='char', max_features=5000)
tfidf_char.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_char =  tfidf_char.transform(X_train) 
X_test_tfidf_char =  tfidf_char.transform(X_test)

# tf-idf char ngram level
# X_train_tfidf_char_ngram, X_test_tfidf_char_ngram, y_train, y_test
tfidf_char_ngram = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_char_ngram.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_char_ngram =  tfidf_char_ngram.transform(X_train) 
X_test_tfidf_char_ngram =  tfidf_char_ngram.transform(X_test)

### 23-1 CountVectorizer word level

In [34]:
results = get_tests_result_ovr(X_train_count_vect_w, X_test_count_vect_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.73465,0.666667
1,AdaBoostClassifier:,0.462766,0.616667
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.016951,0.466667
5,GaussianNB:,0.018931,0.416667
6,BernoulliNB:,0.016954,0.65
7,MultinomialNB:,0.009973,0.65
8,RandomForestClassifier:,0.38998,0.65
9,ExtraTreesClassifier:,0.763045,0.65


### 23-2 CountVectorizer word ngram level

In [35]:
results = get_tests_result_ovr(X_train_count_vect_w_ngram, X_test_count_vect_w_ngram, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,2.768041,0.566667
1,AdaBoostClassifier:,0.938482,0.583333
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.071809,0.616667
5,GaussianNB:,0.120684,0.483333
6,BernoulliNB:,0.078788,0.366667
7,MultinomialNB:,0.040861,0.6
8,RandomForestClassifier:,0.55454,0.483333
9,ExtraTreesClassifier:,1.331443,0.483333


### 23-3 CountVectorizer char level 

In [36]:
results = get_tests_result_ovr(X_train_count_vect_char, X_test_count_vect_char, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,1.08281,0.55
1,AdaBoostClassifier:,0.424859,0.533333
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.013963,0.4
5,GaussianNB:,0.004986,0.283333
6,BernoulliNB:,0.006981,0.316667
7,MultinomialNB:,0.004987,0.6
8,RandomForestClassifier:,0.369018,0.566667
9,ExtraTreesClassifier:,0.640395,0.516667


### 23-4 CountVectorizer char ngram level

In [37]:
results = get_tests_result_ovr(X_train_count_vect_char_ngram, X_test_count_vect_char_ngram, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,2.444531,0.683333
1,AdaBoostClassifier:,0.76994,0.666667
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.059846,0.55
5,GaussianNB:,0.050868,0.283333
6,BernoulliNB:,0.044874,0.666667
7,MultinomialNB:,0.015957,0.7
8,RandomForestClassifier:,0.404911,0.666667
9,ExtraTreesClassifier:,0.765019,0.683333


### 23-5 tf-idf word level

In [38]:
results = get_tests_result_ovr(X_train_tfidf_w, X_test_tfidf_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,0.497153,0.616667
1,AdaBoostClassifier:,0.473733,0.583333
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.018979,0.516667
5,GaussianNB:,0.009973,0.383333
6,BernoulliNB:,0.009972,0.666667
7,MultinomialNB:,0.005984,0.616667
8,RandomForestClassifier:,0.377984,0.666667
9,ExtraTreesClassifier:,0.660234,0.6


### 23-6 tf-idf word ngram level

In [39]:
results = get_tests_result_ovr(X_train_tfidf_w_ngram, X_test_tfidf_w_ngram, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,3.90175,0.466667
1,AdaBoostClassifier:,0.963418,0.566667
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.076771,0.533333
5,GaussianNB:,0.115714,0.483333
6,BernoulliNB:,0.06782,0.366667
7,MultinomialNB:,0.022921,0.583333
8,RandomForestClassifier:,0.544514,0.433333
9,ExtraTreesClassifier:,1.4073,0.5


### 23-7 tf-idf char level

In [40]:
results = get_tests_result_ovr(X_train_tfidf_char, X_test_tfidf_char, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,1.126647,0.516667
1,AdaBoostClassifier:,0.488715,0.533333
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.018961,0.3
5,GaussianNB:,0.005981,0.333333
6,BernoulliNB:,0.006951,0.316667
7,MultinomialNB:,0.005014,0.3
8,RandomForestClassifier:,0.39394,0.483333
9,ExtraTreesClassifier:,0.646276,0.533333


### 23-8 tf-idf char ngram level

In [41]:
results = get_tests_result_ovr(X_train_tfidf_char_ngram, X_test_tfidf_char_ngram, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,MLPClassifier:,2.193224,0.7
1,AdaBoostClassifier:,1.234671,0.633333
2,Voting_LR3_SVC1_ETC2:,0.0,0.0
3,Voting_LR1_SVC1_ETC1:,0.0,0.0
4,DecisionTreeClassifier:,0.103723,0.616667
5,GaussianNB:,0.038895,0.283333
6,BernoulliNB:,0.037901,0.666667
7,MultinomialNB:,0.010967,0.65
8,RandomForestClassifier:,0.435835,0.683333
9,ExtraTreesClassifier:,0.776924,0.683333


### Test 24 with the complete dataset 
* Other kind of problem (Motivo 3)
* Balanced

In [43]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"

import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')

X_train, X_test, y_train, y_test = train_test_split( DATASET["RELATOCLIENTE_CLEAN"] ,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

# CountVectorizer word level
# X_train_count_vect_w, X_test_count_vect_w, y_train, y_test
count_vect_w = CountVectorizer(analyzer='word')
count_vect_w.fit(DATASET["RELATOCLIENTE_CLEAN"].astype('U').values)
X_train_count_vect_w =  count_vect_w.transform(X_train.astype('U').values)
X_test_count_vect_w =  count_vect_w.transform(X_test.astype('U').values)

# tf-idf word level
# X_train_tfidf_w, X_test_tfidf_w, y_train, y_test
tfidf_w = TfidfVectorizer(analyzer='word')
tfidf_w.fit(DATASET["RELATOCLIENTE_CLEAN"].astype('U').values)
X_train_tfidf_w =  tfidf_w.transform(X_train.astype('U').values)
X_test_tfidf_w =  tfidf_w.transform(X_test.astype('U').values)


In [45]:
DATASET["PROBLEMA"].value_counts()[0:15]

Queda / Intermitência               50
Modem sem sincronismo               50
Massiva                             50
Modem sincronizado e autenticado    50
Parâmetros Ruins                    50
Baixa Velocidade                    50
Name: PROBLEMA, dtype: int64

### 24 Count vectorizer word level
* MemoryError: Unable to allocate 34.7 GiB for an array with shape (124080, 37545) and data type int64
* GaussianNB removed

In [46]:
def get_tests_result_ovr_26(X_train_v, X_test_v, y_train, y_test):
    all_res = []
    
    ### RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=50, random_state=1, n_jobs=5)
    train_test_time, accuracy, Y_previsto_RF = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "RandomForestClassifier: " + str( accuracy) ) 
    all_res.append(["RandomForestClassifier: ", train_test_time, accuracy])
    
    ### ExtraTreesClassifier
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0, n_jobs=5 )
    train_test_time, accuracy, Y_previsto_ERT = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "ExtraTreesClassifier: " + str( accuracy) ) 
    all_res.append(["ExtraTreesClassifier: ", train_test_time, accuracy])
    
    ### LogisticRegression
    clf = LogisticRegression(random_state=0, n_jobs=5)
    train_test_time, accuracy, Y_previsto_LR = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "LogisticRegression: " + str( accuracy) ) 
    all_res.append(["LogisticRegression: ", train_test_time, accuracy])
    
    ### svm
    clf = svm.SVC()
    train_test_time, accuracy, Y_previsto_SVM = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
    #print( "svm: " + str( accuracy) ) 
    all_res.append(["svm: ", train_test_time, accuracy])
        
    ### Stacking 1
    stack_result = []
    for a in range(len(Y_previsto_RF)):
        elements = []
        elements.append( np.floor( Y_previsto_RF[a] ) )
        elements.append( np.floor( Y_previsto_ERT[a] ) )
        elements.append( np.floor( Y_previsto_LR[a] ) )
        elements.append( np.floor( Y_previsto_SVM[a] ) )
        #elements.append( np.floor( Y_previsto_svc[a] ) )
        stack_result.append( stats.mode(elements).mode[0])
        
    accuracy = metrics.accuracy_score(y_test,  np.array(stack_result, dtype="int64" )  )
    all_res.append(["stacking_1: ", 0 , accuracy])
       
    ### Stacking_scikit
    clf1 = RandomForestClassifier(n_estimators=50, random_state=1, n_jobs=5)
    clf2 = LogisticRegression(random_state=0, n_jobs=5)
    clf3 = SVC()
    clf4 = ExtraTreesClassifier(n_estimators=100, random_state=0, n_jobs=5)
    estimators = [('rf', clf1), ('lr', clf2), ('svc', clf3), ('etc', clf4)]
    ini = time.time()
    clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(), n_jobs=5)
    
    ovr = OneVsRestClassifier(clf)
    ovr.fit(X_train_v, y_train)
    Y_Previsto_stacking = ovr.predict(X_test_v)
    
    fim = time.time()
    train_test_time = fim-ini
    accuracy = metrics.accuracy_score(y_test, Y_Previsto_stacking)
    all_res.append(["Stacking_scikit_ovr: ", train_test_time , accuracy])
    
    ### XGBClassifier
    Y_previsto_xgbc = 1
    try:
        clf = XGBClassifier(eval_metric='mlogloss')
        train_test_time, accuracy, Y_previsto_xgbc = train_model_ovr(clf, X_train_v.tocsc(), X_test_v.tocsc(), y_train, y_test)
        #print( "XGBClassifier: " + str( accuracy) ) 
        all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    except:
        clf = XGBClassifier(eval_metric='mlogloss')
        train_test_time, accuracy, Y_previsto_xgbc = train_model_ovr(clf, X_train_v, X_test_v, y_train, y_test)
        #print( "XGBClassifier: " + str( accuracy) ) 
        all_res.append(["XGBClassifier: ", train_test_time, accuracy])
    
    return pd.DataFrame( all_res, columns=["ALGORITHM","TRAIN_TEST_TIME", "ACCURACY"]  )



### 24-1 Count vectorizer word level

In [47]:
results = get_tests_result_ovr_26(X_train_count_vect_w, X_test_count_vect_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,RandomForestClassifier:,1.475053,0.65
1,ExtraTreesClassifier:,0.548533,0.65
2,LogisticRegression:,0.065853,0.683333
3,svm:,0.045879,0.616667
4,stacking_1:,0.0,0.666667
5,Stacking_scikit_ovr:,3.465724,0.683333
6,XGBClassifier:,0.315156,0.6


### 24-2 TF-IDF word level

In [48]:
results = get_tests_result_ovr_26(X_train_tfidf_w, X_test_tfidf_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,RandomForestClassifier:,0.296207,0.65
1,ExtraTreesClassifier:,0.550527,0.65
2,LogisticRegression:,0.048868,0.7
3,svm:,0.052889,0.683333
4,stacking_1:,0.0,0.683333
5,Stacking_scikit_ovr:,3.452828,0.65
6,XGBClassifier:,0.302168,0.683333


### Test 25 with the complete dataset
* Other kind of problem (Motivo 3)
* Unbalanced

In [50]:
path = "C:/PMON2021-NLP/"
file = "DATASET_CLEAN.csv"
import warnings
warnings.filterwarnings('ignore')
DATASET = pd.read_csv(path + file, error_bad_lines=False, delimiter=';')

X_train, X_test, y_train, y_test = train_test_split( DATASET["RELATOCLIENTE_CLEAN"] ,
                                                    DATASET["PROBLEMA"], 
                                                    train_size=0.80, random_state=28)


encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

# CountVectorizer word level
# X_train_count_vect_w, X_test_count_vect_w, y_train, y_test
count_vect_w = CountVectorizer(analyzer='word')
count_vect_w.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_count_vect_w =  count_vect_w.transform(X_train)
X_test_count_vect_w =  count_vect_w.transform(X_test)

# tf-idf word level
# X_train_tfidf_w, X_test_tfidf_w, y_train, y_test
tfidf_w = TfidfVectorizer(analyzer='word')
tfidf_w.fit(DATASET["RELATOCLIENTE_CLEAN"])
X_train_tfidf_w =  tfidf_w.transform(X_train)
X_test_tfidf_w =  tfidf_w.transform(X_test)

In [52]:
DATASET["PROBLEMA"].value_counts()[0:15]

Queda / Intermitência               50
Modem sem sincronismo               50
Massiva                             50
Modem sincronizado e autenticado    50
Parâmetros Ruins                    50
Baixa Velocidade                    50
Name: PROBLEMA, dtype: int64

### 25-1 Count vectorizer word level

In [53]:
results = get_tests_result_ovr_26(X_train_count_vect_w, X_test_count_vect_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,RandomForestClassifier:,0.285218,0.65
1,ExtraTreesClassifier:,0.541559,0.65
2,LogisticRegression:,0.061835,0.683333
3,svm:,0.0459,0.616667
4,stacking_1:,0.0,0.666667
5,Stacking_scikit_ovr:,3.42697,0.683333
6,XGBClassifier:,0.311167,0.6


### 25-2 TF-IDF word level

In [54]:
results = get_tests_result_ovr_26(X_train_tfidf_w, X_test_tfidf_w, y_train, y_test)
results

Unnamed: 0,ALGORITHM,TRAIN_TEST_TIME,ACCURACY
0,RandomForestClassifier:,0.296208,0.65
1,ExtraTreesClassifier:,0.541557,0.65
2,LogisticRegression:,0.044908,0.7
3,svm:,0.051861,0.683333
4,stacking_1:,0.0,0.683333
5,Stacking_scikit_ovr:,3.454113,0.65
6,XGBClassifier:,0.304186,0.683333
