In [1]:
#from google.colab import drive
#drive.mount('/content/drive')
import pandas as pd
import numpy as np
from IPython.display import clear_output
from tqdm import tqdm
from time import time
import json
from sklearn.utils.fixes import *
random_state = 20

In [2]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [3]:
# Utility function to report best scores
def report(results, pipeline, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        if i == 1:
          results['pipeline'] = pipeline
          df_results = pd.DataFrame(results)
          df = df_results[df_results['rank_test_score'] == 1].filter(regex=("split*"))
          df['pipeline'] = df_results['pipeline']
          df['params'] = df_results['params']
          with open(f'Arquivos TCC/perfomance_cv.csv', 'a') as f:
            df.to_csv(f, mode='a', header=f.tell()==0, index=False)
        for candidate in candidates:
            print(color.BOLD + "Model with rank: {0}".format(i)+ color.END)
            print(color.BOLD + 
                "Mean validation score: {0:.4f} (std: {1:.4f})".format(
                    results["mean_test_score"][candidate],
                    results["std_test_score"][candidate],
                )+ color.END
            )
            print(color.BOLD + "Parameters: {0}".format(results["params"][candidate])+ color.END)
            print("")

# Pipeline 1: 20NG + all-MiniLM-L6-v2


In [4]:
#LOADING DATA
import pandas as pd
import numpy as np
import pickle
from IPython.display import clear_output
from sklearn import preprocessing
clear_output()
#datasets = ['TC02_20news', 'TC14_wos']
#models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
dataset = 'TC02_20news'
model_name = "all-MiniLM-L6-v2"
embeddings_path = f'Arquivos TCC/embeddings/sbert/{model_name}_{dataset}.pkl'
with open(embeddings_path, "rb") as fIn:
  stored_data = pickle.load(fIn)
embeddings = stored_data['embeddings']
labels = stored_data['labels']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
unique, counts = np.unique(stored_data['labels'], return_counts=True)
#Split hold-out test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)
unique, counts = np.unique(y_test, return_counts=True)
#dict(zip(unique, counts))
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

**KNN**

In [10]:
#KNN
clf_KNN = KNeighborsClassifier(algorithm= "ball_tree")
param_dist_KNN = {
    "n_neighbors": [3,5,11,21,51,101,151,201],
    "p": [1,2],
    "weights" :["uniform","distance"]
}
n_it = 10
random_search_KNN = RandomizedSearchCV(
    clf_KNN, param_distributions=param_dist_KNN, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_ , f'{dataset}+{model_name}+KNN')



RandomizedSearchCV took 426.27 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8242 (std: 0.0069)[0m
[1mParameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 3}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8123 (std: 0.0059)[0m
[1mParameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 5}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.8062 (std: 0.0071)[0m
[1mParameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 11}[0m



In [11]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8279[0m


**Logistic Regression**

In [5]:
#Logistic Regression

clf_LR = LogisticRegression(random_state=random_state,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR = {
    "C": [1e-1,1,10],
    "penalty": ["l2", "none"]
}
n_it = 6
random_search_LR = RandomizedSearchCV(
    clf_LR, param_distributions=param_dist_LR, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')


RandomizedSearchCV took 556.33 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7744 (std: 0.0063)[0m
[1mParameters: {'penalty': 'l2', 'C': 10}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7725 (std: 0.0055)[0m
[1mParameters: {'penalty': 'l2', 'C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7454 (std: 0.0058)[0m
[1mParameters: {'penalty': 'l2', 'C': 0.1}[0m



In [6]:
balanced_acc= balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7774[0m


**SVC**

In [7]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC = {
    'C': [1e-1,1,10],
    "penalty": ["l2", "none"]
    }
n_it = 6
random_search_SVC = RandomizedSearchCV(
    clf_SVC, param_distributions=param_dist_SVC, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 84.51 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7757 (std: 0.0091)[0m
[1mParameters: {'penalty': 'l2', 'C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7721 (std: 0.0080)[0m
[1mParameters: {'penalty': 'l2', 'C': 10}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7697 (std: 0.0070)[0m
[1mParameters: {'penalty': 'l2', 'C': 0.1}[0m



In [8]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7879[0m


In [16]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF = {
    'n_estimators': [10,50,100,200],
    }
n_it = 10
random_search_RF = RandomizedSearchCV(
    clf_RF, param_distributions=param_dist_RF, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 153.72 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7680 (std: 0.0081)[0m
[1mParameters: {'n_estimators': 200}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7498 (std: 0.0055)[0m
[1mParameters: {'n_estimators': 100}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7202 (std: 0.0089)[0m
[1mParameters: {'n_estimators': 50}[0m

[1mA acurácia balanceada nos dados de teste: 0.7699[0m


In [17]:
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')

[1mModel with rank: 1[0m
[1mMean validation score: 0.7680 (std: 0.0081)[0m
[1mParameters: {'n_estimators': 200}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7498 (std: 0.0055)[0m
[1mParameters: {'n_estimators': 100}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7202 (std: 0.0089)[0m
[1mParameters: {'n_estimators': 50}[0m



# Pipeline 2: 20NG + all-MiniLM-L12-v2

In [18]:
#LOADING DATA
import pandas as pd
import numpy as np
import pickle
from IPython.display import clear_output
from sklearn import preprocessing
clear_output()
#datasets = ['TC02_20news', 'TC14_wos']
#models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
dataset = 'TC02_20news'
model_name = "all-MiniLM-L12-v2"
embeddings_path = f'Arquivos TCC/embeddings/sbert/{model_name}_{dataset}.pkl'
with open(embeddings_path, "rb") as fIn:
  stored_data = pickle.load(fIn)
embeddings = stored_data['embeddings']
labels = stored_data['labels']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)
unique, counts = np.unique(y_test, return_counts=True)
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
print(X_train[10][:10],'\n', y_test[50])

[ 0.04915353  0.08488183 -0.01936629  0.00663146 -0.01925441  0.00633633
  0.00899213  0.04904805 -0.08862329  0.03560806] 
 12


**KNN**

In [19]:
#KNN
clf_KNN = KNeighborsClassifier(algorithm= "ball_tree")
param_dist_KNN = {
    "n_neighbors": [3,5,11,21,51,101,151,201],
    "p": [1,2],
    "weights" :["uniform","distance"]
}
n_it = 10
random_search_KNN = RandomizedSearchCV(
    clf_KNN, param_distributions=param_dist_KNN, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 432.54 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8173 (std: 0.0049)[0m
[1mParameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 3}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7987 (std: 0.0037)[0m
[1mParameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 5}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7938 (std: 0.0050)[0m
[1mParameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 11}[0m



In [20]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8179[0m


**Logistic Regression**

In [21]:
#Logistic Regression

clf_LR = LogisticRegression(random_state=random_state,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR = {
    "C": [1e-1,1,10],
    "penalty": ["l2", "none"]
}
n_it = 6
random_search_LR = RandomizedSearchCV(
    clf_LR, param_distributions=param_dist_LR, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 173.86 seconds for 8 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7732 (std: 0.0093)[0m
[1mParameters: {'C': 10}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7683 (std: 0.0093)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7422 (std: 0.0095)[0m
[1mParameters: {'C': 0.1}[0m



In [22]:
balanced_acc= balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7779[0m


**SVC**

In [23]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC = {
    'C': [1e-1,1,10],
    "penalty": ["l2", "none"]
    }
n_it = 6
random_search_SVC = RandomizedSearchCV(
    clf_SVC, param_distributions=param_dist_SVC, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 82.35 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7756 (std: 0.0085)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7675 (std: 0.0090)[0m
[1mParameters: {'C': 0.1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7669 (std: 0.0077)[0m
[1mParameters: {'C': 10}[0m



In [24]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7779[0m


In [25]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF = {
    'n_estimators': [10,50,100,200],
    }
n_it = 10
random_search_RF = RandomizedSearchCV(
    clf_RF, param_distributions=param_dist_RF, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 149.72 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7716 (std: 0.0080)[0m
[1mParameters: {'n_estimators': 200}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7498 (std: 0.0087)[0m
[1mParameters: {'n_estimators': 100}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7153 (std: 0.0136)[0m
[1mParameters: {'n_estimators': 50}[0m

[1mA acurácia balanceada nos dados de teste: 0.7829[0m


# Pipeline 3: WOS + all-MiniLM-L6-v2

In [26]:
#LOADING DATA
import pandas as pd
import numpy as np
import pickle
from IPython.display import clear_output
from sklearn import preprocessing
clear_output()
#datasets = ['TC02_20news', 'TC14_wos']
#models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
dataset = 'TC14_wos'
model_name = "all-MiniLM-L6-v2"
embeddings_path = f'Arquivos TCC/embeddings/sbert/{model_name}_{dataset}.pkl'
with open(embeddings_path, "rb") as fIn:
  stored_data = pickle.load(fIn)
embeddings = stored_data['embeddings']
labels = stored_data['labels']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)
unique, counts = np.unique(y_test, return_counts=True)
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
print(X_train[10][:10], y_test[50])

[-0.04389033 -0.09177981 -0.02245235 -0.05314789 -0.02382849 -0.08375161
 -0.01472456 -0.00565096  0.02871884  0.02558138] 30


**KNN**

In [27]:
#KNN
clf_KNN = KNeighborsClassifier(algorithm= "ball_tree")
param_dist_KNN = {
    "n_neighbors": [3,5,11,21,51,101,151,201],
    "p": [1,2],
    "weights" :["uniform","distance"]
}
n_it = 10
random_search_KNN = RandomizedSearchCV(
    clf_KNN, param_distributions=param_dist_KNN, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 159.55 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7479 (std: 0.0099)[0m
[1mParameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 11}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7471 (std: 0.0123)[0m
[1mParameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 21}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7363 (std: 0.0107)[0m
[1mParameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 51}[0m



In [28]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7393[0m


**Logistic Regression**

In [29]:
#Logistic Regression

clf_LR = LogisticRegression(random_state=random_state,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR = {
    "C": [1e-1,1,10],
    "penalty": ["l2", "none"]
}
n_it = 6
random_search_LR = RandomizedSearchCV(
    clf_LR, param_distributions=param_dist_LR, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 98.86 seconds for 8 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7819 (std: 0.0149)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7725 (std: 0.0168)[0m
[1mParameters: {'C': 10}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7485 (std: 0.0165)[0m
[1mParameters: {'C': 0.1}[0m



In [30]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7847[0m


**SVC**

In [31]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC = {
    'C': [1e-1,1,10],
    "penalty": ["l2", "none"]
    }
n_it = 6
random_search_SVC = RandomizedSearchCV(
    clf_SVC, param_distributions=param_dist_SVC, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 39.13 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7842 (std: 0.0135)[0m
[1mParameters: {'C': 0.1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7761 (std: 0.0141)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7415 (std: 0.0124)[0m
[1mParameters: {'C': 10}[0m



In [32]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7835[0m


In [33]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF = {
    'n_estimators': [10,50,100,200],
    }
n_it = 10
random_search_RF = RandomizedSearchCV(
    clf_RF, param_distributions=param_dist_RF, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 99.39 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7271 (std: 0.0114)[0m
[1mParameters: {'n_estimators': 200}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7133 (std: 0.0082)[0m
[1mParameters: {'n_estimators': 100}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6842 (std: 0.0093)[0m
[1mParameters: {'n_estimators': 50}[0m

[1mA acurácia balanceada nos dados de teste: 0.7456[0m


# Pipeline 4: WOS + all-MiniLM-L12-v2


In [34]:
#LOADING DATA
import pandas as pd
import numpy as np
import pickle
from IPython.display import clear_output
from sklearn import preprocessing
clear_output()
#datasets = ['TC02_20news', 'TC14_wos']
#models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
dataset = 'TC14_wos'
model_name = "all-MiniLM-L12-v2"
embeddings_path = f'Arquivos TCC/embeddings/sbert/{model_name}_{dataset}.pkl'
with open(embeddings_path, "rb") as fIn:
  stored_data = pickle.load(fIn)
embeddings = stored_data['embeddings']
labels = stored_data['labels']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)
unique, counts = np.unique(y_test, return_counts=True)
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
print(X_train[10][:10], y_test[50])

[-0.03677584 -0.08568367  0.03027113  0.00241753 -0.07861282 -0.10965899
 -0.09844724  0.00531735  0.06428049  0.03564424] 30


**KNN**

In [35]:
#KNN
clf_KNN = KNeighborsClassifier(algorithm= "ball_tree")
param_dist_KNN = {
    "n_neighbors": [3,5,11,21,51,101,151,201],
    "p": [1,2],
    "weights" :["uniform","distance"]
}
n_it = 10
random_search_KNN = RandomizedSearchCV(
    clf_KNN, param_distributions=param_dist_KNN, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 152.62 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7280 (std: 0.0114)[0m
[1mParameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 21}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7264 (std: 0.0162)[0m
[1mParameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 11}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7221 (std: 0.0090)[0m
[1mParameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 51}[0m



In [36]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7243[0m


**Logistic Regression**

In [37]:
#Logistic Regression

clf_LR = LogisticRegression(random_state=random_state,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR = {
    "C": [1e-1,1,10],
    "penalty": ["l2", "none"]
}
n_it = 6
random_search_LR = RandomizedSearchCV(
    clf_LR, param_distributions=param_dist_LR, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 102.03 seconds for 8 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7599 (std: 0.0138)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7491 (std: 0.0135)[0m
[1mParameters: {'C': 10}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7338 (std: 0.0110)[0m
[1mParameters: {'C': 0.1}[0m



In [38]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7666[0m


**SVC**

In [39]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC = {
    'C': [1e-1,1,10],
    "penalty": ["l2", "none"]
    }
n_it = 6
random_search_SVC = RandomizedSearchCV(
    clf_SVC, param_distributions=param_dist_SVC, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 41.26 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7621 (std: 0.0154)[0m
[1mParameters: {'C': 0.1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7531 (std: 0.0174)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7129 (std: 0.0129)[0m
[1mParameters: {'C': 10}[0m



In [40]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7567[0m


In [41]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF = {
    'n_estimators': [10,50,100,200],
    }
n_it = 10
random_search_RF = RandomizedSearchCV(
    clf_RF, param_distributions=param_dist_RF, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 101.23 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7154 (std: 0.0088)[0m
[1mParameters: {'n_estimators': 200}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7005 (std: 0.0113)[0m
[1mParameters: {'n_estimators': 100}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6675 (std: 0.0125)[0m
[1mParameters: {'n_estimators': 50}[0m

[1mA acurácia balanceada nos dados de teste: 0.7112[0m


# Pipeline 5: 20NG + TF-IDF


In [42]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
import pandas as pd
import numpy as np
import pickle
from sklearn.pipeline import make_pipeline
dataset = 'TC02_20news'
model_name = 'TFIDF'
df = pd.read_csv(f'Arquivos TCC/datasets/{dataset}.csv', sep=';')
df = df.dropna()
textos = df['text'].values.tolist()
labels = df['label']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
textos_train, textos_test, y_train, y_test = train_test_split(textos, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)

In [43]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
unique, counts = np.unique(y_test, return_counts=True)
print(y_train[10], y_test[50])

8 12


**KNN**

In [44]:

KNN = KNeighborsClassifier(n_neighbors=150, p = 2, weights='uniform', metric='cosine')
LR = LogisticRegression(C=10, solver='sag', multi_class='multinomial', random_state=random_state, n_jobs=-1, tol = 1e-4, max_iter=200)
SVC = LinearSVC()
pipeline_KNN = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), KNN)
pipeline_KNN.fit(textos_train, y_train)
pipeline_LR = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), LR)
pipeline_SVC = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), SVC)

from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,pipeline_KNN.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7413[0m


In [45]:
pipeline_SVC.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'tfidfvectorizer', 'linearsvc', 'tfidfvectorizer__analyzer', 'tfidfvectorizer__binary', 'tfidfvectorizer__decode_error', 'tfidfvectorizer__dtype', 'tfidfvectorizer__encoding', 'tfidfvectorizer__input', 'tfidfvectorizer__lowercase', 'tfidfvectorizer__max_df', 'tfidfvectorizer__max_features', 'tfidfvectorizer__min_df', 'tfidfvectorizer__ngram_range', 'tfidfvectorizer__norm', 'tfidfvectorizer__preprocessor', 'tfidfvectorizer__smooth_idf', 'tfidfvectorizer__stop_words', 'tfidfvectorizer__strip_accents', 'tfidfvectorizer__sublinear_tf', 'tfidfvectorizer__token_pattern', 'tfidfvectorizer__tokenizer', 'tfidfvectorizer__use_idf', 'tfidfvectorizer__vocabulary', 'linearsvc__C', 'linearsvc__class_weight', 'linearsvc__dual', 'linearsvc__fit_intercept', 'linearsvc__intercept_scaling', 'linearsvc__loss', 'linearsvc__max_iter', 'linearsvc__multi_class', 'linearsvc__penalty', 'linearsvc__random_state', 'linearsvc__tol', 'linearsvc__verbose'])

In [46]:
clf_KNN = KNeighborsClassifier()
param_dist_KNN_pipeline = {
    'kneighborsclassifier__n_neighbors': [3,5,11,21,51,101,151,201],
    'kneighborsclassifier__p': [2],
    'kneighborsclassifier__weights' :["uniform","distance"]
}
pipeline_KNN = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_KNN)
random_search_KNN = RandomizedSearchCV(
    pipeline_KNN, param_distributions=param_dist_KNN_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 61.25 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7534 (std: 0.0074)[0m
[1mParameters: {'kneighborsclassifier__weights': 'uniform', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 51}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7412 (std: 0.0069)[0m
[1mParameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 101}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7338 (std: 0.0068)[0m
[1mParameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 151}[0m



In [47]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7408[0m


**Logistic Regression**

In [48]:
clf_LR = LogisticRegression(random_state=random_state,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR_pipeline = {
    'logisticregression__C': [0.1,1,10],
    "logisticregression__penalty": ["l2", "none"]
}
pipeline_LR = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_LR)
n_it = 10
random_search_LR = RandomizedSearchCV(
    pipeline_LR, param_distributions=param_dist_LR_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 73.60 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8442 (std: 0.0055)[0m
[1mParameters: {'logisticregression__C': 10}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8339 (std: 0.0069)[0m
[1mParameters: {'logisticregression__C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7868 (std: 0.0104)[0m
[1mParameters: {'logisticregression__C': 0.1}[0m



In [49]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8544[0m


**SVC**

In [50]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC_pipeline = {
    'linearsvc__C': [1e-1,1,10],
    "linearsvc__penalty": ["l2", "none"]
    }
pipeline_SVC = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_SVC)
n_it = 6
random_search_SVC = RandomizedSearchCV(
    pipeline_SVC, param_distributions=param_dist_SVC_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 62.09 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8534 (std: 0.0058)[0m
[1mParameters: {'linearsvc__C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8497 (std: 0.0059)[0m
[1mParameters: {'linearsvc__C': 10}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.8389 (std: 0.0076)[0m
[1mParameters: {'linearsvc__C': 0.1}[0m



In [51]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8604[0m


In [52]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF_pipeline = {
    'randomforestclassifier__n_estimators': [10,50,100,200],
    }
pipeline_RF = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_RF)
n_it = 10
random_search_RF = RandomizedSearchCV(
    pipeline_RF, param_distributions=param_dist_RF_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 1315.91 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7867 (std: 0.0098)[0m
[1mParameters: {'randomforestclassifier__n_estimators': 100}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7685 (std: 0.0075)[0m
[1mParameters: {'randomforestclassifier__n_estimators': 50}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6683 (std: 0.0068)[0m
[1mParameters: {'randomforestclassifier__n_estimators': 10}[0m

[1mA acurácia balanceada nos dados de teste: 0.7949[0m


In [53]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7949[0m


# Pipeline 6: WOS + TF-IDF

In [54]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
import pandas as pd
import numpy as np
import pickle
from sklearn.pipeline import make_pipeline
dataset = 'TC14_wos'
model_name = 'TFIDF'
df = pd.read_csv(f'Arquivos TCC/datasets/{dataset}.csv', sep=';')
df = df.dropna()
textos = df['text'].values.tolist()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
labels = df['label']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
textos_train, textos_test, y_train, y_test = train_test_split(textos, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)

In [55]:
KNN = KNeighborsClassifier(n_neighbors=150, p = 2, weights='distance')
KNN.fit(X_train, y_train)
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,KNN.predict(X_test))
print("A acurácia balanceada nos dados de teste: {0:.3f}".format(balanced_acc))

A acurácia balanceada nos dados de teste: 0.709


In [56]:
LR = LogisticRegression(C=10, solver='sag', multi_class='multinomial', random_state=random_state, n_jobs=-1, tol = 1e-4, max_iter=200)
LR.fit(X_train, y_train)
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,LR.predict(X_test))
print("A acurácia balanceada nos dados de teste: {0:.3f}".format(balanced_acc))

A acurácia balanceada nos dados de teste: 0.747


**KNN**

In [57]:
clf_KNN = KNeighborsClassifier()
param_dist_KNN_pipeline = {
    'kneighborsclassifier__n_neighbors': [3,5,11,21,51,101,151,201],
    'kneighborsclassifier__p': [2],
    'kneighborsclassifier__weights' :["uniform","distance"]
}
pipeline_KNN = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_KNN)
n_it=10
random_search_KNN = RandomizedSearchCV(
    pipeline_KNN, param_distributions=param_dist_KNN_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 43.29 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7016 (std: 0.0133)[0m
[1mParameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 201}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.6993 (std: 0.0105)[0m
[1mParameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 151}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6966 (std: 0.0119)[0m
[1mParameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 101}[0m



In [58]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.6987[0m


**Logistic Regression**

In [59]:
clf_LR = LogisticRegression(random_state=random_state,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR_pipeline = {
    'logisticregression__C': [0.1,1,10],
    "logisticregression__penalty": ["l2", "none"]
}
pipeline_LR = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_LR)
n_it = 6
random_search_LR = RandomizedSearchCV(
    pipeline_LR, param_distributions=param_dist_LR_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 36.85 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8192 (std: 0.0156)[0m
[1mParameters: {'logisticregression__C': 10}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8080 (std: 0.0140)[0m
[1mParameters: {'logisticregression__C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6967 (std: 0.0106)[0m
[1mParameters: {'logisticregression__C': 0.1}[0m



In [60]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8092[0m


**SVC**

In [61]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC_pipeline = {
    'linearsvc__C': [1e-1,1,10],
    "linearsvc__penalty": ["l2", "none"]
    }
pipeline_SVC = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_SVC)
n_it = 6
random_search_SVC = RandomizedSearchCV(
    pipeline_SVC, param_distributions=param_dist_SVC_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 19.41 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8239 (std: 0.0147)[0m
[1mParameters: {'linearsvc__C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8167 (std: 0.0153)[0m
[1mParameters: {'linearsvc__C': 0.1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.8144 (std: 0.0142)[0m
[1mParameters: {'linearsvc__C': 10}[0m



In [62]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8229[0m


In [63]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF_pipeline = {
    'randomforestclassifier__n_estimators': [10,50,100,200],
    }
pipeline_RF = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_RF)
n_it = 10
random_search_RF = RandomizedSearchCV(
    pipeline_RF, param_distributions=param_dist_RF_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 66.93 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8429 (std: 0.0128)[0m
[1mParameters: {'randomforestclassifier__n_estimators': 100}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8266 (std: 0.0116)[0m
[1mParameters: {'randomforestclassifier__n_estimators': 50}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6943 (std: 0.0172)[0m
[1mParameters: {'randomforestclassifier__n_estimators': 10}[0m

[1mA acurácia balanceada nos dados de teste: 0.8450[0m


# Testes Resultados


In [64]:
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

In [65]:
best_KNN = KNeighborsClassifier(n_neighbors=2, p = 1, weights='distance', algorithm='ball_tree')
cv_results = cross_validate(best_KNN, X_train, y_train, cv=cv, n_jobs=-1, scoring='balanced_accuracy')

In [66]:
cv_results['test_score'].mean()

0.6676216753111552

In [67]:
df_cv = pd.read_csv('Arquivos TCC/perfomance_cv.csv')

In [68]:
df_cv

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,pipeline,params
0,0.746498,0.729844,0.742484,0.750318,0.736504,0.753034,0.758034,0.735843,0.748102,0.735337,TC02_20news+all-MiniLM-L6-v2+LR,{'C': 0.1}
1,0.844875,0.813789,0.808652,0.828727,0.810456,0.82583,0.82588,0.835406,0.814313,0.827628,TC02_20news+all-MiniLM-L6-v2+KNN,"{'weights': 'distance', 'p': 1, 'n_neighbors': 3}"
2,0.785418,0.761511,0.767484,0.784788,0.772634,0.786423,0.787553,0.773096,0.774763,0.788702,TC02_20news+all-MiniLM-L6-v2+LR,{'C': 10}
3,0.781523,0.757634,0.77083,0.786479,0.769856,0.78196,0.794245,0.768127,0.779245,0.785387,TC02_20news+all-MiniLM-L6-v2+SVC,{'C': 1}
4,0.822653,0.814338,0.809732,0.82427,0.819307,0.820899,0.817572,0.829844,0.817079,0.822097,TC02_20news+all-MiniLM-L12-v2+KNN,"{'weights': 'distance', 'p': 1, 'n_neighbors': 3}"
5,0.780955,0.775993,0.771935,0.762541,0.773196,0.765262,0.779213,0.774201,0.774176,0.764763,TC02_20news+all-MiniLM-L12-v2+LR,{'C': 10}
6,0.774856,0.774881,0.774139,0.775343,0.773745,0.775843,0.781436,0.77867,0.771985,0.767578,TC02_20news+all-MiniLM-L12-v2+SVC,{'C': 1}
7,0.767507,0.752724,0.741839,0.728256,0.741307,0.753414,0.758569,0.752884,0.738314,0.753632,TC14_wos+all-MiniLM-L6-v2+KNN,"{'weights': 'distance', 'p': 2, 'n_neighbors':..."
8,0.792379,0.792123,0.779848,0.760539,0.775306,0.78125,0.795834,0.764415,0.770571,0.790859,TC14_wos+all-MiniLM-L6-v2+LR,{'C': 1}
9,0.794099,0.796451,0.780187,0.76793,0.776353,0.782674,0.800438,0.764082,0.771376,0.789538,TC14_wos+all-MiniLM-L6-v2+SVC,{'C': 0.1}


In [69]:

df_cv_explode = df_cv.melt(['pipeline','params'], var_name='Date', value_name='Value')

In [70]:
df_cv.filter(regex=("split*"))

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score
0,0.746498,0.729844,0.742484,0.750318,0.736504,0.753034,0.758034,0.735843,0.748102,0.735337
1,0.844875,0.813789,0.808652,0.828727,0.810456,0.82583,0.82588,0.835406,0.814313,0.827628
2,0.785418,0.761511,0.767484,0.784788,0.772634,0.786423,0.787553,0.773096,0.774763,0.788702
3,0.781523,0.757634,0.77083,0.786479,0.769856,0.78196,0.794245,0.768127,0.779245,0.785387
4,0.822653,0.814338,0.809732,0.82427,0.819307,0.820899,0.817572,0.829844,0.817079,0.822097
5,0.780955,0.775993,0.771935,0.762541,0.773196,0.765262,0.779213,0.774201,0.774176,0.764763
6,0.774856,0.774881,0.774139,0.775343,0.773745,0.775843,0.781436,0.77867,0.771985,0.767578
7,0.767507,0.752724,0.741839,0.728256,0.741307,0.753414,0.758569,0.752884,0.738314,0.753632
8,0.792379,0.792123,0.779848,0.760539,0.775306,0.78125,0.795834,0.764415,0.770571,0.790859
9,0.794099,0.796451,0.780187,0.76793,0.776353,0.782674,0.800438,0.764082,0.771376,0.789538
