In [1]:
#from google.colab import drive
#drive.mount('/content/drive')
import pandas as pd
import numpy as np
from IPython.display import clear_output
from tqdm import tqdm
from time import time
import json
from sklearn.utils.fixes import *
random_state = 20

In [2]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [3]:
# Utility function to report best scores
def report(results, pipeline, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        if i == 1:
          results['pipeline'] = pipeline
          df_results = pd.DataFrame(results)
          df = df_results[df_results['rank_test_score'] == 1].filter(regex=("split*"))
          df['pipeline'] = df_results['pipeline']
          df['params'] = df_results['params']
          with open(f'Arquivos TCC/perfomance_cv.csv', 'a') as f:
            df.to_csv(f, mode='a', header=f.tell()==0, index=False)
        for candidate in candidates:
            print(color.BOLD + "Model with rank: {0}".format(i)+ color.END)
            print(color.BOLD + 
                "Mean validation score: {0:.4f} (std: {1:.4f})".format(
                    results["mean_test_score"][candidate],
                    results["std_test_score"][candidate],
                )+ color.END
            )
            print(color.BOLD + "Parameters: {0}".format(results["params"][candidate])+ color.END)
            print("")

# Obtenção dos embeddings

In [None]:
import pandas as pd
import numpy as np
!pip install -U sentence-transformers
from IPython.display import clear_output
from sentence_transformers import SentenceTransformer
import pickle
clear_output()

In [None]:
###### Models ######
models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
###### Datasets ######
datasets = ['TC02_20news', 'TC14_wos']
for dataset in datasets:
  df = pd.read_csv(f'Arquivos TCC/datasets/{dataset}.csv', sep=';')
  df = df.dropna()
  for model_name in models_names:
    model = SentenceTransformer(f"sentence-transformers/{model_name}")
    textos = df['text'].values.tolist()
    embeddings = model.encode(textos)
    with open(f'Arquivos TCC/embeddings/sbert/{model_name}_{dataset}.pkl', "wb") as fOut:
      pickle.dump({'sentences': textos, 'embeddings': embeddings, 'labels': df['label']}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

clear_output()

In [None]:
#umap para redução da dimensão do embedding
!pip install umap-learn
from umap import UMAP
import pandas as pd
import numpy as np
import pickle
from IPython.display import clear_output
clear_output()
datasets = ['TC02_20news', 'TC14_wos']
models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
for model_name in models_names:
  for dataset in datasets:
    embeddings_path = f'Arquivos TCC/embeddings/{model_name}_{dataset}.pkl'
    with open(embeddings_path, "rb") as fIn:
      stored_data = pickle.load(fIn)
      stored_sentences = stored_data['sentences']
      stored_embeddings = stored_data['embeddings']
      stored_labels = stored_data['labels']
      umap_model = UMAP(
              n_components=30,
              min_dist=0.01,
              random_state=5,
              n_neighbors=20,
              metric='cosine',
              low_memory=True
          )
      embeddings_umap = umap_model.fit_transform(stored_embeddings)
      with open(f'Arquivos TCC/embeddings/umap/{model_name}_{dataset}.pkl', "wb") as fOut:
        pickle.dump({'sentences': stored_sentences, 'umap_embeddings': embeddings_umap, 'labels': stored_labels}, fOut, protocol=pickle.HIGHEST_PROTOCOL)



# Pipeline 1: 20NG + all-MiniLM-L6-v2


In [5]:
#LOADING DATA
import pandas as pd
import numpy as np
import pickle
from IPython.display import clear_output
from sklearn import preprocessing
clear_output()
#datasets = ['TC02_20news', 'TC14_wos']
#models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
dataset = 'TC02_20news'
model_name = "all-MiniLM-L6-v2"
embeddings_path = f'Arquivos TCC/embeddings/sbert/{model_name}_{dataset}.pkl'
with open(embeddings_path, "rb") as fIn:
  stored_data = pickle.load(fIn)
embeddings = stored_data['embeddings']
labels = stored_data['labels']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
unique, counts = np.unique(stored_data['labels'], return_counts=True)
#Split hold-out test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)
unique, counts = np.unique(y_test, return_counts=True)
#dict(zip(unique, counts))
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
cv = StratifiedKFold(n_splits=10, shuffle=False)#, random_state=random_state)

**KNN**

In [26]:
#KNN
clf_KNN = KNeighborsClassifier(algorithm= "ball_tree")
param_dist_KNN = {
    "n_neighbors": [3,5,11,21],
    "p": [1,2],
    "weights" :["uniform","distance"]
}
n_it = 10
random_search_KNN = RandomizedSearchCV(
    clf_KNN, param_distributions=param_dist_KNN, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_ , f'{dataset}+{model_name}+KNN')



RandomizedSearchCV took 770.67 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8236 (std: 0.0111)[0m
[1mParameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 3}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8202 (std: 0.0102)[0m
[1mParameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 5}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.8186 (std: 0.0126)[0m
[1mParameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 5}[0m



In [27]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8279[0m


**Logistic Regression**

In [28]:
#Logistic Regression

clf_LR = LogisticRegression(random_state=42,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR = {
    "C": [1e-1,1,10]
}
n_it = 4
random_search_LR = RandomizedSearchCV(
    clf_LR, param_distributions=param_dist_LR, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')


RandomizedSearchCV took 308.60 seconds for 4 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7782 (std: 0.0091)[0m
[1mParameters: {'C': 10}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7732 (std: 0.0066)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7436 (std: 0.0086)[0m
[1mParameters: {'C': 0.1}[0m



In [29]:
balanced_acc= balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7774[0m


**SVC**

In [30]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC = {
    'C': [0.1,1, 10],
    }
n_it = 6
random_search_SVC = RandomizedSearchCV(
    clf_SVC, param_distributions=param_dist_SVC, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 138.70 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7775 (std: 0.0102)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7711 (std: 0.0093)[0m
[1mParameters: {'C': 10}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7690 (std: 0.0067)[0m
[1mParameters: {'C': 0.1}[0m



In [31]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7879[0m


In [72]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF = {
    'n_estimators': [10,50,100,200],
    }
n_it = 10
random_search_RF = RandomizedSearchCV(
    clf_RF, param_distributions=param_dist_RF, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 299.58 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8239 (std: 0.0147)[0m
[1mParameters: {'linearsvc__C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8167 (std: 0.0153)[0m
[1mParameters: {'linearsvc__C': 0.1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.8144 (std: 0.0142)[0m
[1mParameters: {'linearsvc__C': 10}[0m



In [73]:
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')

[1mModel with rank: 1[0m
[1mMean validation score: 0.7643 (std: 0.0075)[0m
[1mParameters: {'n_estimators': 200}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7470 (std: 0.0095)[0m
[1mParameters: {'n_estimators': 100}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7146 (std: 0.0091)[0m
[1mParameters: {'n_estimators': 50}[0m



# Pipeline 2: 20NG + all-MiniLM-L12-v2

In [13]:
#LOADING DATA
import pandas as pd
import numpy as np
import pickle
from IPython.display import clear_output
from sklearn import preprocessing
clear_output()
#datasets = ['TC02_20news', 'TC14_wos']
#models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
dataset = 'TC02_20news'
model_name = "all-MiniLM-L12-v2"
embeddings_path = f'Arquivos TCC/embeddings/sbert/{model_name}_{dataset}.pkl'
with open(embeddings_path, "rb") as fIn:
  stored_data = pickle.load(fIn)
embeddings = stored_data['embeddings']
labels = stored_data['labels']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)
unique, counts = np.unique(y_test, return_counts=True)
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
cv = StratifiedKFold(n_splits=10, shuffle=False)#, random_state=random_state)
print(X_train[10][:10],'\n', y_test[50])

[ 0.04915353  0.08488183 -0.01936629  0.00663146 -0.01925441  0.00633633
  0.00899213  0.04904805 -0.08862329  0.03560806] 
 12


**KNN**

In [33]:
#KNN
clf_KNN = KNeighborsClassifier(algorithm= "ball_tree")
param_dist_KNN = {
    "n_neighbors": [3,5,11,21],
    "p": [1,2],
    "weights" :["uniform","distance"]
}
n_it = 10
random_search_KNN = RandomizedSearchCV(
    clf_KNN, param_distributions=param_dist_KNN, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 755.04 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8198 (std: 0.0053)[0m
[1mParameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 3}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8151 (std: 0.0073)[0m
[1mParameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 5}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.8144 (std: 0.0064)[0m
[1mParameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 5}[0m



In [34]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8179[0m


**Logistic Regression**

In [35]:
#Logistic Regression

clf_LR = LogisticRegression(random_state=42,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR = {
    "C": [1e-1,1,10]
}
n_it = 8
random_search_LR = RandomizedSearchCV(
    clf_LR, param_distributions=param_dist_LR, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 383.07 seconds for 8 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7722 (std: 0.0059)[0m
[1mParameters: {'C': 10}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7674 (std: 0.0052)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7418 (std: 0.0080)[0m
[1mParameters: {'C': 0.1}[0m



In [36]:
balanced_acc= balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7779[0m


**SVC**

In [37]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC = {
    'C': [0.1,1, 10],
    }
n_it = 6
random_search_SVC = RandomizedSearchCV(
    clf_SVC, param_distributions=param_dist_SVC, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 143.49 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7748 (std: 0.0035)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7671 (std: 0.0051)[0m
[1mParameters: {'C': 0.1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7656 (std: 0.0057)[0m
[1mParameters: {'C': 10}[0m



In [38]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7779[0m


In [14]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF = {
    'n_estimators': [10,50,100,200],
    }
n_it = 10
random_search_RF = RandomizedSearchCV(
    clf_RF, param_distributions=param_dist_RF, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 755.72 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7711 (std: 0.0105)[0m
[1mParameters: {'n_estimators': 200}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7507 (std: 0.0115)[0m
[1mParameters: {'n_estimators': 100}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7202 (std: 0.0081)[0m
[1mParameters: {'n_estimators': 50}[0m

[1mA acurácia balanceada nos dados de teste: 0.7829[0m


# Pipeline 3: WOS + all-MiniLM-L6-v2

In [5]:
#LOADING DATA
import pandas as pd
import numpy as np
import pickle
from IPython.display import clear_output
from sklearn import preprocessing
clear_output()
#datasets = ['TC02_20news', 'TC14_wos']
#models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
dataset = 'TC14_wos'
model_name = "all-MiniLM-L6-v2"
embeddings_path = f'Arquivos TCC/embeddings/sbert/{model_name}_{dataset}.pkl'
with open(embeddings_path, "rb") as fIn:
  stored_data = pickle.load(fIn)
embeddings = stored_data['embeddings']
labels = stored_data['labels']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)
unique, counts = np.unique(y_test, return_counts=True)
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
cv = StratifiedKFold(n_splits=10, shuffle=False)#, random_state=random_state)
print(X_train[10][:10], y_test[50])

[-0.04389033 -0.09177981 -0.02245235 -0.05314789 -0.02382849 -0.08375161
 -0.01472456 -0.00565096  0.02871884  0.02558138] 30


**KNN**

In [40]:
#KNN
clf_KNN = KNeighborsClassifier(algorithm= "ball_tree")
param_dist_KNN = {
    "n_neighbors": [3,5,11,21],
    "p": [1,2],
    "weights" :["uniform","distance"]
}
n_it = 10
random_search_KNN = RandomizedSearchCV(
    clf_KNN, param_distributions=param_dist_KNN, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 275.16 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7488 (std: 0.0108)[0m
[1mParameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 11}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7463 (std: 0.0102)[0m
[1mParameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 21}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7441 (std: 0.0098)[0m
[1mParameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 11}[0m



In [41]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7468[0m


**Logistic Regression**

In [42]:
#Logistic Regression

clf_LR = LogisticRegression(random_state=42,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR = {
    "C": [1e-1,1,10]
}
n_it = 8
random_search_LR = RandomizedSearchCV(
    clf_LR, param_distributions=param_dist_LR, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 281.27 seconds for 8 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7803 (std: 0.0118)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7726 (std: 0.0114)[0m
[1mParameters: {'C': 10}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7499 (std: 0.0112)[0m
[1mParameters: {'C': 0.1}[0m



In [43]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7847[0m


**SVC**

In [45]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC = {
    'C': [0.1,1, 10],
    }
n_it = 6
random_search_SVC = RandomizedSearchCV(
    clf_SVC, param_distributions=param_dist_SVC, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 68.02 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7823 (std: 0.0119)[0m
[1mParameters: {'C': 0.1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7773 (std: 0.0128)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7393 (std: 0.0094)[0m
[1mParameters: {'C': 10}[0m



In [46]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7835[0m


In [28]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF = {
    'n_estimators': [10,50,100,200],
    }
n_it = 10
random_search_RF = RandomizedSearchCV(
    clf_RF, param_distributions=param_dist_RF, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 496.93 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7312 (std: 0.0115)[0m
[1mParameters: {'n_estimators': 200}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7137 (std: 0.0127)[0m
[1mParameters: {'n_estimators': 100}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6810 (std: 0.0117)[0m
[1mParameters: {'n_estimators': 50}[0m

[1mA acurácia balanceada nos dados de teste: 0.7456[0m


# Pipeline 4: WOS + all-MiniLM-L12-v2


In [4]:
#LOADING DATA
import pandas as pd
import numpy as np
import pickle
from IPython.display import clear_output
from sklearn import preprocessing
clear_output()
#datasets = ['TC02_20news', 'TC14_wos']
#models_names = ["all-MiniLM-L6-v2","all-MiniLM-L12-v2"]
dataset = 'TC14_wos'
model_name = "all-MiniLM-L12-v2"
embeddings_path = f'Arquivos TCC/embeddings/sbert/{model_name}_{dataset}.pkl'
with open(embeddings_path, "rb") as fIn:
  stored_data = pickle.load(fIn)
embeddings = stored_data['embeddings']
labels = stored_data['labels']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)
unique, counts = np.unique(y_test, return_counts=True)
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
cv = StratifiedKFold(n_splits=10, shuffle=False)#, random_state=random_state)
print(X_train[10][:10], y_test[50])

[-0.03677584 -0.08568367  0.03027113  0.00241753 -0.07861282 -0.10965899
 -0.09844724  0.00531735  0.06428049  0.03564424] 30


**KNN**

In [48]:
#KNN
clf_KNN = KNeighborsClassifier(algorithm= "ball_tree")
param_dist_KNN = {
    "n_neighbors": [3,5,11,21],
    "p": [1,2],
    "weights" :["uniform","distance"]
}
n_it = 10
random_search_KNN = RandomizedSearchCV(
    clf_KNN, param_distributions=param_dist_KNN, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 268.88 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7297 (std: 0.0093)[0m
[1mParameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 11}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7280 (std: 0.0098)[0m
[1mParameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 21}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7261 (std: 0.0091)[0m
[1mParameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 21}[0m



In [49]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7378[0m


**Logistic Regression**

In [50]:
#Logistic Regression

clf_LR = LogisticRegression(random_state=42,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR = {
    "C": [1e-1,1,10]
}
n_it = 8
random_search_LR = RandomizedSearchCV(
    clf_LR, param_distributions=param_dist_LR, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 273.23 seconds for 8 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7602 (std: 0.0080)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7448 (std: 0.0098)[0m
[1mParameters: {'C': 10}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7347 (std: 0.0097)[0m
[1mParameters: {'C': 0.1}[0m



In [51]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7666[0m


**SVC**

In [52]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC = {
    'C': [0.1,1, 10],
    }
n_it = 6
random_search_SVC = RandomizedSearchCV(
    clf_SVC, param_distributions=param_dist_SVC, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 77.75 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7585 (std: 0.0088)[0m
[1mParameters: {'C': 0.1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7511 (std: 0.0132)[0m
[1mParameters: {'C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7094 (std: 0.0133)[0m
[1mParameters: {'C': 10}[0m



In [None]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7567[0m


In [5]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF = {
    'n_estimators': [10,50,100,200],
    }
n_it = 10
random_search_RF = RandomizedSearchCV(
    clf_RF, param_distributions=param_dist_RF, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(X_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 639.65 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7160 (std: 0.0122)[0m
[1mParameters: {'n_estimators': 200}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7019 (std: 0.0101)[0m
[1mParameters: {'n_estimators': 100}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6681 (std: 0.0113)[0m
[1mParameters: {'n_estimators': 50}[0m

[1mA acurácia balanceada nos dados de teste: 0.7112[0m


# Pipeline 5: 20NG + TF-IDF


In [17]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
import pandas as pd
import numpy as np
import pickle
from sklearn.pipeline import make_pipeline
dataset = 'TC02_20news'
model_name = 'TDIDF'
df = pd.read_csv(f'Arquivos TCC/datasets/{dataset}.csv', sep=';')
df = df.dropna()
textos = df['text'].values.tolist()
labels = df['label']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
textos_train, textos_test, y_train, y_test = train_test_split(textos, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
unique, counts = np.unique(y_test, return_counts=True)
print(y_train[10], y_test[50])

8 12


**KNN**

In [55]:

KNN = KNeighborsClassifier(n_neighbors=150, p = 2, weights='uniform', metric='cosine')
LR = LogisticRegression(C=10, solver='sag', multi_class='multinomial', random_state=random_state, n_jobs=-1, tol = 1e-4, max_iter=200)
SVC = LinearSVC()
pipeline_KNN = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), KNN)
pipeline_KNN.fit(textos_train, y_train)
pipeline_LR = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), LR)
pipeline_SVC = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), SVC)

from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,pipeline_KNN.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7413[0m


In [None]:
pipeline_SVC.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'tfidfvectorizer', 'linearsvc', 'tfidfvectorizer__analyzer', 'tfidfvectorizer__binary', 'tfidfvectorizer__decode_error', 'tfidfvectorizer__dtype', 'tfidfvectorizer__encoding', 'tfidfvectorizer__input', 'tfidfvectorizer__lowercase', 'tfidfvectorizer__max_df', 'tfidfvectorizer__max_features', 'tfidfvectorizer__min_df', 'tfidfvectorizer__ngram_range', 'tfidfvectorizer__norm', 'tfidfvectorizer__preprocessor', 'tfidfvectorizer__smooth_idf', 'tfidfvectorizer__stop_words', 'tfidfvectorizer__strip_accents', 'tfidfvectorizer__sublinear_tf', 'tfidfvectorizer__token_pattern', 'tfidfvectorizer__tokenizer', 'tfidfvectorizer__use_idf', 'tfidfvectorizer__vocabulary', 'linearsvc__C', 'linearsvc__class_weight', 'linearsvc__dual', 'linearsvc__fit_intercept', 'linearsvc__intercept_scaling', 'linearsvc__loss', 'linearsvc__max_iter', 'linearsvc__multi_class', 'linearsvc__penalty', 'linearsvc__random_state', 'linearsvc__tol', 'linearsvc__verbose'])

In [56]:
clf_KNN = KNeighborsClassifier()
param_dist_KNN_pipeline = {
    'kneighborsclassifier__n_neighbors': [15,51,101,151,201],
    'kneighborsclassifier__p': [2],
    'kneighborsclassifier__weights' :["uniform","distance"]
}
pipeline_KNN = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_KNN)
random_search_KNN = RandomizedSearchCV(
    pipeline_KNN, param_distributions=param_dist_KNN_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 69.36 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7534 (std: 0.0074)[0m
[1mParameters: {'kneighborsclassifier__weights': 'uniform', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 51}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7412 (std: 0.0069)[0m
[1mParameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 101}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7337 (std: 0.0068)[0m
[1mParameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 151}[0m



In [57]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7408[0m


**Logistic Regression**

In [6]:
clf_LR = LogisticRegression(random_state=42,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR_pipeline = {
    'logisticregression__C': [0.1,1,10]
}
pipeline_LR = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_LR)
n_it = 10
random_search_LR = RandomizedSearchCV(
    pipeline_LR, param_distributions=param_dist_LR_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 483.04 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8442 (std: 0.0055)[0m
[1mParameters: {'logisticregression__C': 10}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8340 (std: 0.0069)[0m
[1mParameters: {'logisticregression__C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.7867 (std: 0.0105)[0m
[1mParameters: {'logisticregression__C': 0.1}[0m



In [7]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8544[0m


**SVC**

In [60]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC_pipeline = {
    'linearsvc__C': [0.1,1, 10],
    }
pipeline_SVC = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_SVC)
n_it = 6
random_search_SVC = RandomizedSearchCV(
    pipeline_SVC, param_distributions=param_dist_SVC_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 111.65 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8534 (std: 0.0058)[0m
[1mParameters: {'linearsvc__C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8497 (std: 0.0059)[0m
[1mParameters: {'linearsvc__C': 10}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.8389 (std: 0.0076)[0m
[1mParameters: {'linearsvc__C': 0.1}[0m



In [61]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8604[0m


In [19]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF_pipeline = {
    'randomforestclassifier__n_estimators': [10,50,100],
    }
pipeline_RF = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_RF)
n_it = 10
random_search_RF = RandomizedSearchCV(
    pipeline_RF, param_distributions=param_dist_RF_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

RandomizedSearchCV took 7437.07 seconds for 10 candidates parameter settings.


[1mModel with rank: 1[0m
[1mMean validation score: 0.7867 (std: 0.0098)[0m
[1mParameters: {'randomforestclassifier__n_estimators': 100}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.7685 (std: 0.0075)[0m
[1mParameters: {'randomforestclassifier__n_estimators': 50}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6683 (std: 0.0068)[0m
[1mParameters: {'randomforestclassifier__n_estimators': 10}[0m



AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [22]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.7949[0m


# Pipeline 6: WOS + TF-IDF

In [62]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
import pandas as pd
import numpy as np
import pickle
dataset = 'TC14_wos'
model_name = 'TDIDF'
df = pd.read_csv(f'Arquivos TCC/datasets/{dataset}.csv', sep=';')
df = df.dropna()
textos = df['text'].values.tolist()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
labels = df['label']
labels_enc = preprocessing.LabelEncoder().fit_transform(labels)
#Split hold-out test data
from sklearn.model_selection import train_test_split
textos_train, textos_test, y_train, y_test = train_test_split(textos, labels_enc, test_size=0.10, random_state=random_state,stratify=labels_enc)

In [63]:
KNN = KNeighborsClassifier(n_neighbors=150, p = 2, weights='distance')
KNN.fit(X_train, y_train)
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,KNN.predict(X_test))
print("A acurácia balanceada nos dados de teste: {0:.3f}".format(balanced_acc))

A acurácia balanceada nos dados de teste: 0.709


In [64]:
LR = LogisticRegression(C=10, solver='sag', multi_class='multinomial', random_state=random_state, n_jobs=-1, tol = 1e-4, max_iter=200)
LR.fit(X_train, y_train)
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,LR.predict(X_test))
print("A acurácia balanceada nos dados de teste: {0:.3f}".format(balanced_acc))

A acurácia balanceada nos dados de teste: 0.747


**KNN**

In [65]:
clf_KNN = KNeighborsClassifier()
param_dist_KNN_pipeline = {
    'kneighborsclassifier__n_neighbors': [15,51,101,151,201],
    'kneighborsclassifier__p': [2],
    'kneighborsclassifier__weights' :["uniform","distance"]
}
pipeline_KNN = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_KNN)
n_it=10
random_search_KNN = RandomizedSearchCV(
    pipeline_KNN, param_distributions=param_dist_KNN_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_KNN.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_KNN.cv_results_, f'{dataset}+{model_name}+KNN')

RandomizedSearchCV took 46.25 seconds for 10 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.7016 (std: 0.0133)[0m
[1mParameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 201}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.6997 (std: 0.0131)[0m
[1mParameters: {'kneighborsclassifier__weights': 'uniform', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 201}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6993 (std: 0.0105)[0m
[1mParameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 2, 'kneighborsclassifier__n_neighbors': 151}[0m



In [66]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_KNN.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.6987[0m


**Logistic Regression**

In [67]:
clf_LR = LogisticRegression(random_state=42,multi_class='multinomial', solver='sag', tol= 1e-4,max_iter=300)
param_dist_LR_pipeline = {
    'logisticregression__C': [0.1,1,10]
}
pipeline_LR = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_LR)
n_it = 6
random_search_LR = RandomizedSearchCV(
    pipeline_LR, param_distributions=param_dist_LR_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_LR.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_LR.cv_results_, f'{dataset}+{model_name}+LR')

RandomizedSearchCV took 77.99 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8192 (std: 0.0156)[0m
[1mParameters: {'logisticregression__C': 10}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8080 (std: 0.0140)[0m
[1mParameters: {'logisticregression__C': 1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.6969 (std: 0.0107)[0m
[1mParameters: {'logisticregression__C': 0.1}[0m



In [68]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_LR.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8092[0m


**SVC**

In [69]:
from sklearn.svm import LinearSVC
clf_SVC = LinearSVC(random_state=random_state, tol=1e-4)
param_dist_SVC_pipeline = {
    'linearsvc__C': [0.1,1, 10],
    }
pipeline_SVC = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_SVC)
n_it = 6
random_search_SVC = RandomizedSearchCV(
    pipeline_SVC, param_distributions=param_dist_SVC_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_SVC.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_SVC.cv_results_, f'{dataset}+{model_name}+SVC')

RandomizedSearchCV took 33.25 seconds for 6 candidates parameter settings.
[1mModel with rank: 1[0m
[1mMean validation score: 0.8239 (std: 0.0147)[0m
[1mParameters: {'linearsvc__C': 1}[0m

[1mModel with rank: 2[0m
[1mMean validation score: 0.8167 (std: 0.0153)[0m
[1mParameters: {'linearsvc__C': 0.1}[0m

[1mModel with rank: 3[0m
[1mMean validation score: 0.8144 (std: 0.0142)[0m
[1mParameters: {'linearsvc__C': 10}[0m



In [70]:
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_SVC.best_estimator_.predict(textos_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

[1mA acurácia balanceada nos dados de teste: 0.8229[0m


In [None]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(random_state=random_state)
param_dist_RF_pipeline = {
    'randomforestclassifier__n_estimators': [10,50,100],
    }
pipeline_RF = make_pipeline(TfidfVectorizer(stop_words='english',smooth_idf=True), clf_RF)
n_it = 10
random_search_RF = RandomizedSearchCV(
    pipeline_RF, param_distributions=param_dist_RF_pipeline, n_iter=n_it, cv = cv, random_state = random_state, verbose = 2, scoring='balanced_accuracy', n_jobs=-1
)
start = time()
random_search_RF.fit(textos_train, y_train)
clear_output()
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
    % ((time() - start), n_it)
)
report(random_search_RF.cv_results_, f'{dataset}+{model_name}+RF')
from sklearn.metrics import balanced_accuracy_score
balanced_acc = balanced_accuracy_score(y_test,random_search_RF.best_estimator_.predict(X_test))
print(color.BOLD+"A acurácia balanceada nos dados de teste: {0:.4f}".format(balanced_acc)+color.END)

# Testes Resultados


In [None]:
#K-fold cross validation strategy and import dependencies
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

In [None]:
best_KNN = KNeighborsClassifier(n_neighbors=2, p = 1, weights='distance', algorithm='ball_tree')
cv_results = cross_validate(best_KNN, X_train, y_train, cv=cv, n_jobs=-1, scoring='balanced_accuracy')

In [None]:
cv_results['test_score'].mean()

0.8187464985994397

In [None]:
df_cv = pd.read_csv('Arquivos TCC/perfomance_cv.csv')

In [None]:
df_cv

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,pipeline,params
0,0.746498,0.729844,0.742484,0.750318,0.736504,0.753034,0.758034,0.735843,0.748102,0.735337,TC02_20news+all-MiniLM-L6-v2+LR,{'C': 0.1}


In [None]:

df_cv_explode = df_cv.melt(['pipeline','params'], var_name='Date', value_name='Value')

In [None]:
df_cv.filter(regex=("split*"))

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score
0,0.746498,0.729844,0.742484,0.750318,0.736504,0.753034,0.758034,0.735843,0.748102,0.735337
