In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
path = "/content/drive/MyDrive/Dataset/Folds"
def load_data(dataset, fold):
  uri_val = "{}/{}/{}/prob_train.csv".format(path, dataset, fold)
  uri_test = "{}/{}/{}/prob_test.csv".format(path, dataset, fold)
  uri_test_pred = "{}/{}/{}/pred_test.csv".format(path, dataset, fold)
  df_val   = pd.read_table(uri_val, sep=',')
  df_test = pd.read_table(uri_test, sep=',')
  df_test_pred = pd.read_table(uri_test_pred, sep=',')

  val, class_val = df_val.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'class']), df_val['class']
  test, class_test = df_test.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'class']), df_test_pred["class"]  
  return val, class_val, test, class_test


In [4]:
machines = ["SVM", "LR", "RF", "NB", "EXTRA", "KNN", "MLP", "CNN"]
features = ["CV", "TFIDF", "W2V", "GLOVE", "FAST"]
labels = ["0", "1", "2"]
clfs = {}

for machine in machines:
  clfs[machine] = []
  for feature in features:
    if machine == "CNN" and feature == "TFIDF":
      feature = "TF"
    for label in labels:
      clfs[machine].append("{}-{}-{}".format(machine, feature, label))


clfs_by_features = {}
for feature in features:
  clfs_by_features[feature] = []
  for machine in machines:
    for label in labels:
      if machine == "CNN" and feature == "TFIDF":
        clfs_by_features[feature].append("CNN-TF-{}".format(label))
      else:
        clfs_by_features[feature].append("{}-{}-{}".format(machine, feature, label))

In [5]:
folds = ["F1", "F2", "F3", "F4", "F5"]
pred = {
    "F1": {},
    "F2": {},
    "F3": {},
    "F4": {},
    "F5": {},
}
export = {}
for fold in folds:
  val, class_val, test, class_test = load_data("union", fold)  
  export.update({fold: {}})
  print("=============== {} ===============".format(fold))
  for machine, clf in clfs.items():
    cols = clfs[machine]
    val_vision = val[cols]    
    test_vision = test[cols]
    
    # Instancia de MLP
    lr = LogisticRegression(random_state=42,  max_iter=500)

    lr.fit(val_vision, class_val)
    y_pred = lr.predict(test_vision)
    f1 = f1_score(class_test, y_pred, average="macro")
    pred[fold].update({machine: f1})
    export[fold].update({machine: "%.3f" % f1})
    print("{}: ".format(machine), f1)

SVM:  0.8698544425214015
LR:  0.8779089009448219
RF:  0.8525287282904671
NB:  0.8269251760988084
EXTRA:  0.8346230750886522
KNN:  0.7903656030556677
MLP:  0.8416154820859495
CNN:  0.8687422445949404
SVM:  0.857642859711513
LR:  0.8722503883532436
RF:  0.8469614872234915
NB:  0.8210085981535399
EXTRA:  0.8271429480028835
KNN:  0.7846510358855451
MLP:  0.8350019955146082
CNN:  0.8588698809601699
SVM:  0.8630340285899539
LR:  0.8693516559334885
RF:  0.8523300389795244
NB:  0.8377484825923397
EXTRA:  0.8360090084169042
KNN:  0.7887674647635201
MLP:  0.8457145442108036
CNN:  0.8683048671936778
SVM:  0.8705828037270166
LR:  0.8842782598602086
RF:  0.8501207400933786
NB:  0.83483540430422
EXTRA:  0.843447919376627
KNN:  0.7753283895469725
MLP:  0.8475241840027117
CNN:  0.8740591575148183
SVM:  0.8671114110496981
LR:  0.8748547680181814
RF:  0.8532304363322848
NB:  0.8282644914379399
EXTRA:  0.820160948261436
KNN:  0.7619591265622443
MLP:  0.8483611303699851
CNN:  0.871158857351209


In [6]:
df = pd.DataFrame(data=export)
df.to_excel("machine.xlsx")

In [7]:
scores = {}
for machine in machines:
  scores.update({machine: []})
for fold, items in pred.items():
  for clf, score in items.items():
    scores[clf].append(score)

export = {}
for clf, score in scores.items():
  print("========= ", clf, "===========")
  print("Mean: ", np.mean(score))
  print("STD: ", np.std(score))
  export.update({clf: {
      "Mean": "%.3f" % np.mean(score),
      "STD": "%.3f" % np.std(score)
  }})


Mean:  0.8656451091199167
STD:  0.004797980704324851
Mean:  0.8757287946219888
STD:  0.005125960397142395
Mean:  0.8510342861838293
STD:  0.0022872545036751457
Mean:  0.8297564305173696
STD:  0.005940291675693465
Mean:  0.8322767798293006
STD:  0.007967165044892918
Mean:  0.7802143239627899
STD:  0.010516783583517238
Mean:  0.8436434672368115
STD:  0.004907003672001893
Mean:  0.8682270015229632
STD:  0.0051075838871915925


In [8]:
df = pd.DataFrame(data=export)
df.to_excel("mean-std-machine.xlsx")

# Grupo B

In [9]:
pred = {
    "F1": {},
    "F2": {},
    "F3": {},
    "F4": {},
    "F5": {},
}

In [10]:
folds = ["F1", "F2", "F3", "F4", "F5"]

export = {}
for fold in folds:
  export.update({fold: {}})
  val, class_val, test, class_test = load_data("union", fold)  
  print("=============== {} ===============".format(fold))
  for feature, clfs in clfs_by_features.items():
    # for clf in clfs:
    cols = clfs
    # # Instancia de MLP
    val_vision = val[cols]    
    test_vision = test[cols]

    lr = LogisticRegression(random_state=42,  max_iter=500)
    lr.fit(val_vision, class_val)
    y_pred = lr.predict(test_vision)
    f1 = f1_score(class_test, y_pred, average="macro")
    pred[fold].update({feature: f1})
    export[fold].update({feature: "%.3f" % f1})
    print(feature)
    print("Accuracy: ",accuracy_score(class_test, y_pred))
    print("F1: ", f1)
    # print("")


CV
Accuracy:  0.9016094242575079
F1:  0.8701599764610789
TFIDF
Accuracy:  0.9029367844698856
F1:  0.8705565147174653
W2V
Accuracy:  0.8599634975941596
F1:  0.7986524025876042
GLOVE
Accuracy:  0.8486809357889498
F1:  0.7638402956673825
FAST
Accuracy:  0.8405508544881367
F1:  0.7573319283910305
CV
Accuracy:  0.8949551941586459
F1:  0.8596425048386731
TFIDF
Accuracy:  0.8979422502489214
F1:  0.8624545589890031
W2V
Accuracy:  0.8549618320610687
F1:  0.7878722665168465
GLOVE
Accuracy:  0.8494855625622303
F1:  0.764277652437047
FAST
Accuracy:  0.8420179223365416
F1:  0.7557098159993476
CV
Accuracy:  0.8972784600066379
F1:  0.8642274525658472
TFIDF
Accuracy:  0.9029206770660472
F1:  0.8709364284787969
W2V
Accuracy:  0.861599734483903
F1:  0.797679540058028
GLOVE
Accuracy:  0.8571191503484898
F1:  0.780783357656158
FAST
Accuracy:  0.8340524394291404
F1:  0.7370773760286485
CV
Accuracy:  0.8994357782940591
F1:  0.8691338132595225
TFIDF
Accuracy:  0.9054098904746101
F1:  0.875683837986507
W2V
Ac

In [11]:
df = pd.DataFrame(data=export)
df.to_excel("feature.xlsx")

In [12]:
# print(pred)
scores = {}
for feature in features:
  scores.update({feature: []})
for fold, items in pred.items():
  for feature, score in items.items():
    scores[feature].append(score)


export = {}
for feature, score in scores.items():
  print("========= ", feature, "===========")
  print("Mean: ", np.mean(score))
  print("STD: ", np.std(score))
  export.update({feature: {
      "Mean": "%.3f" % np.mean(score),
      "STD": "%.3f" % np.std(score)
  }})

Mean:  0.8662496298607388
STD:  0.00386572156110815
Mean:  0.8702129468092064
STD:  0.004295443807453683
Mean:  0.7930830733486249
STD:  0.00832163749498815
Mean:  0.7734257860918967
STD:  0.0076893355884087855
Mean:  0.75305649562121
STD:  0.008293474757476271


In [13]:
df = pd.DataFrame(data=export)
df.to_excel("mean-std-feature.xlsx")

# Grupo C

In [22]:
pred = []
export = {}

In [23]:
folds = ["F1", "F2", "F3", "F4", "F5"]

for fold in folds:
  # export.update({fold: {}})
  val, class_val, test, class_test = load_data("union", fold)  
  print("=============== {} ===============".format(fold))
  val_vision = val
  test_vision = test

  lr = LogisticRegression(random_state=42,  max_iter=500)
  lr.fit(val_vision, class_val)
  y_pred = lr.predict(test_vision)
  f1 = f1_score(class_test, y_pred, average="macro")
  pred.append(f1)
  export[fold] = "%.3f" % f1
  print(" TODOS ")
  print("Accuracy: ",accuracy_score(class_test, y_pred))
  print("F1: ", f1)
  # print("")



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 TODOS 
Accuracy:  0.8996183839389414
F1:  0.8644760158267221
 TODOS 
Accuracy:  0.8898108197809492
F1:  0.8484439464897875
 TODOS 
Accuracy:  0.9004314636574843
F1:  0.8660961303054765
 TODOS 
Accuracy:  0.8942914039163624
F1:  0.8593472011250479
 TODOS 
Accuracy:  0.8873216063723863
F1:  0.8456321661526389


In [24]:
df = pd.DataFrame(data=export, index=[0])
df.to_excel("all.xlsx")

print("========= ", "TODOS", "===========")
print("Mean: ", np.mean(pred))
print("STD: ", np.std(pred))

Mean:  0.8567990919799346
STD:  0.008323119970122117


# CLUSTERS

In [27]:
def load_clfs(fold):  
  import random
  clfs_fold = []
  quantidade = random.randint(5, len(fold)-1)
  sorteados = 0
  indices = []
  while sorteados < quantidade:
    item = random.randint(0, len(fold)-1)
    if not item in indices:
      indices.append(item)
      sorteados += 1

  for i in indices:
    # print(i)
    # print(fold[i])
    clfs_fold.append(random.choices(fold[i].split(", "))[0])
  return quantidade, clfs_fold

F1 = [
    "RF-CV, RF-TFIDF",
    "SVM-GLOVE, MLP-GLOVE",
    "NB-CV, NB-TFIDF",
    "LR-W2V, SVM-W2V, MLP-W2V, MLP-FAST",
    "NB-GLOVE, LR-GLOVE, RF-GLOVE, EXTRA-GLOVE, KNN-GLOVE",
    "EXTRA-W2V, KNN-W2V, RF-W2V",
    "KNN-FAST, EXTRA-FAST, RF-FAST",
    "CNN-FAST, CNN-W2V, CNN-GLOVE",
    "EXTRA-CV, EXTRA-TFIDF",
    "NB-FAST, NB-W2V, KNN-TFIDF",
    "SVM-TFIDF, MLP-TFIDF, LR-CV, LR-TFIDF, SVM-CV, CNN-CV, CNN-TF, KNN-CV, MLP-CV",
    "SVM-FAST, LR-FAST"
]

F2 = [
    "MLP-GLOVE, LR-GLOVE, SVM-GLOVE",
    "KNN-TFIDF, NB-W2V, NB-FAST",
    "LR-W2V, SVM-W2V, MLP-W2V, MLP-FAST",
    "RF-FAST, KNN-FAST, EXTRA-FAST",
    "LR-FAST, SVM-FAST",
    "SVM-CV, LR-CV, SVM-TFIDF, MLP-TFIDF",
    "NB-CV, NB-TFIDF",
    "KNN-W2V, EXTRA-W2V, RF-W2V, CNN-FAST, CNN-W2V, CNN-GLOVE, KNN-CV",
    "EXTRA-GLOVE, RF-GLOVE, KNN-GLOVE, NB-GLOVE",
    "EXTRA-TFIDF, RF-TFIDF, RF-CV, EXTRA-CV",
    "MLP-CV, MLP-TFIDF",
    "CNN-CV, CNN-TF"
]

F3 = [
    "EXTRA-W2V, RF-W2V",
    "RF-GLOVE, EXTRA-GLOVE, KNN-GLOVE",
    "MLP-W2V, SVM-W2V, LR-W2V",
    "KNN-W2V, RF-FAST, EXTRA-FAST, KNN-FAST",
    "MLP-GLOVE, SVM-GLOVE, LR-GLOVE, NB-GLOVE",
    "CNN-GLOVE, CNN-W2V",
    "KNN-TFIDF, NB-W2V, NB-FAST",
    "NB-CV, NB-TFIDF",
    "LR-TFIDF, LR-CV, MLP-CV, MLP-TFIDF, SVM-CV, SVM-TFIDF",
    "CNN-TF, CNN-CV, EXTRA-TFIDF, EXTRA-CV, RF-TFIDF, RF-CV, CNN-FAST, KNN-CV",
    "LR-FAST, SVM-FAST, MLP-FAST"
]

F4 = [
    "EXTRA-W2V, RF-W2V, KNN-W2V",
    "KNN-TFIDF, NB-W2V, NB-FAST",
    "NB-TFIDF, NB-CV",
    "MLP-FAST, MLP-W2V",
    "NB-GLOVE, RF-GLOVE, CNN-GLOVE, EXTRA-GLOVE",
    "KNN-FAST, EXTRA-FAST, RF-FAST",
    "SVM-W2V, LR-W2V",
    "LR-CV, LR-TFIDF",
    "CNN-GLOVE, CNN-CV, CNN-W2V",
    "MLP-GLOVE, LR-GLOVE, SVM-GLOVE",
    "SVM-FAST, LR-FAST",
    "EXTRA-CV, RF-CV",
    "SVM-CV, SVM-TFIDF, MLP-CV, MLP-TFIDF",
    "CNN-TF, CNN-CV, CNN-FAST, RF-TFIDF, EXTRA-TFIDF"
]

F5 = [
    "RF-FAST, EXTRA-FAST, KNN-FAST",
    "EXTRA-W2V, RF-W2V, KNN-W2V",
    "LR-W2V, SVM-W2V",
    "MLP-W2V, MLP-FAST",
    "NB-GLOVE, LR-GLOVE, SVM-GLOVE, MLP-GLOVE, EXTRA-GLOVE, RF-GLOVE, KNN-GLOVE",
    "KNN-TFIDF, NB-W2V, NB-FAST",
    "EXTRA-CV, RF-CV",
    "LR-FAST, SVM-FAST",
    "CNN-W2V, CNN-FAST",
    "MLP-CV, MLP-TFIDF, SVM-CV, SVM-TFIDF, LR-CV, LR-TFIDF, CNN-CV",
    "CNN-GLOVE, KNN-CV, NB-TFIDF, NB-CV",
    "RF-TFIDF, EXTRA-TFIDF, CNN-TF"
]

# Grupo D

In [28]:
clusters = {"F1": F1, "F2": F2, "F3": F3, "F4": F4, "F5": F5}
results = {}
for fold, cluster in clusters.items():
  cont = 1
  results.update({fold: {"clfs": [], "f1": 0}})
  while cont < 300:
    qt, clfs = load_clfs(cluster)
    pred = []
    # folds = ["F1"]
    # fold="F1"

    val, class_val, test, class_test = load_data("union", fold)
    cols = []
    # columns 
    for clf in clfs:
      for i in range(0,3):
        cols.append("{}-{}".format(clf, i))
    
    val_vision = val[cols]
    test_vision = test[cols]
    # Treinamento
    lr = LogisticRegression(random_state=42,  max_iter=500)
    lr.fit(val_vision, class_val)
    y_pred = lr.predict(test_vision)
    f1 = f1_score(class_test, y_pred, average="macro")
    pred.append(f1)
    # print("=============== {} ===============".format(fold))

    if f1 > results[fold]["f1"]:
      results[fold].update({"clfs": clfs, "f1": f1})
    
    cont += 1

In [29]:
results

{'F1': {'clfs': ['MLP-W2V',
   'NB-TFIDF',
   'RF-TFIDF',
   'KNN-TFIDF',
   'SVM-CV',
   'KNN-W2V',
   'KNN-FAST',
   'CNN-W2V',
   'SVM-FAST'],
  'f1': 0.8715076282280257},
 'F2': {'clfs': ['MLP-TFIDF',
   'NB-CV',
   'SVM-CV',
   'CNN-TF',
   'KNN-TFIDF',
   'CNN-FAST',
   'EXTRA-TFIDF'],
  'f1': 0.8704047889407267},
 'F3': {'clfs': ['KNN-TFIDF',
   'RF-CV',
   'SVM-W2V',
   'LR-GLOVE',
   'NB-TFIDF',
   'KNN-FAST',
   'KNN-GLOVE',
   'SVM-CV'],
  'f1': 0.8748893910875429},
 'F4': {'clfs': ['SVM-TFIDF',
   'KNN-TFIDF',
   'CNN-W2V',
   'KNN-FAST',
   'SVM-FAST',
   'EXTRA-CV',
   'KNN-W2V',
   'NB-TFIDF',
   'SVM-GLOVE',
   'LR-W2V',
   'MLP-FAST'],
  'f1': 0.8764188290929202},
 'F5': {'clfs': ['CNN-W2V',
   'RF-TFIDF',
   'NB-W2V',
   'NB-TFIDF',
   'MLP-TFIDF',
   'MLP-W2V'],
  'f1': 0.875191856533438}}

In [30]:
df = pd.DataFrame(data=results)
df.to_excel("stacked.xlsx")

In [31]:
lista = []
for fold, result in results.items():
  lista.append(result["f1"])
print("========= ", "COMBINAÇÃO", "===========")
print("Mean: ", np.mean(lista))
print("STD: ", np.std(lista))

Mean:  0.8736824987765306
STD:  0.002310657792040599
