In [None]:
# Mount Google Drive at /content/drive (Google Colab only); the fold CSVs
# read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier

In [None]:
path = "/content/drive/MyDrive/Dataset/Folds"
def load_data(dataset, fold):
  """Read the stacking inputs of one dataset/fold.

  Three comma-separated files are read from ``<path>/<dataset>/<fold>/``:
  ``prob_train.csv``, ``prob_test.csv`` and ``pred_test.csv``.

  Args:
    dataset: name of the dataset sub-directory (e.g. "td").
    fold: name of the fold sub-directory (e.g. "F1").

  Returns:
    A 4-tuple ``(val, class_val, test, class_test)`` where
    * ``val`` is ``prob_train.csv`` without the two ``Unnamed`` index columns
      and without ``class``;
    * ``class_val`` is the ``class`` column of ``prob_train.csv``;
    * ``test`` is ``prob_test.csv`` with the same three columns removed;
    * ``class_test`` is the ``class`` column of ``pred_test.csv`` (note: it is
      taken from ``pred_test.csv``, not from ``prob_test.csv``).
  """
  fold_dir = "{}/{}/{}".format(path, dataset, fold)
  non_feature_cols = ['Unnamed: 0', 'Unnamed: 0.1', 'class']

  train_frame = pd.read_csv(fold_dir + "/prob_train.csv")
  test_frame = pd.read_csv(fold_dir + "/prob_test.csv")
  test_pred_frame = pd.read_csv(fold_dir + "/pred_test.csv")

  val = train_frame.drop(columns=non_feature_cols)
  class_val = train_frame['class']
  test = test_frame.drop(columns=non_feature_cols)
  class_test = test_pred_frame["class"]
  return val, class_val, test, class_test


In [None]:
machines = ["SVM", "LR", "RF", "NB", "EXTRA", "KNN", "MLP", "CNN"]
features = ["CV", "TFIDF", "W2V", "GLOVE", "FAST"]
labels = ["0", "1", "2"]


def _feature_tag(machine, feature):
  """Return the feature token used in column names.

  The CNN/TFIDF combination is written as "TF" in the column names; every
  other combination uses the feature name unchanged.
  """
  return "TF" if (machine == "CNN" and feature == "TFIDF") else feature


# machine -> its "<machine>-<feature>-<label>" columns, ordered by feature
# and then by label.
clfs = {
    machine: [
        "{}-{}-{}".format(machine, _feature_tag(machine, feature), label)
        for feature in features
        for label in labels
    ]
    for machine in machines
}

# feature -> its "<machine>-<feature>-<label>" columns, ordered by machine
# and then by label.
clfs_by_features = {
    feature: [
        "{}-{}-{}".format(machine, _feature_tag(machine, feature), label)
        for machine in machines
        for label in labels
    ]
    for feature in features
}

In [None]:
# Grupo A: one logistic-regression meta-classifier per base machine, trained
# on that machine's probability columns and scored with macro-F1 per fold.
folds = ["F1", "F2", "F3", "F4", "F5"]
pred = {fold_name: {} for fold_name in folds}   # fold -> {machine: macro-F1 as float}
export = {}                                     # fold -> {machine: macro-F1 as "%.3f" text}
for fold in folds:
  val, class_val, test, class_test = load_data("td", fold)
  export[fold] = {}
  print("=============== {} ===============".format(fold))
  for machine, cols in clfs.items():
    val_vision = val[cols]
    test_vision = test[cols]

    # Logistic-regression meta-classifier over this machine's columns.
    lr = LogisticRegression(random_state=42,  max_iter=500)
    lr.fit(val_vision, class_val)
    y_pred = lr.predict(test_vision)

    f1 = f1_score(class_test, y_pred, average="macro")
    pred[fold][machine] = f1
    export[fold][machine] = "%.3f" % f1
    print("{}: ".format(machine), f1)

SVM:  0.7296244716455611
LR:  0.7290244091214896
RF:  0.6485832289927996
NB:  0.6522998660032365
EXTRA:  0.6186032489858709
KNN:  0.6534997557628647
MLP:  0.7093871316511203
CNN:  0.7245991207894513
SVM:  0.7262150696023575
LR:  0.7115050257709115
RF:  0.6170293548345253
NB:  0.6537955473185327
EXTRA:  0.6010850814662466
KNN:  0.6527217906252477
MLP:  0.6930807879343851
CNN:  0.7349545062178487
SVM:  0.7344175999510707
LR:  0.7318882053826407
RF:  0.6357890909119894
NB:  0.6553635316863841
EXTRA:  0.6137210875695344
KNN:  0.664209499789965
MLP:  0.7032122197638152
CNN:  0.7303482657404627
SVM:  0.7319752742495874
LR:  0.726209957433479
RF:  0.6358754193922116
NB:  0.6532464356547852
EXTRA:  0.6048822189573566
KNN:  0.6626319861050958
MLP:  0.695852501180022
CNN:  0.717082880866193
SVM:  0.7049535022259864
LR:  0.7271843750300165
RF:  0.6372961229903985
NB:  0.6487401919969256
EXTRA:  0.6161702477514134
KNN:  0.6561134069904034
MLP:  0.68799035579216
CNN:  0.708018810762137


In [None]:
# Write the per-fold table to Excel: one column per fold, one row per machine.
# The cells hold the "%.3f"-formatted strings stored in `export`.
df = pd.DataFrame(export)
df.to_excel("machine.xlsx")

In [None]:
# Regroup the per-fold results as machine -> list of macro-F1 (one per fold).
scores = {machine: [] for machine in machines}
for fold_scores in pred.values():
  for clf, score in fold_scores.items():
    scores[clf].append(score)

# Mean and standard deviation across folds (np.std uses its default ddof=0).
export = {}
for clf, score in scores.items():
  mean_f1 = np.mean(score)
  std_f1 = np.std(score)
  print("========= ", clf, "===========")
  print("Mean: ", mean_f1)
  print("STD: ", std_f1)
  export[clf] = {
      "Mean": "%.3f" % mean_f1,
      "STD": "%.3f" % std_f1,
  }


Mean:  0.7254371835349126
STD:  0.010593528190570642
Mean:  0.7251623945477075
STD:  0.007097340388220048
Mean:  0.6349146434243849
STD:  0.010139753184135307
Mean:  0.6526891145319729
STD:  0.0022105569916129357
Mean:  0.6108923769460844
STD:  0.006747149254629535
Mean:  0.6578352878547153
STD:  0.00472331362597924
Mean:  0.6979045992643005
STD:  0.007560315751345792
Mean:  0.7230007168752185
STD:  0.009580320901296308


In [None]:
# Write the summary to Excel: one column per machine, rows "Mean" and "STD".
df = pd.DataFrame(export)
df.to_excel("mean-std-machine.xlsx")

# Grupo B

In [None]:
# Fresh per-fold result holder for Grupo B: fold -> {feature: macro-F1}.
pred = {fold_name: {} for fold_name in ("F1", "F2", "F3", "F4", "F5")}

In [None]:
# Grupo B: one logistic-regression meta-classifier per feature representation,
# trained on the probability columns of every machine for that feature.
folds = ["F1", "F2", "F3", "F4", "F5"]

# Reset both accumulators here so the cell gives the same result when re-run
# and never mixes in entries left by an earlier group.
pred = {fold_name: {} for fold_name in folds}   # fold -> {feature: macro-F1 as float}
export = {}                                     # fold -> {feature: macro-F1 as "%.3f" text}
for fold in folds:
  export[fold] = {}
  val, class_val, test, class_test = load_data("td", fold)
  print("=============== {} ===============".format(fold))
  # The loop variable is deliberately NOT called `clfs`: that name is the
  # machine -> columns dict used by Grupo A, and rebinding it to a list here
  # would break that cell on re-execution.
  for feature, feature_cols in clfs_by_features.items():
    val_vision = val[feature_cols]
    test_vision = test[feature_cols]

    lr = LogisticRegression(random_state=42,  max_iter=500)
    lr.fit(val_vision, class_val)
    y_pred = lr.predict(test_vision)
    f1 = f1_score(class_test, y_pred, average="macro")
    pred[fold][feature] = f1
    export[fold][feature] = "%.3f" % f1
    print(feature)
    print("Accuracy: ",accuracy_score(class_test, y_pred))
    print("F1: ", f1)
    # print("")


CV
Accuracy:  0.8987290700020173
F1:  0.7227303505687983
TFIDF
Accuracy:  0.9045793826911438
F1:  0.6792125591608359
W2V
Accuracy:  0.8658462779907202
F1:  0.5701840944940191
GLOVE
Accuracy:  0.8739156748033085
F1:  0.5770714052548453
FAST
Accuracy:  0.8668549525922937
F1:  0.5612827716893157
CV
Accuracy:  0.8838006858987291
F1:  0.6939828237172615
TFIDF
Accuracy:  0.8880371192253379
F1:  0.6483447526587639
W2V
Accuracy:  0.868065362114182
F1:  0.6034296857468732
GLOVE
Accuracy:  0.8563647367359289
F1:  0.5581734387274961
FAST
Accuracy:  0.8551543272140407
F1:  0.550212515565354
CV
Accuracy:  0.8965099858785556
F1:  0.7088913352095393
TFIDF
Accuracy:  0.8981238652410732
F1:  0.6601897928872614
W2V
Accuracy:  0.8721000605204761
F1:  0.5932003930638248
GLOVE
Accuracy:  0.8587855557797055
F1:  0.5565236914983233
FAST
Accuracy:  0.8547508573734114
F1:  0.5616978347829145
CV
Accuracy:  0.892453591606134
F1:  0.7047339339249817
TFIDF
Accuracy:  0.898910411622276
F1:  0.652519190235537
W2V
Ac

In [None]:
# Write the per-fold table to Excel: one column per fold, one row per feature.
df = pd.DataFrame(export)
df.to_excel("feature.xlsx")

In [None]:
# Regroup the per-fold results as feature -> list of macro-F1 (one per fold).
scores = {feature: [] for feature in features}
for fold_scores in pred.values():
  for feature, score in fold_scores.items():
    scores[feature].append(score)

# Mean and standard deviation across folds (np.std uses its default ddof=0).
export = {}
for feature, score in scores.items():
  mean_f1 = np.mean(score)
  std_f1 = np.std(score)
  print("========= ", feature, "===========")
  print("Mean: ", mean_f1)
  print("STD: ", std_f1)
  export[feature] = {
      "Mean": "%.3f" % mean_f1,
      "STD": "%.3f" % std_f1,
  }

Mean:  0.7042577282872433
STD:  0.011363088354217434
Mean:  0.6630996596913865
STD:  0.012206066001959153
Mean:  0.5767471618530481
STD:  0.020010421958157786
Mean:  0.5605150760567017
STD:  0.008361174077298073
Mean:  0.556592048400458
STD:  0.00504759854066605


In [None]:
# Write the summary to Excel: one column per feature, rows "Mean" and "STD".
df = pd.DataFrame(export)
df.to_excel("mean-std-feature.xlsx")

# Grupo C

In [None]:
# Empty accumulators for Grupo C: a list of macro-F1 values and a
# fold -> formatted-score mapping.
pred, export = [], {}

In [None]:
# Grupo C: a single logistic-regression meta-classifier trained on ALL
# probability columns of each fold.
folds = ["F1", "F2", "F3", "F4", "F5"]

# Reset the accumulators in this cell so re-running it does not append a
# second set of scores to `pred`.
pred = []      # macro-F1 per fold, in `folds` order
export = {}    # fold -> macro-F1 as "%.3f" text

# The recorded run with max_iter=500 emitted ConvergenceWarning for some folds
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so those scores came from a
# solver that had not converged. Give it a larger iteration budget here.
MAX_ITER_ALL_COLUMNS = 2000

for fold in folds:
  val, class_val, test, class_test = load_data("td", fold)
  print("=============== {} ===============".format(fold))

  lr = LogisticRegression(random_state=42, max_iter=MAX_ITER_ALL_COLUMNS)
  lr.fit(val, class_val)
  y_pred = lr.predict(test)
  f1 = f1_score(class_test, y_pred, average="macro")
  pred.append(f1)
  export[fold] = "%.3f" % f1
  print(" TODOS ")
  print("Accuracy: ",accuracy_score(class_test, y_pred))
  print("F1: ", f1)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 TODOS 
Accuracy:  0.9013516239661086
F1:  0.6767032782517434
 TODOS 
Accuracy:  0.8894492636675408
F1:  0.6509460887239514
 TODOS 
Accuracy:  0.898325600161388
F1:  0.6684922030956552


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 TODOS 
Accuracy:  0.8904358353510896
F1:  0.6417462050130068
 TODOS 
Accuracy:  0.8847861178369653
F1:  0.6379230292255741


In [None]:
# Write the Grupo C scores to Excel as a single row (index 0), one column per fold.
df = pd.DataFrame(export, index=[0])
df.to_excel("all.xlsx")

# CLUSTERS

In [None]:
def load_clfs(fold, min_clusters=5):
  """Draw a random set of classifiers, at most one per cluster.

  A random number of clusters between ``min_clusters`` and ``len(fold) - 1``
  (both inclusive) is chosen without replacement, and one member name is then
  picked uniformly at random from each chosen cluster.

  Args:
    fold: list of clusters; each cluster is a string of classifier names
      separated by ", " (for example "KNN-TFIDF, KNN-CV").
    min_clusters: smallest number of clusters to draw from (default 5).

  Returns:
    ``(quantidade, clfs_fold)``: the number of clusters drawn and the list of
    selected classifier names (one per drawn cluster, so its length equals
    ``quantidade``).

  Raises:
    ValueError: if ``len(fold) - 1`` is smaller than ``min_clusters``, i.e.
      there are not enough clusters to draw from.
  """
  max_clusters = len(fold) - 1   # the full set of clusters is never drawn
  if max_clusters < min_clusters:
    raise ValueError(
        "need at least {} clusters, got {}".format(min_clusters + 1, len(fold)))

  quantidade = random.randint(min_clusters, max_clusters)
  # Sampling is by list position, so two identical cluster strings would still
  # count as two distinct clusters.
  chosen_clusters = random.sample(fold, quantidade)
  clfs_fold = [random.choice(cluster.split(", ")) for cluster in chosen_clusters]
  return quantidade, clfs_fold

# Classifier clusters for fold F1. Each string is one cluster; its members are
# "<machine>-<feature>" names separated by ", ".
F1 = [
    "KNN-TFIDF, KNN-CV",
    "MLP-FAST, MLP-W2V",
    "MLP-CV, SVM-CV",
    "NB-FAST, NB-W2V",
    "CNN-W2V, CNN-TF, SVM-TFIDF, LR-TFIDF, LR-CV, CNN-FAST, CNN-CV",
    "SVM-FAST, LR-FAST, LR-W2V, SVM-W2V",
    "NB-TFIDF, NB-CV, CNN-GLOVE",
    "KNN-GLOVE, LR-GLOVE, NB-GLOVE, SVM-GLOVE, MLP-GLOVE, RF-GLOVE, EXTRA-GLOVE",
    "MLP-TFIDF, EXTRA-TFIDF, RF-TFIDF",
    "EXTRA-CV, RF-CV",
    # This cluster previously listed "LR-W2V", which already belongs to the
    # SVM-FAST/LR-FAST cluster above, while "EXTRA-W2V" appeared nowhere in
    # F1. Replaced so that each of the 40 classifiers occurs exactly once.
    # NOTE(review): confirm against the original clustering output.
    "KNN-FAST, EXTRA-FAST, KNN-W2V, EXTRA-W2V, RF-W2V, RF-FAST"
]

# Classifier clusters for fold F2. Each string is one cluster; its members are
# "<machine>-<feature>" names separated by ", ".
F2 = [
    "NB-W2V, NB-FAST",
    "LR-FAST, SVM-FAST",
    "NB-CV, NB-TFIDF",
    "CNN-GLOVE, CNN-FAST, CNN-W2V",
    "MLP-GLOVE, EXTRA-GLOVE, SVM-GLOVE, RF-GLOVE, LR-GLOVE, NB-GLOVE, KNN-GLOVE",
    "EXTRA-TFIDF, RF-TFIDF",
    "MLP-W2V, MLP-FAST",
    "LR-W2V, SVM-W2V",
    "KNN-FAST, EXTRA-FAST, RF-FAST, RF-W2V, KNN-W2V, EXTRA-W2V, KNN-TFIDF, KNN-CV",
    "MLP-TFIDF, SVM-CV, LR-CV, MLP-CV, LR-TFIDF, SVM-TFIDF, CNN-TF, CNN-CV",
    "EXTRA-CV, RF-CV"
]

# Classifier clusters for fold F3. Each string is one cluster; its members are
# "<machine>-<feature>" names separated by ", ".
F3 = [
    "NB-W2V, NB-FAST",
    "LR-FAST, LR-W2V, SVM-FAST, SVM-W2V",
    "LR-GLOVE, EXTRA-GLOVE, KNN-GLOVE, RF-GLOVE",
    "MLP-W2V, MLP-FAST",
    "KNN-FAST, EXTRA-FAST, RF-FAST, EXTRA-W2V, KNN-W2V, RF-W2V",
    "SVM-TFIDF, SVM-CV, MLP-CV, MLP-TFIDF",
    "LR-CV, LR-TFIDF, CNN-CV",
    "CNN-GLOVE, CNN-TF",
    "KNN-CV, KNN-TFIDF",
    "CNN-FAST, CNN-W2V, RF-TFIDF, EXTRA-TFIDF",
    "NB-CV, NB-TFIDF",
    "SVM-GLOVE, MLP-GLOVE, NB-GLOVE",
    "EXTRA-CV, RF-CV"
]

# Classifier clusters for fold F4. Each string is one cluster; its members are
# "<machine>-<feature>" names separated by ", ".
F4 = [
    "CNN-W2V, RF-TFIDF, CNN-GLOVE, EXTRA-TFIDF",
    "NB-CV, NB-TFIDF, KNN-CV, KNN-TFIDF",
    "LR-W2V, SVM-W2V, MLP-W2V",
    "NB-W2V, NB-FAST",
    "KNN-FAST, EXTRA-FAST, RF-FAST, KNN-W2V",
    "EXTRA-W2V, RF-W2V",
    "CNN-TF, SVM-TFIDF, MLP-TFIDF, SVM-CV, LR-CV, MLP-CV, LR-TFIDF, CNN-CV, CNN-FAST",
    "LR-FAST, SVM-FAST, MLP-FAST",
    "NB-GLOVE, MLP-GLOVE, SVM-GLOVE, LR-GLOVE, RF-GLOVE, EXTRA-GLOVE, KNN-GLOVE",
    "RF-CV, EXTRA-CV"
]

# Classifier clusters for fold F5. Each string is one cluster; its members are
# "<machine>-<feature>" names separated by ", ".
F5 = [
    "NB-W2V, NB-FAST",
    "CNN-FAST, CNN-W2V",
    # NOTE(review): the next line holds three separately quoted names, so they
    # are three single-member clusters rather than one three-member cluster.
    # Every other cluster in these lists is one string per line; confirm this
    # was not meant to be "CNN-TF, RF-TFIDF, EXTRA-TFIDF".
    "CNN-TF", "RF-TFIDF", "EXTRA-TFIDF",
    "SVM-CV, LR-CV, LR-TFIDF, CNN-CV, SVM-TFIDF, MLP-CV, MLP-TFIDF",
    "EXTRA-FAST, EXTRA-W2V, KNN-FAST",
    "RF-CV, EXTRA-CV",
    "RF-W2V, RF-FAST, KNN-W2V",
    "KNN-CV, KNN-TFIDF, CNN-GLOVE, MLP-FAST, MLP-W2V, SVM-W2V, SVM-FAST, LR-FAST, LR-W2V",
    "KNN-GLOVE, LR-GLOVE, NB-GLOVE, EXTRA-GLOVE, SVM-GLOVE, MLP-GLOVE, RF-GLOVE",
    "NB-TFIDF, NB-CV"
]


# Grupo D

In [None]:
# Grupo D: random search over classifier subsets (at most one classifier per
# cluster). For every fold, the subset whose logistic-regression stack reaches
# the highest macro-F1 is kept in `results`.
#
# NOTE(review): the winning subset is selected by its F1 on the same test
# split that is then reported, so the reported score is optimistically biased.
# Selecting on a held-out validation split would remove that bias.
SEARCH_SEED = 42
N_TRIALS = 299   # same count as the original loop (cont = 1 while cont < 300)

random.seed(SEARCH_SEED)   # make the random search repeatable across runs

clusters = {"F1": F1, "F2": F2, "F3": F3, "F4": F4, "F5": F5}
results = {}
for fold, cluster in clusters.items():
  # The fold's data does not change between trials, so load it once per fold
  # instead of re-reading three CSV files on every trial.
  val, class_val, test, class_test = load_data("td", fold)

  best = {"clfs": [], "f1": 0}
  results[fold] = best
  for _ in range(N_TRIALS):
    # `sampled_clfs` rather than `clfs`, so the machine -> columns dict used
    # by Grupo A is not overwritten.
    _, sampled_clfs = load_clfs(cluster)

    # Each classifier contributes three probability columns: "<clf>-0" .. "<clf>-2".
    cols = ["{}-{}".format(clf, i) for clf in sampled_clfs for i in range(3)]

    lr = LogisticRegression(random_state=42,  max_iter=500)
    lr.fit(val[cols], class_val)
    y_pred = lr.predict(test[cols])
    f1 = f1_score(class_test, y_pred, average="macro")

    if f1 > best["f1"]:
      best.update({"clfs": sampled_clfs, "f1": f1})

In [None]:
# Display, per fold, the classifier subset with the highest macro-F1 and that score.
results

{'F1': {'clfs': ['MLP-GLOVE',
   'KNN-CV',
   'MLP-FAST',
   'SVM-CV',
   'CNN-FAST',
   'NB-W2V',
   'LR-W2V'],
  'f1': 0.7369723981637569},
 'F2': {'clfs': ['CNN-FAST', 'SVM-TFIDF', 'KNN-FAST', 'LR-FAST', 'NB-CV'],
  'f1': 0.7356500433142772},
 'F3': {'clfs': ['MLP-FAST', 'NB-CV', 'SVM-CV', 'SVM-FAST', 'NB-W2V'],
  'f1': 0.7396232752452147},
 'F4': {'clfs': ['NB-GLOVE', 'KNN-CV', 'NB-W2V', 'LR-W2V', 'SVM-CV'],
  'f1': 0.7383267574750954},
 'F5': {'clfs': ['CNN-CV',
   'LR-W2V',
   'NB-FAST',
   'CNN-FAST',
   'NB-TFIDF',
   'CNN-TF'],
  'f1': 0.727017614677011}}

In [None]:
# Write the search results to Excel: one column per fold, rows "clfs" and "f1".
df = pd.DataFrame(results)
df.to_excel("stacked.xlsx")

In [None]:
# Best macro-F1 of each fold, then mean and standard deviation across folds
# (np.std uses its default ddof=0).
lista = [fold_result["f1"] for fold_result in results.values()]
print("========= ", "Combinados", "===========")
print("Mean: ", np.mean(lista))
print("STD: ", np.std(lista))

Mean:  0.735518017775071
STD:  0.004452677025928748
