In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [39]:
path = "/content/drive/MyDrive/Dataset/Folds/hateval/"
def load_data():
  uri_val = "{}/prob_train_hateval.csv".format(path)
  uri_test = "{}/prob_test_hateval.csv".format(path)
  uri_test_pred = "{}/test_hate.csv".format(path)
  df_val   = pd.read_table(uri_val, sep=',')
  df_test = pd.read_table(uri_test, sep=',')
  df_test_pred = pd.read_table(uri_test_pred, sep=',')
  val, class_val = df_val.drop(columns=['Unnamed: 0', 'HS']), df_val['HS']
  test, class_test = df_test.drop(columns=['Unnamed: 0', 'HS']), df_test_pred["HS"]  
  return val, class_val, test, class_test


In [40]:
machines = ["SVM", "LR", "RF", "NB", "EXTRA", "KNN", "MLP", "CNN"]
features = ["CV", "TFIDF", "W2V", "GLOVE", "FAST"]
labels = ["0", "1"]
clfs = {}

for machine in machines:
  clfs[machine] = []
  for feature in features:
    if machine == "CNN" and feature == "TFIDF":
      feature = "TF"
    for label in labels:
      clfs[machine].append("{}-{}-{}".format(machine, feature, label))


clfs_by_features = {}
for feature in features:
  clfs_by_features[feature] = []
  for machine in machines:
    for label in labels:
      if machine == "CNN" and feature == "TFIDF":
        clfs_by_features[feature].append("CNN-TF-{}".format(label))
      else:
        clfs_by_features[feature].append("{}-{}-{}".format(machine, feature, label))

In [41]:
export = {}
pred = {}
val, class_val, test, class_test = load_data()  
for machine, clf in clfs.items():
  # print(machine, clf)
  cols = clfs[machine]
  val_vision = val[cols]    
  test_vision = test[cols]
  
  # Instancia de MLP
  lr = LogisticRegression(random_state=42,  max_iter=500)

  lr.fit(val_vision, class_val)
  y_pred = lr.predict(test_vision)
  f1 = f1_score(class_test, y_pred, average="macro")
  pred.update({machine: f1})
  export.update({machine: "%.3f" % f1})
  print("{}: ".format(machine), f1)

SVM:  0.49764979799071607
LR:  0.4821706446708184
RF:  0.3881777108433735
NB:  0.5014041800930898
EXTRA:  0.3918906744558047
KNN:  0.547193282399344
MLP:  0.48273136151514284
CNN:  0.479124967845027


In [42]:
df = pd.DataFrame(data=export, index=[0])
df.to_excel("machine.xlsx")

In [43]:
pred

{'CNN': 0.479124967845027,
 'EXTRA': 0.3918906744558047,
 'KNN': 0.547193282399344,
 'LR': 0.4821706446708184,
 'MLP': 0.48273136151514284,
 'NB': 0.5014041800930898,
 'RF': 0.3881777108433735,
 'SVM': 0.49764979799071607}

In [44]:
scores = {}
for machine in machines:
  scores.update({machine: []})

# pred.items()
# for fold, items in pred.items():
for clf, score in pred.items():
  scores[clf].append(score)

print(scores)
export = {}
for clf, score in scores.items():
  print("========= ", clf, "===========")
  print("Mean: ", np.mean(score))
  print("STD: ", np.std(score))
  export.update({clf: {
      "Mean": "%.3f" % np.mean(score),
      "STD": "%.3f" % np.std(score)
  }})


{'SVM': [0.49764979799071607], 'LR': [0.4821706446708184], 'RF': [0.3881777108433735], 'NB': [0.5014041800930898], 'EXTRA': [0.3918906744558047], 'KNN': [0.547193282399344], 'MLP': [0.48273136151514284], 'CNN': [0.479124967845027]}
Mean:  0.49764979799071607
STD:  0.0
Mean:  0.4821706446708184
STD:  0.0
Mean:  0.3881777108433735
STD:  0.0
Mean:  0.5014041800930898
STD:  0.0
Mean:  0.3918906744558047
STD:  0.0
Mean:  0.547193282399344
STD:  0.0
Mean:  0.48273136151514284
STD:  0.0
Mean:  0.479124967845027
STD:  0.0


In [45]:
df = pd.DataFrame(data=export)
df.to_excel("mean-std-machine.xlsx")

# Grupo B

In [46]:
pred={}
export = {}
val, class_val, test, class_test = load_data()  
for feature, clfs in clfs_by_features.items():
  cols = clfs
  val_vision = val[cols]    
  test_vision = test[cols]

  lr = LogisticRegression(random_state=42,  max_iter=500)
  lr.fit(val_vision, class_val)
  y_pred = lr.predict(test_vision)
  f1 = f1_score(class_test, y_pred, average="macro")
  pred.update({feature: f1})
  export.update({feature: "%.3f" % f1})
  print(feature)
  print("Accuracy: ",accuracy_score(class_test, y_pred))
  print("F1: ", f1)
  # print("")


CV
Accuracy:  0.44733333333333336
F1:  0.364340498794624
TFIDF
Accuracy:  0.47333333333333333
F1:  0.4101305229329804
W2V
Accuracy:  0.5493333333333333
F1:  0.5348143730430893
GLOVE
Accuracy:  0.5806666666666667
F1:  0.579949032925585
FAST
Accuracy:  0.5516666666666666
F1:  0.5377243826844729


In [47]:
df = pd.DataFrame(data=export, index=[0])
df.to_excel("feature.xlsx")

In [48]:
# print(pred)
scores = {}
# print(features)
for feature in features:
  scores.update({feature: []})


for feature, score in pred.items():
  scores[feature].append(score)

export = {}
for feature, score in scores.items():
  print("========= ", feature, "===========")
  print("Mean: ", np.mean(score))
  print("STD: ", np.std(score))
  export.update({feature: {
      "Mean": "%.3f" % np.mean(score),
      "STD": "%.3f" % np.std(score)
  }})

Mean:  0.364340498794624
STD:  0.0
Mean:  0.4101305229329804
STD:  0.0
Mean:  0.5348143730430893
STD:  0.0
Mean:  0.579949032925585
STD:  0.0
Mean:  0.5377243826844729
STD:  0.0


In [49]:
df = pd.DataFrame(data=export)
df.to_excel("mean-std-feature.xlsx")

# Grupo C

In [50]:
pred = []
export = {}

In [51]:
val, class_val, test, class_test = load_data()  
val_vision = val
test_vision = test

lr = LogisticRegression(random_state=42,  max_iter=500)
lr.fit(val_vision, class_val)
y_pred = lr.predict(test_vision)
f1 = f1_score(class_test, y_pred, average="macro")
pred.append(f1)
export = "%.3f" % f1
print(" TODOS ")
print("F1: ", f1)
# print("")

 TODOS 
F1:  0.41030172708704327


In [52]:
# df = pd.DataFrame(data=export, index=[0])
# df.to_excel("all.xlsx")

print("========= ", "TODOS", "===========")
print("Mean: ", np.mean(pred))
print("STD: ", np.std(pred))

Mean:  0.41030172708704327
STD:  0.0


# CLUSTERS

In [53]:
def load_clfs(fold):  
  import random
  clfs_fold = []
  quantidade = random.randint(5, len(fold)-1)
  sorteados = 0
  indices = []
  while sorteados < quantidade:
    item = random.randint(0, len(fold)-1)
    if not item in indices:
      indices.append(item)
      sorteados += 1

  for i in indices:
    # print(i)
    # print(fold[i])
    clfs_fold.append(random.choices(fold[i].split(", "))[0])
  return quantidade, clfs_fold

F = [
  "NB-W2V, NB-FAST, NB-GLOVE",
  "EXTRA-W2V, RF-W2V, RF-FAST, CNN-GLOVE",
  "NB-CV, NB-TFIDF",
  "EXTRA-CV, RF-CV",
  "LR-CV, LR-TFIDF",
  "EXTRA-TFIDF, RF-TFIDF, KNN-CV",
  "LR-FAST, SVM-FAST, EXTRA-FAST",
  "CNN-W2V, CNN-CV, CNN-TF, CNN-FAST",
  "LR-W2V, KNN-FAST, SVM-W2V, MLP-W2V, MLP-FAST, KNN-W2V",
  "MLP-CV, SVM-CV, MLP-TFIDF, SVM-TFIDF",
  "KNN-TFIDF, KNN-GLOVE, MLP-GLOVE, SVM-GLOVE, LR-GLOVE, EXTRA-GLOVE, RF-GLOVE"
]

# Grupo D

In [54]:
results = {}

cont = 1
results.update({"clfs": [], "f1": 0})
while cont < 300:
  qt, clfs = load_clfs(F)
  pred = []
  
  val, class_val, test, class_test = load_data()
  cols = []
  # columns 
  for clf in clfs:
    for i in range(0,2):
      cols.append("{}-{}".format(clf, i))
  
  val_vision = val[cols]
  test_vision = test[cols]
  # # Treinamento
  lr = LogisticRegression(random_state=42,  max_iter=500)
  lr.fit(val_vision, class_val)
  y_pred = lr.predict(test_vision)
  f1 = f1_score(class_test, y_pred, average="macro")
  pred.append(f1)
  # print("=============== {} ===============".format(fold))
  if f1 > results["f1"]:
    results.update({"clfs": clfs, "f1": f1})      
  
  cont += 1

In [55]:
results

{'clfs': ['RF-W2V', 'LR-TFIDF', 'KNN-GLOVE', 'NB-W2V', 'MLP-W2V'],
 'f1': 0.5715235373149818}

In [56]:
# lista = []

# for fold, r in results.items():
  # print(r)
#   lista.append(r["f1"])
# print("========= ", "Combinados", "===========")
# print("Mean: ", np.mean(lista))
# print("STD: ", np.std(lista))