In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Mount Google Drive into the Colab filesystem. The CSV paths used later in
# this notebook all live under /content/drive/MyDrive, so this cell has to run
# before any data is loaded.
# NOTE(review): this only works inside Google Colab; the import is kept in
# this cell (rather than the top import cell) presumably for that reason.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def load_data(dataset_name, fold=None, base_dir='/content/drive/MyDrive/Dataset/Folds'):
  """Load the classifier predictions and gold labels for one dataset split.

  Args:
    dataset_name: "hateval" reads the fixed HatEval files
      (`hateval/pred_test_hateval.csv`, `hateval/test_hate.csv`). Any other
      name is treated as a sub-directory of `base_dir` that is organised by
      fold (`<dataset_name>/<fold>/pred_test.csv` and `.../test.csv`).
    fold: Name of the fold sub-directory (the notebook uses "F1".."F5").
      Required for every dataset except "hateval", where it is ignored.
    base_dir: Root directory containing one sub-directory per dataset.
      Defaults to the Google Drive location used by this notebook.

  Returns:
    A `(df_pred, label)` tuple: the predictions DataFrame and the label
    Series read from the test file. The label column is "HS" for hateval,
    "norm" for "zw" and "class" for every other dataset.

  Raises:
    ValueError: if `fold` is None for a fold-based dataset. Without this
      check the path would silently contain the literal text "None".
  """
  if dataset_name == "hateval":
    uri_pred = '{}/hateval/pred_test_hateval.csv'.format(base_dir)
    uri_test = '{}/hateval/test_hate.csv'.format(base_dir)
    df_pred = pd.read_csv(uri_pred)
    df_test = pd.read_csv(uri_test)
    return df_pred, df_test["HS"]

  if fold is None:
    raise ValueError(
        "fold is required for dataset {!r} (only 'hateval' has no folds)".format(dataset_name))

  uri_pred = '{}/{}/{}/pred_test.csv'.format(base_dir, dataset_name, fold)
  uri_test = '{}/{}/{}/test.csv'.format(base_dir, dataset_name, fold)
  # The prediction files carry leftover index columns from earlier to_csv
  # round-trips; drop them when present instead of failing when they are not.
  df_pred = pd.read_csv(uri_pred).drop(
      columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
  df_test = pd.read_csv(uri_test)
  label_column = "norm" if dataset_name == "zw" else "class"
  return df_pred, df_test[label_column]

In [4]:
# Macro-F1 of every "<classifier>-<feature>" prediction column, for each
# dataset and fold. Resulting layout:
#   results[dataset][fold][feature][classifier] -> score as a "%.3f" string
results = {}
features = ["CV", "TFIDF", "W2V", "GLOVE", "FAST"]
clfs = ["SVM", "LR", "RF", "NB", "MLP", "EXTRA", "KNN", "CNN"]

for db in ("td", "zw", "union"):
  results[db] = {}
  for fold in ("F1", "F2", "F3", "F4", "F5"):
    results[db][fold] = {}
    pred, y_true = load_data(db, fold)
    for feature in features:
      results[db][fold][feature] = {}
      for clf in clfs:
        # NOTE(review): an earlier revision had a disabled special case that
        # renamed the feature to "TF" for CNN + TFIDF; confirm the column is
        # really called "CNN-TFIDF" in every prediction file.
        col = "{}-{}".format(clf, feature)
        f1 = f1_score(y_true, pred[col], average="macro")
        results[db][fold][feature][clf] = "%.3f" % f1

In [5]:
# Write one spreadsheet per dataset/fold, named "<db>-<fold>-monolitico.xlsx".
# Each sheet is built from the feature -> {classifier: score} mapping, so
# features become columns and classifiers become rows.
for db, folds in results.items():
  for fold, data in folds.items():
    df = pd.DataFrame(data)
    df.to_excel("{}-{}-monolitico.xlsx".format(db, fold))

In [6]:
# Same macro-F1 table as above, but for the single HatEval split.
# `results` is rebound here to a flat layout:
#   results[feature][classifier] -> score as a "%.3f" string
results = {}
features = ["CV", "TFIDF", "W2V", "GLOVE", "FAST"]
clfs = ["SVM", "LR", "RF", "NB", "MLP", "EXTRA", "KNN", "CNN"]

pred, y_true = load_data("hateval")
for feature in features:
  results[feature] = {}
  for clf in clfs:
    col = "{}-{}".format(clf, feature)
    f1 = f1_score(y_true, pred[col], average="macro")
    results[feature][clf] = "%.3f" % f1

In [7]:
# Export the HatEval scores: features as columns, classifiers as rows.
df = pd.DataFrame(results)
df.to_excel("hateval-monolitico.xlsx")