In [None]:
import pandas as pd
import numpy as np
import re
from time import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer,fbeta_score, average_precision_score

import matplotlib.pyplot as plt
import seaborn as sns

import nlp_utils as utils
from nlp_utils import get_vectorizer

# Let text columns render up to 100 characters per cell before truncation.
pd.set_option("display.max_colwidth", 100)

In [None]:
# Experiment configuration
text = "entities"  # column of the data frames that holds the text to match against
label = "SH"  # target label name (the cells visible here read the SH column directly)
class_names = ("Controls", "Self harm")  # display names passed to the evaluation helper

# Feature-extraction settings handed to get_vectorizer
vectorizer_mode = "select features"
params = dict(
    analyzer="word",
    ngram_range=(1, 1),
    use_idf=True,
    mode="select by pvalue",
    thresh=0.001,
)

# How many of the lowest-p-value features are kept as keywords
n_keywords = 20

In [None]:
# Read the training split, then show how many rows fall into each SH class.
df_train = pd.read_csv("../../data/rmh_train.csv")
df_train["SH"].value_counts()

# Binary classification (excl. SI)

In [None]:
# 10-fold stratified cross-validation of a keyword-matching baseline.
# In each fold the vectorizer is fitted on the training part, the n_keywords
# features with the smallest p-value become the keyword list, and a validation
# document is predicted positive when its text contains any of those keywords.
X = df_train[text]
y = df_train["SH"].values

cv = StratifiedKFold(n_splits=10)

vectorizer = get_vectorizer(vectorizer_mode, params)

scores = {"precision": [], "recall": [], "f1": [], "f2": [], "AP": []}

for train_index, val_index in cv.split(X, y):
    # cv.split yields positional indices, so rows are selected with .iloc;
    # label-based X[...] is only equivalent for a default RangeIndex.
    X_fit, X_val = X.iloc[train_index], X.iloc[val_index]
    y_fit, y_val = y[train_index], y[val_index]

    vectorizer.fit(X_fit, y_fit)
    SH_keywords = vectorizer.df_features.sort_values(by="p_value")[:n_keywords].feature.tolist()

    y_pred = np.zeros_like(y_val)
    # With no keywords nothing can match; an empty regex would match every row.
    if SH_keywords:
        # Escape each keyword so it is matched literally, not as regex syntax.
        pattern = "|".join(re.escape(word) for word in SH_keywords)
        # na=False: a missing text counts as "no keyword found".
        y_pred[X_val.str.contains(pattern, na=False).to_numpy()] = 1

    scores["precision"].append(precision_score(y_val, y_pred, average="binary"))
    scores["recall"].append(recall_score(y_val, y_pred, average="binary"))
    scores["f1"].append(f1_score(y_val, y_pred, average="binary"))
    scores["f2"].append(fbeta_score(y_val, y_pred, beta=2, average="binary"))
    # The hard 0/1 prediction is itself the positive-class score. A one-hot
    # built with get_dummies has only one column when every prediction in the
    # fold is the same class, so indexing its column 1 would fail.
    scores["AP"].append(average_precision_score(y_val, y_pred))

# Mean and two standard deviations of each metric across the folds.
for title, key in (
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1 score", "f1"),
    ("F2 score", "f2"),
    ("AP score", "AP"),
):
    print("Average %s: %0.3f (+/- %0.2f)" % (title, np.mean(scores[key]), np.std(scores[key]) * 2))

In [None]:
# Keywords selected in the last cross-validation fold (the loop variable
# persists after the loop has finished).
SH_keywords

In [None]:
# Refit a fresh vectorizer on the complete training set.
vectorizer = get_vectorizer(vectorizer_mode, params)
y_train = df_train["SH"].values
vectorizer.fit(df_train[text], y_train)

In [None]:
# Feature table exposed by the vectorizer after fitting; other cells read its
# "feature" and "p_value" columns.
vectorizer.df_features

In [None]:
# Final keyword list: the n_keywords features with the smallest p-value.
SH_keywords = (
    vectorizer.df_features
    .sort_values(by="p_value")
    .head(n_keywords)["feature"]
    .tolist()
)
SH_keywords

In [None]:
# Print each keyword as a quoted, comma-terminated line (ready to paste into a
# list literal). A plain loop avoids building a throwaway list of None values
# that the notebook would otherwise echo as the cell output.
for word in SH_keywords:
    print('"' + word + '",')

In [None]:
# First 1335 rows of the feature table, ordered by ascending p-value.
# NOTE(review): 1335 is an unexplained magic number — presumably the number of
# features of interest for this dataset; confirm and give it a named constant.
vectorizer.df_features.sort_values(by="p_value")[:1335]

In [None]:
# Apply the keyword baseline to the held-out test set: a document is predicted
# positive when its text contains any of the selected keywords.
df_test = pd.read_csv("../../data/rmh_test.csv")
y_test = df_test["SH"].values

y_pred = np.zeros_like(y_test)
# With no keywords nothing can match; an empty regex would match every row.
if SH_keywords:
    # Escape each keyword so it is matched literally, not as regex syntax.
    keyword_pattern = "|".join(re.escape(word) for word in SH_keywords)
    # na=False: a missing text counts as "no keyword found".
    y_pred[df_test[text].str.contains(keyword_pattern, na=False).to_numpy()] = 1

# minlength keeps a count for every class even when one is never predicted.
np.bincount(y_pred, minlength=len(class_names))

In [None]:
# Turn the hard 0/1 predictions into a two-column score matrix
# (column 0 = negative class, column 1 = positive class). Building it
# explicitly always yields both columns; pd.get_dummies collapses to a single
# column when every prediction is the same class, and its dtype varies with
# the pandas version.
y_proba = np.column_stack((1 - y_pred, y_pred)).astype(float)
utils.evaluate_model(y_test, y_proba, class_names, "test", digits=3, save_figures=False)