In [1]:
from ptic import pmi_tfidf_classifier as ptah
import numpy as np
from nltk.tokenize import word_tokenize
import pandas as pd
import nltk
from time import time
import string
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
np.random.seed(42)

[nltk_data] Downloading package stopwords to /home/aln/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/aln/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# DILI datasets

In [2]:
path = '../datasets/DILI_CAMDA_challenge/'

data_raw = pd.read_csv(path+"DILI_initial_set.csv")
data_raw["abs_title"] = data_raw["Abstract"].astype(str) + " " + data_raw["Title"].astype(str)

indices = np.random.permutation(data_raw.index)
data = data_raw.loc[indices]
data = data_raw.sample(frac=1)

idx = int(data.shape[0] * 0.1)
test_data = data.iloc[:idx]
train_data = data.iloc[idx:]
targets_train = train_data['Label'].values
targets_test = test_data['Label'].values

In [3]:
s1 = time()
tokenized_texts = ptah.tokenization(train_data, var_name="abs_title")
N = len(tokenized_texts)
word2text_count = ptah.get_word_stat(tokenized_texts)
words_pmis = ptah.create_pmi_dict(tokenized_texts, targets_train, min_count=5)
e1 = time()

s2 = time()
tokenized_test_texts = ptah.tokenization(test_data, var_name="abs_title")
results = ptah.classify_pmi_based(words_pmis, word2text_count, tokenized_test_texts, N)
e2 = time()

print('trainin time (min):', (e1 - s1) / 60)
print('testing time (min):', (e2 - s2) / 60)

trainin time (min): 0.3067026694615682
testing time (min): 0.03766229550043742


In [4]:
tn, fp, fn, tp = confusion_matrix(results, targets_test).ravel()
print('accuracy:', accuracy_score(results, targets_test))
print('precision:', precision_score(results, targets_test))
print('recall:', recall_score(results, targets_test))
print('f1_score:', f1_score(results, targets_test))
print('fp_rate:', fp / (fp + tn))
print('fn_rate:', fn / (fn + tp))

accuracy: 0.9401408450704225
precision: 0.9532085561497327
recall: 0.9344692005242464
f1_score: 0.9437458636664461
fp_rate: 0.0532724505327245
fn_rate: 0.0655307994757536


In [6]:
path = '../datasets/DILI_CAMDA_challenge/'
test_data_val_1 = pd.read_csv(path + "DILI_validation_set_1.tsv", sep='\t')
test_data_val_2 = pd.read_csv(path + "DILI_validation_set_2.tsv", sep='\t')
test_data_val_1["abs_title"] = test_data_val_1["Abstract"] + " " + test_data_val_1["Title"]

tokenized_test_texts_val_1 = ptah.tokenization(test_data_val_1, var_name="abs_title")
tokenized_test_texts_val_2 = ptah.tokenization(test_data_val_2, var_name="Abstract")

labels_model_val_1 = ptah.classify_pmi_based(words_pmis, word2text_count, tokenized_test_texts_val_1, N)
labels_model_val_2 = ptah.classify_pmi_based(words_pmis, word2text_count, tokenized_test_texts_val_2, N)

pd.DataFrame(labels_model_val_1).astype(int).to_csv('../submissions/PTIC/labels_ptic_val_1.csv', index=False, header=False)
pd.DataFrame(labels_model_val_2).astype(int).to_csv('../submissions/PTIC/labels_ptic_val_2.csv', index=False, header=False)

# DILI extended datasets

In [7]:
path = '../datasets/merged_additional_data_dili_cleaned.csv'

data_raw = pd.read_csv(path)
data_raw.fillna("")
data_raw["abs_title_anno"] = data_raw["abstract"].astype(str) + " " + data_raw["title"].astype(str) + " " + data_raw["tox_annotation"].astype(str)

data_subset = data_raw.groupby("label").sample(n=sum(data_raw["label"]), random_state=1)

indices = np.random.permutation(data_subset.index)
data = data_subset.loc[indices]
data = data_subset.sample(frac=1)

idx = int(data.shape[0] * 0.1)
test_data = data.iloc[:idx]
train_data = data.iloc[idx:]
targets_train = train_data['label'].values
targets_test = test_data['label'].values

In [8]:
s1 = time()
tokenized_texts = ptah.tokenization(train_data, var_name="abs_title_anno")
N = len(tokenized_texts)
word2text_count = ptah.get_word_stat(tokenized_texts)
words_pmis = ptah.create_pmi_dict(tokenized_texts, targets_train, min_count=5)
e1 = time()

s2 = time()
tokenized_test_texts = ptah.tokenization(test_data, var_name="abs_title_anno")
results = ptah.classify_pmi_based(words_pmis, word2text_count, tokenized_test_texts, N)
e2 = time()

print('trainin time (min):', (e1 - s1) / 60)
print('testing time (min):', (e2 - s2) / 60)

trainin time (min): 0.4597957690556844
testing time (min): 0.05903757413228353


In [9]:
tn, fp, fn, tp = confusion_matrix(results, targets_test).ravel()
print('accuracy:', accuracy_score(results, targets_test))
print('precision:', precision_score(results, targets_test))
print('recall:', recall_score(results, targets_test))
print('f1_score:', f1_score(results, targets_test))
print('fp_rate:', fp / (fp + tn))
print('fn_rate:', fn / (fn + tp))

accuracy: 0.9454123112659698
precision: 0.9704840613931524
recall: 0.9225589225589226
f1_score: 0.945914844649022
fp_rate: 0.030084235860409144
fn_rate: 0.07744107744107744


In [10]:
path = '../datasets/DILI_CAMDA_challenge/'
test_data_val_1 = pd.read_csv(path + "DILI_validation_set_1.tsv", sep='\t')
test_data_val_2 = pd.read_csv(path + "DILI_validation_set_2.tsv", sep='\t')
test_data_val_1["abs_title"] = test_data_val_1["Abstract"] + " " + test_data_val_1["Title"]

tokenized_test_texts_val_1 = ptah.tokenization(test_data_val_1, var_name="abs_title")
tokenized_test_texts_val_2 = ptah.tokenization(test_data_val_2, var_name="Abstract")

labels_model_val_1 = ptah.classify_pmi_based(words_pmis, word2text_count, tokenized_test_texts_val_1, N)
labels_model_val_2 = ptah.classify_pmi_based(words_pmis, word2text_count, tokenized_test_texts_val_2, N)

pd.DataFrame(labels_model_val_1).astype(int).to_csv('../submissions/PTIC/labels_ptic_extended_val_1.csv', index=False, header=False)
pd.DataFrame(labels_model_val_2).astype(int).to_csv('../submissions/PTIC/labels_ptic_extended_val_2.csv', index=False, header=False)