In [75]:
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
from nltk.corpus import stopwords
import string
import numpy as np
from collections import defaultdict, Counter
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

nltk.download('punkt')
nltk.download('stopwords')
np.random.seed(42)

[nltk_data] Downloading package punkt to /home/aln/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/aln/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# IMDB dataset

In [39]:
path = "../datasets/"

# Data is taken from here
# https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
data_raw = pd.read_csv(path + 'IMDB_Dataset.csv')
indices = np.random.permutation(data_raw.index)
data = data_raw.loc[indices]
data = data_raw.sample(frac=1)
data = data.replace(to_replace=['negative', 'positive'], value=[0, 1])

idx = int(data.shape[0] * 0.1)
test_data = data.iloc[:idx]
train_data = data.iloc[idx:]
targets_train = train_data["sentiment"].values
targets_test = test_data["sentiment"].values

tokenized_texts = tokenization(train_data, 'review')
tokenized_test_texts = tokenization(test_data, 'review')

In [40]:
model = NaiveBayes()
model.train(tokenized_texts, targets_train)
labels_model = model.predict(tokenized_test_texts)

In [41]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
print('accuracy:', accuracy_score(labels_model, targets_test))
print('precision:', precision_score(labels_model, targets_test))
print('recall:', recall_score(labels_model, targets_test))
print('f1_score:', f1_score(labels_model, targets_test))

accuracy: 0.853
precision: 0.8459081836327346
recall: 0.8585899513776337
f1_score: 0.8522018902071185


# DILI datasets

In [76]:
path = '../datasets/DILI_CAMDA_challenge/'
data_raw = pd.read_csv(path + "DILI_initial_set.csv")
data_raw["abs_title"] = data_raw["Abstract"] + " " + data_raw["Title"]
indices = np.random.permutation(data_raw.index)
data = data_raw.loc[indices]
data = data_raw.sample(frac=1)
idx = int(data.shape[0] * 0.1)
test_data = data.iloc[:idx]
train_data = data.iloc[idx:]
targets_train = train_data['Label'].values
targets_test = test_data['Label'].values

In [77]:
stop_words = set(nltk.corpus.stopwords.words('english') + list(string.punctuation))

def tokenize(s):
    return [i for i in word_tokenize(s.lower()) if i not in stop_words]

def tokenization(train_data, var_name='Abstract'):
    tokenized_texts = []
    #print("Tokenization....")
    for _, row in train_data.iterrows():
        text = str(row[var_name])
        words = tokenize(text)
        tokenized_texts.append(words)
    return tokenized_texts

In [78]:
class NaiveBayes:

    def __init__(self):
        self.p_w_class = defaultdict(Counter)
        self.total_words = defaultdict(int)
        self.class_probs = defaultdict(int)

    def prob_word_class(self, tokenized_texts, labels):
        for text, label in zip(tokenized_texts, labels):
            unique_words = np.unique(text)
            for w in unique_words:
                self.p_w_class[label][w] += 1
        for l in labels:
            self.total_words[l] += 1
        for l in self.p_w_class:
            doc_count = self.total_words[l]
            for w in self.p_w_class[l]:
                self.p_w_class[l][w] /= doc_count

    def prob_class(self, labels):
        for l in labels:
            self.class_probs[l] += 1
        tot = len(labels)
        for c in self.class_probs:
            self.class_probs[c] /= tot
            self.class_probs[c] = np.log(self.class_probs[c])

    def reset(self):
        self.__init__()

    def train(self, train_data, train_labels):
        self.prob_word_class(train_data, train_labels)
        self.prob_class(train_labels)

    def predict(self, test_data):
        labels_model = []
        for text in test_data:
            p_class_given_words = self.class_probs.copy()
            for w in text:
                for c in p_class_given_words:
                    if w in self.p_w_class[0] and w in self.p_w_class[1]:
                        p_class_given_words[c] += np.log(self.p_w_class[c][w])
            max_c = max(p_class_given_words, key=p_class_given_words.get)
            labels_model.append(max_c)
        return labels_model



In [79]:
from time import time

s = time()
tokenized_train_texts = tokenization(train_data, var_name="abs_title")
model = NaiveBayes()
model.train(tokenized_train_texts, targets_train)
e = time()
# labels_model = model.predict(tokenized_test_texts)
print(s - e)

-18.514108180999756


In [80]:
(e - s) / 60

0.30856846968332924

In [81]:
s = time()
tokenized_test_texts = tokenization(test_data, var_name="abs_title")
labels_model = model.predict(tokenized_test_texts)
e = time()
print( (e - s) / 60)

0.04295214017232259


In [82]:
tn, fp, fn, tp = confusion_matrix(labels_model, targets_test).ravel()
print('accuracy:', accuracy_score(labels_model, targets_test))
print('precision:', precision_score(labels_model, targets_test))
print('recall:', recall_score(labels_model, targets_test))
print('f1_score:', f1_score(labels_model, targets_test))
print('fp_rate:', fp / (fp + tn))
print('fn_rate:', fn / (fn + tp))

accuracy: 0.9288732394366197
precision: 0.9077540106951871
recall: 0.9549929676511955
f1_score: 0.9307745030843043
fp_rate: 0.09732016925246827
fn_rate: 0.0450070323488045


In [83]:
path = '../datasets/DILI_CAMDA_challenge/'
test_data_val_1 = pd.read_csv(path + "DILI_validation_set_1.tsv", sep='\t')
test_data_val_2 = pd.read_csv(path + "DILI_validation_set_2.tsv", sep='\t')
test_data_val_1["abs_title"] = test_data_val_1["Abstract"] + " " + test_data_val_1["Title"]

tokenized_test_texts_val_1 = tokenization(test_data_val_1, var_name="abs_title")
tokenized_test_texts_val_2 = tokenization(test_data_val_2)

labels_model_val_1 = pd.DataFrame( model.predict(tokenized_test_texts_val_1) )
labels_model_val_2 = pd.DataFrame( model.predict(tokenized_test_texts_val_2) )
labels_model_val_1.to_csv('../submissions/NB/labels_nb_val_1.csv', index=False, header=["Labels"])
labels_model_val_2.to_csv('../submissions/NB/labels_nb_val_2.csv', index=False, header=["Labels"])

# DILI extended dataset

In [84]:
path = '../datasets/'
data_raw = pd.read_csv(path + "merged_additional_data_dili_cleaned.csv")
data_raw.fillna("")
data_raw["abs_title_anno"] = data_raw["abstract"].astype(str) + " " + data_raw["title"].astype(str) + " " + data_raw["tox_annotation"].astype(str)
data_subset = data_raw.groupby("label").sample(n=sum(data_raw["label"]), random_state=1)

indices = np.random.permutation(data_subset.index)
data = data_raw.loc[indices]
data = data_raw.sample(frac=1)
idx = int(data.shape[0] * 0.1)
test_data = data.iloc[:idx]
train_data = data.iloc[idx:]
targets_train = train_data['label'].values
targets_test = test_data['label'].values

In [85]:
s = time()
tokenized_train_texts = tokenization(train_data, var_name="abs_title_anno")
model = NaiveBayes()
model.train(tokenized_train_texts, targets_train)
e = time()
# labels_model = model.predict(tokenized_test_texts)
print(s - e)

-42.21417593955994


In [86]:
(e - s) / 60

0.7035695989926656

In [87]:
s = time()
tokenized_test_texts = tokenization(test_data, var_name="abs_title_anno")
labels_model = model.predict(tokenized_test_texts)
e = time()
print( (e - s) / 60)

0.09738533099492391


In [88]:
tn, fp, fn, tp = confusion_matrix(labels_model, targets_test).ravel()
print('accuracy:', accuracy_score(labels_model, targets_test))
print('precision:', precision_score(labels_model, targets_test))
print('recall:', recall_score(labels_model, targets_test))
print('f1_score:', f1_score(labels_model, targets_test))
print('fp_rate:', fp / (fp + tn))
print('fn_rate:', fn / (fn + tp))

accuracy: 0.9268645908761767
precision: 0.9707792207792207
recall: 0.8367537313432836
f1_score: 0.8987975951903808
fp_rate: 0.015976331360946745
fn_rate: 0.16324626865671643


In [89]:
path = '../datasets/DILI_CAMDA_challenge/'
test_data_val_1 = pd.read_csv(path + "DILI_validation_set_1.tsv", sep='\t')
test_data_val_2 = pd.read_csv(path + "DILI_validation_set_2.tsv", sep='\t')
test_data_val_1["abs_title"] = test_data_val_1["Abstract"] + " " + test_data_val_1["Title"]

tokenized_test_texts_val_1 = tokenization(test_data_val_1, var_name="abs_title")
tokenized_test_texts_val_2 = tokenization(test_data_val_2)

labels_model_val_1 = pd.DataFrame( model.predict(tokenized_test_texts_val_1) )
labels_model_val_2 = pd.DataFrame( model.predict(tokenized_test_texts_val_2) )
labels_model_val_1.to_csv('../submissions/NB/labels_nb_extended_val_1.csv', index=False, header=["Labels"])
labels_model_val_2.to_csv('../submissions/NB/labels_nb_extended_val_2.csv', index=False, header=["Labels"])