In [9]:
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
from nltk.corpus import stopwords
import string
import numpy as np
from collections import defaultdict, Counter
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/arsentii/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arsentii/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
path = '../datasets/DILI/'
data_raw = pd.read_csv(path + "DILI_data.csv")
indices = np.random.permutation(data_raw.index)
data = data_raw.loc[indices]
data = data_raw.sample(frac=1)
idx = int(data.shape[0] * 0.1)
test_data = data.iloc[:idx]
train_data = data.iloc[idx:]
targets_train = train_data['Label'].values
targets_test = test_data['Label'].values

In [28]:
path = "../datasets/DILI/"
stop_words = set(nltk.corpus.stopwords.words('english') + list(string.punctuation))

def tokenize(s):
    return [i for i in word_tokenize(s.lower()) if i not in stop_words]

def tokenization(train_data, var_name='Abstract'):
    tokenized_texts = []
    #print("Tokenization....")
    for _, row in train_data.iterrows():
        text = str(row[var_name])
        #text = str(row['Title']) + ' ' + str(row['Abstract'])
        words = tokenize(text)
        tokenized_texts.append(words)
    return tokenized_texts

In [5]:
class NaiveBayes:

    def __init__(self):
        self.p_w_class = defaultdict(Counter)
        self.total_words = defaultdict(int)
        self.class_probs = defaultdict(int)

    def prob_word_class(self, tokenized_texts, labels):
        for text, label in zip(tokenized_texts, labels):
            unique_words = np.unique(text)
            for w in unique_words:
                self.p_w_class[label][w] += 1
        for l in labels:
            self.total_words[l] += 1
        for l in self.p_w_class:
            doc_count = self.total_words[l]
            for w in self.p_w_class[l]:
                self.p_w_class[l][w] /= doc_count

    def prob_class(self, labels):
        for l in labels:
            self.class_probs[l] += 1
        tot = len(labels)
        for c in self.class_probs:
            self.class_probs[c] /= tot
            self.class_probs[c] = np.log(self.class_probs[c])

    def reset(self):
        self.__init__()

    def train(self, train_data, train_labels):
        self.prob_word_class(train_data, train_labels)
        self.prob_class(train_labels)

    def predict(self, test_data):
        labels_model = []
        for text in test_data:
            p_class_given_words = self.class_probs.copy()
            for w in text:
                for c in p_class_given_words:
                    if w in self.p_w_class[0] and w in self.p_w_class[1]:
                        p_class_given_words[c] += np.log(self.p_w_class[c][w])
            max_c = max(p_class_given_words, key=p_class_given_words.get)
            labels_model.append(max_c)
        return labels_model



In [20]:
from time import time
tokenized_test_texts = tokenization(test_data)

s = time()
tokenized_train_texts = tokenization(train_data)
model = NaiveBayes()
model.train(tokenized_train_texts, targets_train)
e = time()
# labels_model = model.predict(tokenized_test_texts)
print(s - e)

-14.376672506332397


In [25]:
(e - s) / 60

0.2396112084388733

In [26]:
s = time()
tokenized_test_texts = tokenization(test_data)
labels_model = model.predict(tokenized_test_texts)
e = time()
print( (e - s) / 60)

0.030988132953643797


In [14]:
tn, fp, fn, tp = confusion_matrix(labels_model, targets_test).ravel()
print('accuracy:', accuracy_score(labels_model, targets_test))
print('precision:', precision_score(labels_model, targets_test))
print('recall:', recall_score(labels_model, targets_test))
print('f1_score:', f1_score(labels_model, targets_test))
print('fp_rate:', fp / (fp + tn))
print('fn_rate:', fn / (fn + tp))

accuracy: 0.9154929577464789
precision: 0.8826025459688827
recall: 0.9440242057488654
f1_score: 0.912280701754386
fp_rate: 0.10935441370223979
fn_rate: 0.05597579425113464


In [13]:
path = "../data/"

# Data is taken from here
# https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
data_raw = pd.read_csv(path + 'IMDB_Dataset.csv')
indices = np.random.permutation(data_raw.index)
data = data_raw.loc[indices]
data = data_raw.sample(frac=1)
data = data.replace(to_replace=['negative', 'positive'], value=[0, 1])

idx = int(data.shape[0] * 0.1)
test_data = data.iloc[:idx]
train_data = data.iloc[idx:]
targets_train = train_data["sentiment"].values
targets_test = test_data["sentiment"].values

tokenized_texts = tokenization(train_data, 'review')
tokenized_test_texts = tokenization(test_data, 'review')

In [14]:
model = NaiveBayes()
model.train(tokenized_texts, targets_train)
labels_model = model.predict(tokenized_test_texts)

In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
print('accuracy:', accuracy_score(labels_model, targets_test))
print('precision:', precision_score(labels_model, targets_test))
print('recall:', recall_score(labels_model, targets_test))
print('f1_score:', f1_score(labels_model, targets_test))

accuracy: 0.86
precision: 0.8635108481262327
recall: 0.8607943373967755
f1_score: 0.862150452934226


In [35]:
test_data_val = pd.read_csv(path + "Validation.tsv", sep='\t')
test_data_add_val = pd.read_csv(path + "AdditionalDILItest.csv", sep='\t')
tokenized_test_texts_val = tokenization(test_data_val)
tokenized_test_texts_add_val = tokenization(test_data_add_val)

labels_model_val = pd.DataFrame( model.predict(tokenized_test_texts_val) )
labels_model_add_val = pd.DataFrame( model.predic(tokenized_test_texts_add_val) )
labels_model_val.to_csv('../submission/labels_val.csv', index=False)
labels_model_add_val.to_csv('../submission/labels_add_val.csv', index=False)


AttributeError: module 'pandas' has no attribute 'readc_csv'