In [None]:
import os
import skmultilearn
import pickle
import numpy as np
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import tensorflow as tf
from kaggle_secrets import UserSecretsClient
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score

In [2]:
# Relative path of the processed PubMed multi-label CSV read by this notebook.
DATA_PATH = "../input/pubmed-multilabel-text-classification/PubMed Multi Label Text Classification Dataset Processed.csv"
# Load the full CSV into the notebook-wide frame `df`.
df = pd.read_csv(DATA_PATH)

In [3]:
# Build one text field per article: the title, a newline, then the abstract.
# Missing titles/abstracts are treated as empty strings so that a single NaN
# does not turn the whole combined text into NaN (which the later astype(str)
# cast would render as the literal token "nan"). A fixed "\n" separator is used
# instead of os.linesep so the resulting text is identical on every platform.
df['consolidated'] = df['Title'].fillna('') + "\n" + df['abstractText'].fillna('')
df.head()

In [4]:
# Remove rows that are exact duplicates of an earlier row, then preview.
df = df.drop_duplicates()
df.head()

In [5]:
# Columns discarded from here on; the title and abstract text were already
# combined into the 'consolidated' column.
unused_columns = ['meshMajor', 'pmid', 'meshid', 'meshroot', 'Title', 'abstractText']
df = df.drop(columns=unused_columns)
df.head()

In [6]:
# Concatenate every article's text (stringified, space-separated) into a
# single corpus string, then count whitespace-delimited tokens.
corpus = ' '.join(str(text) for text in df['consolidated'])
word_dist = nltk.FreqDist(corpus.split())

In [7]:
# Ten most frequent whitespace-delimited tokens in the corpus.
word_dist.most_common(10)

In [8]:
# Word count (whitespace split) of every article's combined text.
consolidated_list = df['consolidated'].tolist()
length = [len(str(text).split()) for text in consolidated_list]

len_art = np.array(length)

# Print the mean, median and standard deviation of the word counts, in that order.
for statistic in (np.mean, np.median, np.std):
    print(statistic(len_art))

In [9]:
# Force the text column to Python strings, then show every column's dtype.
df['consolidated'] = df['consolidated'].astype(str)
df.dtypes

# Exploration

In [10]:
# For every column except the last one, print its name, the sum of its values
# and that sum as a percentage of the number of rows.
ln = df.shape[0]
column_names = df.columns.values.tolist()
for r, name in enumerate(column_names[:-1]):
    positives = sum(df.iloc[:, r])
    print("{0}: {1}, {2}".format(name, positives, (positives / ln) * 100))

In [19]:
from skmultilearn.model_selection import iterative_train_test_split

# Label indicator columns used as the multi-label target matrix.
label_columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']

# Features: the text as an (n_rows, 1) column vector. Targets: (n_rows, 14).
x = df['consolidated'].to_numpy().reshape(-1, 1)
y = df[label_columns].to_numpy()

# 70/30 split; the results are unpacked in the order
# X_train, y_train, X_test, y_test.
X_train, y_train, X_test, y_test = iterative_train_test_split(x, y, test_size=0.3)

In [20]:
# Wrap each (text, labels) split in a tf.data.Dataset, sliced along the
# first axis of the arrays.
train_pair = (X_train, y_train)
test_pair = (X_test, y_test)

train_dataset = tf.data.Dataset.from_tensor_slices(train_pair)
test_dataset = tf.data.Dataset.from_tensor_slices(test_pair)

In [21]:
# Shapes of the split arrays, printed in the order:
# X_train, X_test, y_train, y_test.
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

In [22]:
# Inspect the shapes of the first three (text, label) elements of the
# training dataset.
for feat, targ in train_dataset.take(3):
    text_shape = feat.numpy().shape
    label_shape = targ.numpy().shape
    print('text: ', text_shape)
    print('label: ', label_shape)

In [23]:
BUFFER_SIZE = 10000  # shuffle buffer size, in elements
BATCH_SIZE = 128     # examples per batch

# Training pipeline: shuffle, batch, then prefetch.
# NOTE: this rebinds train_dataset/test_dataset, so re-running the cell without
# rebuilding the datasets would batch already-batched data.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# Test pipeline: batch and prefetch only (no shuffling).
test_dataset = (
    test_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [24]:
# Text -> integer token ids, with every sequence padded/truncated to 200 ids.
vectorizer_options = dict(
    standardize='lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=200,
)
preprocess = tf.keras.layers.TextVectorization(**vectorizer_options)

In [25]:
# Fit the vectorizer's vocabulary on the training text only (labels dropped).
train_text = train_dataset.map(lambda text, label: text)
preprocess.adapt(train_text)

In [26]:
# Learned vocabulary as a NumPy array; its length is the embedding input size.
vocab = np.asarray(preprocess.get_vocabulary())
vocab.shape

In [27]:
# Print the first example (text and label row) of each of the first three
# elements yielded by the training dataset.
for feat, targ in train_dataset.take(3):
    first_text = feat.numpy()[:1]
    first_label = targ.numpy()[:1]
    print('text: ', first_text)
    print('label: ', first_label)

In [72]:
# Vectorize the first three texts of the last inspected batch and show the
# shape of the first vectorized example.
token_ids = preprocess(feat[:3]).numpy()
token_ids[0].shape

In [28]:
# Raw text -> token ids -> 128-d embeddings (id 0 is masked as padding).
text_layers = [
    preprocess,
    tf.keras.layers.Embedding(input_dim=vocab.shape[0], output_dim=128, mask_zero=True),
]
# Two stacked LSTMs; the first returns the full sequence so the second can
# consume it, the second returns only its final output.
# NOTE(review): activation='relu' overrides the LSTM default activation -
# confirm this is intended.
recurrent_layers = [
    tf.keras.layers.LSTM(128, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(128, activation='relu'),
]
# Classification head: one sigmoid unit per label (14 labels).
head_layers = [
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(14, activation='sigmoid'),
]
model = tf.keras.Sequential(text_layers + recurrent_layers + head_layers)

In [29]:
# The model ends in 14 independent sigmoid units (multi-label: one article may
# carry several labels at once), so every output is scored as its own binary
# problem. CategoricalCrossentropy assumes the outputs form a single
# probability distribution over mutually exclusive classes, which does not hold
# here, so BinaryCrossentropy is used instead.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    # Per-label thresholded accuracy. It is named 'accuracy' so the history
    # keys ('accuracy' / 'val_accuracy') stay the same as before.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [32]:
# Train for 10 epochs, validating against the test dataset.
# NOTE(review): validation_steps=5 presumably limits each validation pass to
# five batches of test_dataset - confirm that a partial validation is intended.
fit_options = dict(
    epochs=10,
    validation_data=test_dataset,
    validation_steps=5,
)
history = model.fit(train_dataset, **fit_options)

In [78]:
# Display the batched test dataset object as the cell output (nothing is modified).
test_dataset

# EDA on the Labels

In [11]:
# Number of labels assigned to each article.
# The label columns are selected by excluding the non-label columns by name
# instead of by position: a positional slice (everything but the last column)
# is only right the first time this cell runs. Once 'label_sum' exists it
# becomes the last column, and a re-run would pull the text column into the
# sum. errors='ignore' keeps the first run (no 'label_sum' yet) working.
df_labels = df.drop(columns=['consolidated', 'label_sum'], errors='ignore')
df['label_sum'] = df_labels.sum(axis=1)
df['label_sum']

In [12]:
# Largest number of labels carried by any single article.
df['label_sum'].max()

In [13]:
def label_comb(label_sum, cols, frame=None):
    """Describe the label combination of every row with a given label count.

    Parameters
    ----------
    label_sum : int
        Only rows whose 'label_sum' column equals this value are considered.
    cols : iterable of str
        Label column names, in the order their names are concatenated.
    frame : pandas.DataFrame, optional
        Frame to inspect; it must contain 'label_sum' and every column in
        `cols`. Defaults to the notebook-level `df`, which keeps existing
        two-argument calls working unchanged.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        First array: for each selected row, the concatenated names of the
        columns in `cols` whose value equals 1 (e.g. 'ABD').
        Second array: the index labels of those rows, in the same order.
        Both arrays are empty when no row matches.
    """
    if frame is None:
        # Fall back to the notebook-wide DataFrame.
        frame = df

    cols = list(cols)
    # One .loc selection replaces per-cell chained lookups (df[c][i]).
    selected = frame.loc[frame['label_sum'] == label_sum, cols]
    # Boolean matrix: True where a label column is set for that row.
    flags = selected.to_numpy() == 1

    var = [''.join(c for c, hit in zip(cols, row) if hit) for row in flags]
    indices = selected.index.tolist()

    return np.array(var), np.array(indices)





def label_indices(var):
    """Group row indices by label combination.

    Parameters
    ----------
    var : pandas.DataFrame
        Frame with a 'label' column (label-combination string) and an 'index'
        column (the row index that combination belongs to).

    Returns
    -------
    pandas.DataFrame
        One row per distinct label, in order of first appearance, with columns
        'label' and 'indices' (the list of 'index' values for that label).
        An empty input yields an empty frame with the same two columns.
    """
    # Collect all records first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = [
        {
            'label': label,
            'indices': var.loc[var['label'] == label, 'index'].tolist(),
        }
        for label in var['label'].unique()
    ]

    return pd.DataFrame(records, columns=['label', 'indices'])

In [14]:
# Every column except the last two.
cols = df.iloc[:, :-2].columns

# l1 .. l13: for each label count k from 1 to 13, a frame with one row per
# article carrying exactly k labels, with columns 'label' (the combination
# string) and 'index' (the article's row index).
(l1, l2, l3, l4, l5, l6, l7,
 l8, l9, l10, l11, l12, l13) = (
    pd.DataFrame(label_comb(count, cols), index=['label', 'index']).T
    for count in range(1, 14)
)

In [15]:
# Replace each per-count frame with its grouped form: one row per distinct
# label combination plus the list of row indices that carry it.
# NOTE: l1..l13 are rebound here, so this cell must not be re-run without
# first re-running the cell that builds them.
(l1, l2, l3, l4, l5, l6, l7,
 l8, l9, l10, l11, l12, l13) = tuple(
    label_indices(combination_frame)
    for combination_frame in (l1, l2, l3, l4, l5, l6, l7,
                              l8, l9, l10, l11, l12, l13)
)

In [19]:
# Display the grouped label combinations for articles carrying exactly 13 labels.
l13