<a href="https://colab.research.google.com/github/talibilat/NLP/blob/main/XMLC_DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Importing Data from Kaggle



In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d hsrobo/titlebased-semantic-subject-indexing
  

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading titlebased-semantic-subject-indexing.zip to /content
 99% 1.22G/1.24G [00:12<00:00, 115MB/s]
100% 1.24G/1.24G [00:12<00:00, 102MB/s]


In [None]:
from zipfile import ZipFile
file_name = 'titlebased-semantic-subject-indexing.zip'                          #extracting the zip file
with ZipFile(file_name, 'r') as zip:                                            # using the exact name of the downloaded file
  zip.extractall()
  print('Done')

Done


##Importing required libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import Word
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss
from keras.layers import Dense, Activation, Dropout, BatchNormalization,Embedding
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
nltk.download('stopwords')
nltk.download('punkt')
stop = stopwords.words('english')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##Loading Dataset

In [None]:
df = pd.read_csv("econbiz.csv")  
df = df.iloc[0:100000]                                                          # Loading econbiz
df = df[['title','labels']]   
df                                                                              # Only keeping the required columns

Unnamed: 0,title,labels
0,Water insecurity and the poor : issues and res...,18377-5\t18378-3\t18232-3\t10488-1\t10492-3
1,Crop biotechnology in developing countries : a...,18317-2\t16696-6\t11529-1\t10492-3
2,"Medicaid, intergovernmental trends and options",13415-6\t19507-6\t19517-3\t11600-0\t17829-1
3,State solvency regulation of property-casualty...,13297-0\t13294-6\t13818-5\t12228-3\t10894-4\t1...
4,Tax and expenditure limits on local governments,11553-4\t11653-0\t11539-5\t11731-6\t17829-1
...,...,...
99995,Adverse selection and intermediation,19227-5\t12809-4\t10213-5\t19073-6
99996,Organizational drift as a response to resource...,13397-3\t13670-4\t18449-6\t16989-5
99997,Taxes and the portfolio composition of househo...,12212-4\t11650-6\t13398-1\t16989-5
99998,"Overdrafts, interest rate regulation and the e...",12251-1\t11460-4\t10135-6\t10382-3


##Pre-Processing

In [None]:
def pprocessing(df):
    df['title'] = df['title'].apply(lambda x: " ".join(x.lower() for x in x.split()))                      # Converting the document to lower case
    df['title'] = df['title'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))             # Removing the stopwords
    df['title'] = df['title'].apply(lambda x: " ".join([Word(word).stem() for word in x.split()]))         # Stemming the document
    return df

def cleaning(title):
    title_processed = re.sub('[^a-zA-Z]', ' ', title)                           # Removing Numbers
    title_processed = re.sub(r"\s+[a-zA-Z]\s+", ' ', title_processed)           # Removing single characters
    title_processed = re.sub(r'\s+', ' ', title_processed)                      # Removing multiple spaces
    return title_processed

In [None]:
X = []
new_df = pprocessing(df)
title_unprocessed = list(df['title'])

for idx,title in enumerate(title_unprocessed):
  X.append(cleaning(title))

mlb = MultiLabelBinarizer()                                                     #Preprocess labels to be a binary representation
mlb.fit(df.labels)
labels = mlb.classes_
y = mlb.transform(df.labels)



x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)   # Dividing the data into train and test.

In [None]:
tokenizer = Tokenizer(num_words=5000, lower=True)                               #Initialize tokenizer from keras that will vectorize title values

tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

vocabulary_size = len(tokenizer.word_index) + 1

x_train = pad_sequences(x_train, padding= 'post',maxlen=51)
x_test = pad_sequences(x_test,padding='post',maxlen=51)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

x_train shape: (70000, 51)
x_test shape: (30000, 51)


In [None]:
model = Sequential()                                                            # Buiding a sequencial model for KERAS BiLSTM
model.add(Embedding(vocabulary_size, 128, input_length=51))                     # Embedding
model.add(Bidirectional(LSTM(64)))                                              # Using single Bidirectional LSTM
model.add(Dropout(0.5))                                                         # Dropping out 50% of neurons
model.add(Dense(y_train.shape[1], activation='sigmoid'))                        # Using last dense layer with sigmoid function
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])




In [None]:
model.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, 51, 128)           2545152   
                                                                 
 bidirectional_13 (Bidirecti  (None, 128)              98816     
 onal)                                                           
                                                                 
 dropout_23 (Dropout)        (None, 128)               0         
                                                                 
 dense_26 (Dense)            (None, 12)                1548      
                                                                 
Total params: 2,645,516
Trainable params: 2,645,516
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train, y_train,                                                     # Fitting the model
          batch_size=32,
          epochs=2,
          validation_data=[x_test, y_test])

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f06d1fdd710>

In [None]:
pl = np.arange(0.05,1.0,0.05)                                                   # Evaluating the model
for i in pl:
  y_hat = model.predict(x_test)
  y_hat[y_hat>=i] = 1
  y_hat[y_hat<i] = 0
  F1 = f1_score(y_test, y_hat, average="samples")
print('F1 Score of LSTM is :', F1)

F1 Score of LSTM is : 0.6352569133267235


##CNN

In [None]:
model = Sequential()

model.add(Embedding(vocabulary_size,
                    50,
                    input_length=51))
model.add(Dropout(0.2))
model.add(Conv1D(250,
                 3,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(250))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(y_train.shape[1]))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=32,
          epochs=2,
          validation_data=(x_test, y_test))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f06ca2a7f10>

In [None]:
model.summary()

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_19 (Embedding)    (None, 51, 50)            994200    
                                                                 
 dropout_24 (Dropout)        (None, 51, 50)            0         
                                                                 
 conv1d_5 (Conv1D)           (None, 49, 250)           37750     
                                                                 
 global_max_pooling1d_5 (Glo  (None, 250)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_27 (Dense)            (None, 250)               62750     
                                                                 
 dropout_25 (Dropout)        (None, 250)               0         
                                                     

In [None]:
model.fit(x_train, y_train,
          batch_size=32,
          epochs=2,
          validation_data=[x_test, y_test])

pl2 = np.arange(0.05,1.0,0.05)
for i in pl2:
  y2_hat = model.predict(x_test)
  y2_hat[y2_hat>=i] = 1
  y2_hat[y2_hat<i] = 0
  F11 = f1_score(y_test, y2_hat, average="samples")

print('F1 Score of CNN is :', F11)
print('F1 Score of LSTM is :', F1)

Epoch 1/2
Epoch 2/2
F1 Score of CNN is : 0.6671025230513493
F1 Score of LSTM is : 0.6352569133267235
