#### Here I will be trying to classify sentences according to their labels using CNNs.

In [27]:
# imports
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from urllib.request import urlretrieve
seed=1234
%env TF_FORCE_GPU_ALLOW_GROWTH=true

env: TF_FORCE_GPU_ALLOW_GROWTH=true


#### Downloading data: the data is composed of questions as input and their type as output

In [8]:
url='http://cogcomp.org/Data/QA/QC/'
dir_name='data'
def download_data(dir_name,filename,base_url=None):
    '''
    Download a file into dir_name unless it is already there.

    Args:
        dir_name: local directory to store the file in (created if missing).
        filename: name of the file, both remotely and locally.
        base_url: URL prefix that filename is appended to as-is (so it should
            end with "/"). Defaults to the module-level `url`.

    Returns:
        The local path os.path.join(dir_name, filename).

    Raises:
        Whatever urlretrieve raises when the download fails; in that case no
        file is left at the returned path.
    '''
    if base_url is None:
        base_url=url
    os.makedirs(dir_name,exist_ok=True)
    filepath=os.path.join(dir_name,filename)
    if not os.path.exists(filepath):
        # Download under a temporary name and rename only on success, so an
        # interrupted download is never mistaken for a complete cached file.
        partial_path=filepath+'.part'
        try:
            urlretrieve(base_url+filename,partial_path)
            os.replace(partial_path,filepath)
        finally:
            # no-op after a successful rename; removes leftovers after a failure
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return filepath

In [9]:
# Fetch the training file (download_data skips the download if it already exists locally)
train_filename=download_data(dir_name,'train_5500.label')

In [10]:
# Fetch the test file (download_data skips the download if it already exists locally)
test_filename=download_data(dir_name,'TREC_10.label')

##### Loading and preprocessing data: load the text and, for each example, extract the question, category and sub-category

In [11]:
def read_data(filename):
    '''
    Read a question-classification file with the given filename.

    Each non-empty line has the format <cat>:<sub cat> <question>.
    Blank lines are skipped.

    Args:
        filename: path of the file to read (decoded as latin-1).

    Returns:
        A tuple (questions, categories, sub_categories) of three lists of
        strings with one entry per example. Questions are lower-cased and
        stripped of surrounding whitespace.

    Raises:
        ValueError: if a non-empty line contains no ":" separator.
    '''
    # hold question, category and subcategory
    questions,categories,sub_categories=[],[],[]
    with open(filename,'r',encoding='latin-1') as f:
        for line_no,row in enumerate(f,start=1):
            row=row.strip()
            if not row:
                continue
            # Split on the FIRST ":" only, so a colon inside the question
            # (e.g. "a 3:1 ratio") does not cut the question short.
            cat,sep,sub_cat_and_question=row.partition(':')
            if not sep:
                raise ValueError(
                    f"{filename}, line {line_no}: expected '<cat>:<sub cat> <question>', got {row!r}")
            # The first space separates the sub category from the question text
            sub_cat,_,question=sub_cat_and_question.partition(' ')
            questions.append(question.lower().strip())
            categories.append(cat)
            sub_categories.append(sub_cat)
    return questions,categories,sub_categories

In [12]:
# Parse both files into parallel lists: question text, category, sub category
train_questions,train_categories,train_sub_categories=read_data(train_filename)
test_questions,test_categories,test_sub_categories=read_data(test_filename)

In [13]:
# Wrap the parsed train / test lists in pandas dataframes, one row per question
train_df=pd.DataFrame({
    'question':train_questions,
    'category':train_categories,
    'sub_category':train_sub_categories,
})
test_df=pd.DataFrame({
    'question':test_questions,
    'category':test_categories,
    'sub_category':test_sub_categories,
})

In [14]:
# Peek at the first rows to sanity-check the parsed columns
train_df.head()

Unnamed: 0,question,category,sub_category
0,how did serfdom develop in and then leave russ...,DESC,manner
1,what films featured the character popeye doyle ?,ENTY,cremat
2,how can i find a list of celebrities ' real na...,DESC,manner
3,what fowl grabs the spotlight after the chines...,ENTY,animal
4,what is the full form of .com ?,ABBR,exp


In [15]:
# Shuffle the data
# frac=1 samples every row exactly once, i.e. a full permutation; random_state makes it repeatable.
# The original index labels are kept, so the index is no longer 0..n-1 in order.
train_df=train_df.sample(frac=1,random_state=seed)

In [16]:
# Converting string label to integer id
# IDs follow the order in which categories first appear in the (shuffled) training set
unique_cats=train_df.category.unique()
labels_map=dict(zip(unique_cats,np.arange(unique_cats.shape[0])))
print(f"Label-> ID mapping:{labels_map}")
n_classes=len(labels_map)
# Series.map turns labels missing from the mapping into NaN without complaint,
# so fail loudly here instead of feeding NaN labels to the model later on.
unseen_cats=set(test_df['category'].unique())-set(labels_map)
if unseen_cats:
    raise ValueError(f"Test labels missing from the training set: {unseen_cats}")
train_df['category']=train_df['category'].map(labels_map)
test_df['category']=test_df['category'].map(labels_map)

Label-> ID mapping:{'HUM': 0, 'DESC': 1, 'LOC': 2, 'ENTY': 3, 'NUM': 4, 'ABBR': 5}


In [17]:
# Peek again: rows are shuffled and 'category' now holds the integer ids from labels_map
train_df.head()

Unnamed: 0,question,category,sub_category
672,who sings the song `` drink to me with thine e...,0,ind
2518,what are the lyrics to the star spangled banner ?,1,desc
2927,which latin american country is the largest ?,2,country
971,where is logan airport ?,2,other
4502,what does larry king do for a living ?,0,title


In [18]:
# Splitting into train and val sets
from sklearn.model_selection import train_test_split
# random_state pins the split: without it a different 10% lands in the validation
# set on every run, so the vocabulary and all reported metrics change between runs.
train_df,valid_df=train_test_split(train_df,test_size=0.1,random_state=seed)

#### Building a tokenizer that maps words to numerical IDs

In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer
# learn the word -> ID vocabulary from the training questions only
tokenizer=Tokenizer()
train_texts=train_df['question'].tolist()
tokenizer.fit_on_texts(train_texts)
# vocab size = number of known words + 1 (ID 0 is reserved and never assigned to a word)
known_word_count=len(tokenizer.index_word)
n_vocab=known_word_count+1
print(f"Vocab size:{n_vocab}")

Vocab size:7895


In [20]:
# Turn every split (train, validation, test) into word-ID sequences plus a label array.
def _sequences_and_labels(df):
    '''Return (list of word-ID sequences, array of category ids) for one dataframe.'''
    word_id_sequences=tokenizer.texts_to_sequences(df["question"].tolist())
    return word_id_sequences,df["category"].values

train_sequences,train_labels=_sequences_and_labels(train_df)
valid_sequences,valid_labels=_sequences_and_labels(valid_df)
test_sequences,test_labels=_sequences_and_labels(test_df)


#### Questions are fed to the model in batches, and it is very unlikely that all questions have the same number of tokens. If they do not, a tensor cannot be formed, so shorter sequences need to be padded and sequences longer than the specified length need to be truncated.

In [22]:
max_seq_length=22
def _fit_to_max_length(sequences):
    '''Pad short sequences and truncate long ones, both at the end, to max_seq_length IDs.'''
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=max_seq_length,
        padding='post',
        truncating='post',
    )

preprocessed_train_sequences=_fit_to_max_length(train_sequences)
preprocessed_valid_sequences=_fit_to_max_length(valid_sequences)
preprocessed_test_sequences=_fit_to_max_length(test_sequences)


#### Simple CNN to classify sentences

In [24]:
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.models import Model
# reset Keras' global state before (re)building the model
K.clear_session()
# input: one row of max_seq_length word IDs per question, embedded into 64 dimensions
word_id_inputs=layers.Input(shape=(max_seq_length,),dtype='int32')
embedding_out=layers.Embedding(input_dim=n_vocab,output_dim=64)(word_id_inputs)
# three parallel convolutions over the same embeddings, with window widths 3, 4 and 5
conv1_1,conv1_2,conv1_3=[
    layers.Conv1D(100,kernel_size=window_width,strides=1,padding='same',activation='relu')(embedding_out)
    for window_width in (3,4,5)
]
# stack the three sets of feature maps along the channel axis
conv_out=layers.Concatenate(axis=-1)([conv1_1,conv1_2,conv1_3])
# max pooling over the whole sequence length:
# every feature map is reduced to a single value
pool_over_time_out=layers.MaxPool1D(pool_size=max_seq_length,padding='valid')(conv_out)
# drop the now length-1 time axis
flatten_out=layers.Flatten()(pool_over_time_out)
# class probabilities, with L2 weight decay on the classifier weights
out=layers.Dense(
    n_classes,
    activation='softmax',
    kernel_regularizer=regularizers.l2(0.001),
)(flatten_out)
# assemble and compile; the sparse loss takes integer class ids as labels
cnn_model=Model(inputs=word_id_inputs,outputs=out)
cnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
cnn_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 22)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 22, 64)       505280      input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 22, 100)      19300       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 22, 100)      25700       embedding[0][0]                  
______________________________________________________________________________________________

#### Training the model - using a callback for improved performance: a decaying learning rate. The idea is to reduce the learning rate (by some fraction) whenever the model has stopped improving.

In [25]:
# callback: multiply the learning rate by `factor` once val_loss has not improved
# by at least `min_delta` for `patience` epochs, never going below `min_lr`
lr_reduce_callback=tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=3,
    verbose=1,
    mode='auto',
    min_delta=0.0001,
    min_lr=0.000001,
)
# train, validating on the held-out split after every epoch
cnn_model.fit(
    preprocessed_train_sequences,
    train_labels,
    validation_data=(preprocessed_valid_sequences,valid_labels),
    batch_size=128,
    epochs=25,
    callbacks=[lr_reduce_callback],
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 12/25
Epoch 13/25
Epoch 14/25

Epoch 00014: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 15/25
Epoch 16/25
Epoch 17/25

Epoch 00017: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 18/25
Epoch 19/25
Epoch 20/25

Epoch 00020: ReduceLROnPlateau reducing learning rate to 1e-06.
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1a5d49fd580>

In [26]:
# Testing the model on test set
# return_dict=True returns the results keyed by name ('loss', 'accuracy') instead of a plain list
cnn_model.evaluate(preprocessed_test_sequences,test_labels,return_dict=True)



{'loss': 0.3767916262149811, 'accuracy': 0.9020000100135803}