
# NLP Assignment - Question Classification
This notebook outlines the process for data preprocessing, training, and model creation for Question Classification tasks using pretrained Word2Vec embeddings.


In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.callbacks import EarlyStopping
import gensim.downloader as api
import random


## Data Preprocessing
Load and preprocess the data for question classification.


In [2]:

# Question Classification Data
train_data = pd.read_csv('../Data/TREC/Raw/train.csv')  # Replace with your path
test_data = pd.read_csv('../Data/TREC/Raw/test.csv')  # Replace with your path
# Preprocess Data (Tokenization, padding, label encoding)
# Add your preprocessing steps here
train_data['text'] = train_data['text'].str.lower()
test_data['text'] = test_data['text'].str.lower()

train_data = train_data.drop(columns=['label-fine'])
test_data = test_data.drop(columns=['label-fine'])

In [3]:
coarse_label = train_data['label-coarse'].unique() # returns an array([])
coarse_label = list(coarse_label) 
selection = random.sample(coarse_label, 2)

In [4]:
for index, row in train_data.iterrows():
    label_coarse = train_data.loc[index, 'label-coarse']
    if label_coarse in selection:
        train_data.loc[index, 'label-coarse'] = 'OTHERS'


for index, row in test_data.iterrows():
    label_coarse = test_data.loc[index, 'label-coarse']
    if label_coarse in selection:
        test_data.loc[index, 'label-coarse'] = 'OTHERS'

  train_data.loc[index, 'label-coarse'] = 'OTHERS'
  test_data.loc[index, 'label-coarse'] = 'OTHERS'


In [5]:
train_data

Unnamed: 0,label-coarse,text
0,OTHERS,how did serfdom develop in and then leave russ...
1,OTHERS,what films featured the character popeye doyle ?
2,OTHERS,how can i find a list of celebrities ' real na...
3,OTHERS,what fowl grabs the spotlight after the chines...
4,2,what is the full form of .com ?
...,...,...
5447,OTHERS,what 's the shape of a camel 's spine ?
5448,OTHERS,what type of currency is used in china ?
5449,4,what is the temperature today ?
5450,4,what is the temperature for cooking ?


In [6]:
train_df, dev_df = train_test_split(train_data, test_size = 500, shuffle = True)

path = '../Data/TREC/Processed/'

# Export DataFrame to a CSV file
train_df.to_csv(f'{path}train.csv', index=False)
dev_df.to_csv(f'{path}dev.csv', index=False)
test_data.to_csv(f'{path}test.csv', index=False)


## Loading Pretrained Word2Vec Embeddings
Load the pretrained Word2Vec model.


In [7]:
path = '../Data/TREC/Processed'

training_dev_df = pd.read_csv(f'{path}/dev.csv')
training_df = pd.read_csv(f'{path}/train.csv')
test_df = pd.read_csv(f'{path}/test.csv')
print(set(training_df['label-coarse']))

{'3', '2', 'OTHERS', '4', '5'}


In [8]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

training_df['label-coarse'] = label_encoder.fit_transform(training_df['label-coarse'])
training_dev_df['label-coarse'] = label_encoder.fit_transform(training_dev_df['label-coarse'])
test_df['label-coarse'] = label_encoder.fit_transform(test_df['label-coarse'])


In [9]:
y_train = to_categorical(training_df['label-coarse'])
y_val = to_categorical(training_dev_df['label-coarse'])
y_test = to_categorical(test_df['label-coarse'])

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_df['text'])

X_train_seq = tokenizer.texts_to_sequences(training_df['text'])
X_val_seq = tokenizer.texts_to_sequences(training_dev_df['text'])
X_test_seq = tokenizer.texts_to_sequences(test_df['text'])

MAX_LEN = 50

X_train = pad_sequences(X_train_seq,MAX_LEN)
X_val = pad_sequences(X_val_seq,MAX_LEN)
X_test = pad_sequences(X_test_seq,MAX_LEN)

In [11]:

# Load Pretrained Word2Vec Model
w2v_model = api.load('word2vec-google-news-300')




In [12]:
_, embedding_dim = w2v_model.vectors.shape
print (embedding_dim)

300


In [18]:
vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in w2v_model:
        embedding_vec = w2v_model[word]
        if embedding_vec is not None:
            embedding_matrix[i] = embedding_vec
print (embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.08007812  0.10498047  0.04980469 ...  0.00366211  0.04760742
  -0.06884766]
 [ 0.13964844 -0.00616455  0.21484375 ...  0.05712891  0.09960938
  -0.234375  ]
 ...
 [ 0.34570312 -0.05419922 -0.11816406 ... -0.53125     0.12890625
   0.08740234]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.06054688  0.10693359 -0.03662109 ...  0.39257812  0.07666016
  -0.08056641]]



## Model Building


Using Global Average Pooling

In [19]:
from keras.layers import GlobalAveragePooling1D  # Import the necessary layer

model = Sequential()

# Add Embedding layer
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=MAX_LEN,
                    trainable=False))

# Add a Global Average Pooling layer to perform the aggregation
model.add(GlobalAveragePooling1D())

# Add Dense layers (as in your original code)
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=5, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [20]:
# Training Question Classification Model
NUM_EPOCH = 200
BATCH_SIZE = 64

model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCH, validation_data=(X_val, y_val), callbacks= early_stopper, workers= 4)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200


<keras.src.callbacks.History at 0x207aac86220>

In [21]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.41378143429756165, Test Accuracy: 0.8679999709129333


Using Global Max Pooling

In [22]:
from keras.layers import GlobalMaxPooling1D  # Import the necessary layer

model = Sequential()

# Add Embedding layer
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=MAX_LEN,
                    trainable=False))

# Add a Global Max Pooling layer to perform the aggregation
model.add(GlobalMaxPooling1D())

# Add Dense layers (as in your original code)
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=5, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [23]:
# Training Question Classification Model
NUM_EPOCH = 200
BATCH_SIZE = 64

model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCH, validation_data=(X_val, y_val), callbacks= early_stopper, workers= 4)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200


<keras.src.callbacks.History at 0x207a9c52070>

In [24]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.5106096267700195, Test Accuracy: 0.8220000267028809


Using LSTM/RNN 

In [25]:
# Model for Question Classification
# Define question classification model architecture

model = Sequential()

# Add Embedding layer
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=MAX_LEN,  # Length of input sequences
                    trainable=False))  # Set to True if you want to fine-tune embeddings

# Add LSTM layer
model.add(LSTM(units=128, return_sequences=False))
model.add(Dropout(0.2))
# Add Dense layer(s)
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=5, activation='softmax'))  # num_classes: number of output classes
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


## Training
Training procedures for both NER and question classification models.


In [26]:
# Training Question Classification Model
NUM_EPOCH = 200
BATCH_SIZE = 64

model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCH, validation_data=(X_val, y_val), callbacks= early_stopper, workers= 4)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200


<keras.src.callbacks.History at 0x207a92b2760>


## Evaluation
Metrics and evaluation methods for the models.


In [27]:

# Evaluate NER Model
# Add your evaluation code here
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')
# Evaluate Question Classification Model
# Add your evaluation code here


Test Loss: 0.2971298098564148, Test Accuracy: 0.9259999990463257
