
# NLP Assignment - Question Classification
This notebook walks through data preprocessing, model creation, training, and evaluation for a Question Classification task using pretrained Word2Vec embeddings.


In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from gensim.models import Word2Vec
import random


## Data Preprocessing
Load and preprocess the data for question classification.


In [2]:

# Question Classification Data
def _load_trec_split(csv_path):
    """Read one TREC CSV split, lowercase its 'text' column and drop 'label-fine'."""
    return (
        pd.read_csv(csv_path)
        # Normalise case so that differently-cased spellings map to the same token.
        .assign(text=lambda frame: frame['text'].str.lower())
        # Only the coarse label is kept; the fine-grained label column is discarded.
        .drop(columns=['label-fine'])
    )


# Replace these with your own paths.
train_data = _load_trec_split('../Data/TREC/Raw/train.csv')
test_data = _load_trec_split('../Data/TREC/Raw/test.csv')

In [3]:
# Distinct coarse labels present in the training data.
# Series.unique() returns a NumPy array, so convert it to a list for sampling.
coarse_label = list(train_data['label-coarse'].unique())

# Pick the two classes to merge with a dedicated, seeded generator so the same
# pair is chosen after every "Restart Kernel -> Run All". The unseeded
# random.sample() used before made the selection change from run to run.
SELECTION_SEED = 42
selection = random.Random(SELECTION_SEED).sample(coarse_label, 2)

In [4]:
# Collapse the two sampled coarse classes into a single 'OTHERS' class.
# This is one vectorized pass instead of a row-by-row iterrows() loop.
# The column is cast to object first so numeric labels and the 'OTHERS' string
# can share it without a per-element dtype upcast.
# Re-running the cell is a no-op: rows already set to 'OTHERS' are not in `selection`.
is_selected = train_data['label-coarse'].isin(selection)
train_data['label-coarse'] = (
    train_data['label-coarse'].astype(object).mask(is_selected, 'OTHERS')
)
# NOTE(review): test_data is not relabelled here; apply the same mapping before
# evaluating on it if that is not done elsewhere - TODO confirm.


In [5]:
# Inspect the training frame after relabelling; a frame this long is rendered
# truncated (first and last rows only).
train_data

Unnamed: 0,label-coarse,text
0,0,how did serfdom develop in and then leave russ...
1,1,what films featured the character popeye doyle ?
2,0,how can i find a list of celebrities ' real na...
3,1,what fowl grabs the spotlight after the chines...
4,OTHERS,what is the full form of .com ?
...,...,...
5447,1,what 's the shape of a camel 's spine ?
5448,1,what type of currency is used in china ?
5449,4,what is the temperature today ?
5450,4,what is the temperature for cooking ?


In [6]:
# Hold out a fixed number of training rows as a development set.
# random_state pins the shuffle so the split is identical on every run;
# without it, Restart & Run All gave a different dev set each time.
DEV_SIZE = 500
SPLIT_SEED = 42
train_df, dev_df = train_test_split(
    train_data, test_size=DEV_SIZE, shuffle=True, random_state=SPLIT_SEED
)

# Show each partition so the sizes and label columns can be sanity-checked.
print("Training set (dev):")
print(dev_df)

print("Training set:")
print(train_df)

print("\nTest set:")
print(test_data)

Training set (dev):
     label-coarse                                               text
2749            0            what do the 12 days of christmas mean ?
2331       OTHERS           what state is niagara falls located in ?
759        OTHERS  where does buzz aldrin want to build a permane...
5057            4  what are the unemployment statistics for the y...
3752       OTHERS                     where was chop suey invented ?
...           ...                                                ...
2405       OTHERS    what state was named the green mountain state ?
1003            1  what car was driven in the 199 release of `` s...
4133            3         who makes the `` die hard '' car battery ?
2646            3  what are the first names of the famous husband...
4445            4             how many beanie baby sites are there ?

[500 rows x 2 columns]
Training set:
     label-coarse                                               text
2816            4  how many cards are dealt t


## Loading Pretrained Word2Vec Embeddings
Load the pretrained Word2Vec model.


In [None]:

# Location of the saved Word2Vec model on disk; replace with your path.
word2vec_path = 'path/to/word2vec.model'
# Restore the pretrained Word2Vec model from that file.
w2v_model = Word2Vec.load(word2vec_path)



## Model Building
Create LSTM models for NER and question classification.


In [None]:

# Named Entity Recognition model.
# Created as an empty Keras Sequential container; no layers are attached yet.
ner_model = Sequential()
# TODO: add the NER layers (e.g., Embedding, LSTM, Dense) before training.

# Question classification model.
# Also an empty Sequential container for now.
question_model = Sequential()
# TODO: add the classification layers (e.g., Embedding, LSTM, Dense) before training.



## Training
Training procedures for both NER and question classification models.


In [None]:

# Training NER Model
# Add your training code here (e.g., ner_model.fit())

# Training Question Classification Model
# Add your training code here (e.g., question_model.fit())



## Evaluation
Metrics and evaluation methods for the models.


In [None]:

# Evaluate NER Model
# Add your evaluation code here

# Evaluate Question Classification Model
# Add your evaluation code here
