## NLP Project to identify Questions in input text file

In [1]:
#!pip install -U nltk  # instal the nltk library

In [2]:
# Reading the input file
import pandas as pd
import csv
data = pd.read_csv("test-inputs.txt", sep='\t', quoting=csv.QUOTE_NONE, header=None)
data.columns = ['body_text']
data.head()

Unnamed: 0,body_text
0,What was the network operated by the Duct PTT ...
1,When did Zhenjin die
2,"The force, therefore, is related directly to t..."
3,In 1785 James Hutton presented what paper to t...
4,What does the ctenophora use to swim


In [3]:
# labelling the data hardcore method
import numpy as np
label = []
Question_words = ['who', 'what', 'when', 'where', 'why', 'whose', 'whom', 'is', 'can', 'does', 'do', 'how']
yesnowords = ["can", "could", "would", "is", "does", "has", "was", "were", "had", "have", "did", "are", "will"]
Interrogative_words = Question_words + yesnowords
for i in range(len(data)):
    words = data['body_text'][i].lower().split(' ')
    if words[0] in Interrogative_words or words[-1]=='?' or words[-1] in Interrogative_words:
        label.append(1)
    else:
        label.append(0)

In [4]:
data['label'] = label
data.head(8)

Unnamed: 0,body_text,label
0,What was the network operated by the Duct PTT ...,1
1,When did Zhenjin die,1
2,"The force, therefore, is related directly to t...",0
3,In 1785 James Hutton presented what paper to t...,0
4,What does the ctenophora use to swim,1
5,"It is the county seat of Duval County, with wh...",0
6,Where is the Asian gold miners strongest in Vi...,1
7,How did france differ from Britain in managing...,1


# Checking for imbalance in dataset

In [5]:
class_count = data['label'].value_counts()
print(class_count)
print(class_count/sum(class_count)*100)

0    14596
1     9699
Name: label, dtype: int64
0    60.078205
1    39.921795
Name: label, dtype: float64


The dataset is not imbalanced as we have enough number of minority samples.

In [6]:
# Creating a new list of punctuations without '?' so as to remove punctuation from our raw data
import string
import re
new_str_punctuation = ''
for char in string.punctuation:
    if '?' in char:
        new_str_punctuation+=''
    else:
        new_str_punctuation+=char

In [7]:
print(new_str_punctuation)

!"#$%&'()*+,-./:;<=>@[\]^_`{|}~


# Creating a Function for cleaning the data which involves removal of punctuations, tokenization and Lemmatization

In [8]:
import re
import nltk
#nltk.download('wordnet')
lemmatizer = nltk.WordNetLemmatizer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in new_str_punctuation])
    tokens = re.findall('\S+', text)
    text = [lemmatizer.lemmatize(word) for word in tokens]
    return text

In [9]:
data['cleaned_text'] = data['body_text'].apply(lambda x: ' '.join(clean_text(x)))
data.head()

Unnamed: 0,body_text,label,cleaned_text
0,What was the network operated by the Duct PTT ...,1,what wa the network operated by the duct ptt t...
1,When did Zhenjin die,1,when did zhenjin die
2,"The force, therefore, is related directly to t...",0,the force therefore is related directly to the...
3,In 1785 James Hutton presented what paper to t...,0,in 1785 james hutton presented what paper to t...
4,What does the ctenophora use to swim,1,what doe the ctenophora use to swim


# Extracting the features from input text with CountVectorizer and TFIDF

Countvectorizer Convert a collection of text documents to a matrix of token counts. Link: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

Tfidf; Term frequency–Inverse document frequency: 
Creates a document-term matrix where the columns represent single unique terms (unigrams) but the cell represents a weighting meant to represent how important a word is to a document.


In [95]:
# CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import pickle
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.DataFrame(X_count.toarray()) # Creating a sparse matrix

#Save vectorizer.vocabulary
pickle.dump(count_vect.vocabulary_,open("feature_count.pkl","wb"))

# #n-gram (bigram)
# ngram_vect = CountVectorizer(ngram_range=(2, 2)) # search only for bigram
# ngram_counts = ngram_vect.fit_transform(data['cleaned_text'])
# ngram_counts_feat = pd.DataFrame(ngram_counts.toarray())

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.DataFrame(X_tfidf.toarray())

#Save vectorizer.vocabulary
pickle.dump(tfidf_vect.vocabulary_,open("feature_tfidf.pkl","wb"))

# Training the features

Split into train and test data

In [64]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
X_count_train, X_count_test, y_count_train, y_count_test = train_test_split(X_count_feat, data['label'], test_size=0.2)
# X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test = train_test_split(ngram_counts_feat, data['label'], test_size=0.2)
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(X_tfidf_feat, data['label'], test_size=0.2)

In [65]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV

In [66]:
def scorer(y_pred, y_test):
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
    print('Fscore: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fscore,3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

    

# Training the random forest classifier and Evaluation with different features

In this case, I have only used 15 estimators due to memory issues. But, we can do a grid search to tune the parameters.

In [103]:
rf = RandomForestClassifier(n_estimators=15, max_depth=None, n_jobs=-1)
rf_model_tfidf = rf.fit(X_tfidf_train, y_tfidf_train)
y_tfidf_pred = rf_model_tfidf.predict(X_tfidf_test)
#F1 = 2 * (precision * recall) / (precision + recall)
print("Score with TFIDF features:")
scorer(y_tfidf_pred, y_tfidf_test)

# Save the trained model
filename = 'rf_model_tfidf.sav'
pickle.dump(rf_model_tfidf, open(filename, 'wb'))


Score with TFIDF features:
Fscore: 0.905 / Precision: 0.865 / Recall: 0.949 / Accuracy: 0.922


In [101]:
rf = RandomForestClassifier(n_estimators=15, max_depth=None, n_jobs=-1)
rf_model_count = rf.fit(X_count_train, y_count_train)
y_count_pred = rf_model_count.predict(X_count_test)

print("Score with count features:")
scorer(y_count_pred, y_count_test)

filename = 'rf_model_count.sav'
pickle.dump(rf_model_count, open(filename, 'wb'))

Score with count features:
Fscore: 0.917 / Precision: 0.88 / Recall: 0.957 / Accuracy: 0.93


Judging from the fscore which is the harmonic mean of precision and recall, I have chosen count features which is slightly better.

# Running the trained model

In [114]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

filename = 'rf_model_count.sav'
rf_model_count = pickle.load(open(filename, 'rb'))

#loading the test data: Final-test-inputs data is same as test-inputs.txt
test_data = pd.read_csv("Final-test-inputs.txt", sep='\t', quoting=csv.QUOTE_NONE, header=None)
test_data.columns = ['body_text']

# Using the same count vocab features in new data as used in training

loaded_vec = CountVectorizer(analyzer=clean_text,vocabulary=pickle.load(open("feature_count.pkl", "rb")))

# Predicitng the ouptut data in batch size of 100
start = 0
data_size = 100
loop_len = len(test_data)//data_size
remainder = len(test_data)%data_size
y_pred_data_1 = []

for i in range(loop_len):
    if i<loop_len:
        count_feat = loaded_vec.fit_transform(test_data['body_text'][start:data_size])
        y_pred_data_2 = rf_model_count.predict(count_feat)
        y_pred_data_1 = np.append(y_pred_data_1, y_pred_data_2)
        start += 100
        data_size +=100
    if remainder !=0 and i==(loop_len-1):
        count_feat = loaded_vec.fit_transform(test_data['body_text'][(data_size-100):len(test_data)])
        y_pred_data_3 = rf_model_count.predict(count_feat) 
        y_pred_data_1 = np.append(y_pred_data_1, y_pred_data_3)

# writing on output file
file = open('output.txt','w')
for i in range(len(y_pred_data_1)):
    if y_pred_data_1[i] == 1:
        file.write('True \n')
    else:
        file.write('false \n')
file.close()
        

In [13]:
#code can be used for finding best hyperparameter

#from sklearn.model_selection import GridSearchCV

# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [7, 11, 20],
#     'n_estimators': [50, 100, 150]
# }

# # Create a based model
# rf = RandomForestClassifier()

# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, verbose = 2)
# grid_search.fit(X_tfidf_train, y_tfidf_train)

# grid_search.fit(X_tfidf_train, y_tfidf_train)

# n = [10, 15, 20]
# for i in range(len(n)):
#     print('For n_estimaotrs :{}'.format(n[i]))
#     rf = RandomForestClassifier(n_estimators=25, max_depth=None, n_jobs=-1)

#     rf_model = rf.fit(X_tfidf_train, y_tfidf_train)

#     y_pred = rf_model.predict(X_tfidf_test)
#     y_test = y_tfidf_test

#     precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
#     print('Fscore: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fscore,3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

# codes to save and reuse the models

In [15]:

# import pickle
# filename = 'rf_model_tfidf.sav'
# pickle.dump(rf_model_tfidf, open(filename, 'wb'))

In [15]:
# import pickle
# filename = 'rf_model_count.sav'
# pickle.dump(rf_model_count, open(filename, 'wb'))

In [21]:
# filename = 'rf_model_count.sav'
# rf_model = pickle.load(open(filename, 'rb'))



# codes for gradientboost classifier which I am unable to currenlty run with my cloud memory issues.

In [None]:
# gb = GradientBoostingClassifier(n_estimators=10, max_depth=11)

# gb_model = gb.fit(X_count_train, y_count_train)
# y_pred = gb_model.predict(X_count_test)
# y_test = y_count_test

# precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
# print('Fscore: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(
#     round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

We can still improve and select other models that fits the data best by hyperparamter tuning with grid search with cross-validation, and checking for other classifiers as well.
In this project, i have tried to compare the tfidf and count features of the input text data with rf model.