In [1]:
# !pip install -U nltk

In [1]:
# Reading the input file
import pandas as pd
data = pd.read_csv("test-inputs.txt", sep='\n', header=None)
data.columns = ['body_text']
data.head()

Unnamed: 0,body_text
0,What was the network operated by the Duct PTT ...
1,When did Zhenjin die
2,"The force, therefore, is related directly to t..."
3,In 1785 James Hutton presented what paper to t...
4,What does the ctenophora use to swim


In [2]:
# labelling the data hardcore method
import numpy as np
label = []
Question_words = ['who', 'what', 'when', 'where', 'why', 'whose', 'whom', 'is', 'can', 'does', 'do', 'how']
yesnowords = ["can", "could", "would", "is", "does", "has", "was", "were", "had", "have", "did", "are", "will"]
Interrogative_words = Question_words + yesnowords
for i in range(len(data)):
    words = data['body_text'][i].lower().split(' ')
    if words[0] in Interrogative_words or words[-1]=='?':
        label.append(1)
    else:
        label.append(0)

In [3]:
data['label'] = label
data.head(8)

Unnamed: 0,body_text,label
0,What was the network operated by the Duct PTT ...,1
1,When did Zhenjin die,1
2,"The force, therefore, is related directly to t...",0
3,In 1785 James Hutton presented what paper to t...,0
4,What does the ctenophora use to swim,1
5,"It is the county seat of Duval County, with wh...",0
6,Where is the Asian gold miners strongest in Vi...,1
7,How did france differ from Britain in managing...,1


In [4]:
class_count = data['label'].value_counts()
print(class_count)
print(class_count/sum(class_count)*100)

0    14999
1     9232
Name: label, dtype: int64
0    61.900045
1    38.099955
Name: label, dtype: float64


In [5]:
# Creating a new list of punctuations without ?
import string
import re
new_str_punctuation = ''
for char in string.punctuation:
    if '?' in char:
        new_str_punctuation+=''
    else:
        new_str_punctuation+=char

In [6]:
print(new_str_punctuation)

!"#$%&'()*+,-./:;<=>@[\]^_`{|}~


In [7]:
import re
import nltk
# nltk.download('wordnet')
lemmatizer = nltk.WordNetLemmatizer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in new_str_punctuation])
    tokens = re.findall('\S+', text)
    text = [lemmatizer.lemmatize(word) for word in tokens]
    return text

In [8]:
data['cleaned_text'] = data['body_text'].apply(lambda x: ' '.join(clean_text(x)))
data.head()

Unnamed: 0,body_text,label,cleaned_text
0,What was the network operated by the Duct PTT ...,1,what wa the network operated by the duct ptt t...
1,When did Zhenjin die,1,when did zhenjin die
2,"The force, therefore, is related directly to t...",0,the force therefore is related directly to the...
3,In 1785 James Hutton presented what paper to t...,0,in 1785 james hutton presented what paper to t...
4,What does the ctenophora use to swim,1,what doe the ctenophora use to swim


In [9]:
# CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.DataFrame(X_count.toarray()) # Creating a sparse matrix


#n-gram (bigram)
ngram_vect = CountVectorizer(ngram_range=(2, 2)) # search only for bigram
ngram_counts = ngram_vect.fit_transform(data['cleaned_text'])
ngram_counts_feat = pd.DataFrame(ngram_counts.toarray())

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.DataFrame(X_tfidf.toarray())
print(X_tfidf_feat.shape)

(24231, 16394)


Split into train and test data

In [10]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
X_count_train, X_count_test, y_count_train, y_count_test = train_test_split(X_count_feat, data['label'], test_size=0.2)
X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test = train_test_split(ngram_counts_feat, data['label'], test_size=0.2)
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(X_tfidf_feat, data['label'], test_size=0.2)



In [11]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV

In [13]:
#code can be used for finding best hyperparameter

#from sklearn.model_selection import GridSearchCV

# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [7, 11, 20],
#     'n_estimators': [50, 100, 150]
# }

# # Create a based model
# rf = RandomForestClassifier()

# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, verbose = 2)
# grid_search.fit(X_tfidf_train, y_tfidf_train)

In [14]:
# grid_search.fit(X_tfidf_train, y_tfidf_train)

In [None]:
# n = [10, 15, 20]
# for i in range(len(n)):
#     print('For n_estimaotrs :{}'.format(n[i]))
#     rf = RandomForestClassifier(n_estimators=25, max_depth=None, n_jobs=-1)

#     rf_model = rf.fit(X_tfidf_train, y_tfidf_train)

#     y_pred = rf_model.predict(X_tfidf_test)
#     y_test = y_tfidf_test

#     precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
#     print('Fscore: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fscore,3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

In [13]:
rf = RandomForestClassifier(n_estimators=15, max_depth=None, n_jobs=-1)
rf_model_tfidf = rf.fit(X_tfidf_train, y_tfidf_train)
y_pred = rf_model_tfidf.predict(X_tfidf_test)
y_test = y_tfidf_test
#F1 = 2 * (precision * recall) / (precision + recall)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
print('Fscore: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fscore,3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fscore: 0.886 / Precision: 0.848 / Recall: 0.927 / Accuracy: 0.91


In [None]:
rf = RandomForestClassifier(n_estimators=15, max_depth=None, n_jobs=-1)
rf_model_count = rf.fit(X_count_train, y_count_train)

y_pred = rf_model_count.predict(X_count_test)
y_test = y_count_test

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
print('Fscore: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fscore,3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

In [15]:
rf = RandomForestClassifier(n_estimators=15, max_depth=None, n_jobs=-1)
rf_model_ngram = rf.fit(X_ngram_train, y_ngram_train)
y_pred = rf_model_ngram.predict(X_ngram_test)
y_test = y_ngram_test

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
print('Fscore: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fscore,3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

MemoryError: 

In [12]:
gb = GradientBoostingClassifier(n_estimators=10, max_depth=11)

gb_model = gb.fit(X_count_train, y_count_train)
y_pred = gb_model.predict(X_count_test)
y_test = y_count_test

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
print('Fscore: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

MemoryError: 