# Quora Insincere Questions Logistic Regression Baseline Model

In [1]:
# import packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import spacy
import nltk
import re

from gensim import corpora, models, similarities
import pyLDAvis.gensim

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

np.random.seed(27)

In [2]:
# read the train and test CSVs from the parent directory, then preview train
train, test = (pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv'))
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


#### Text Pre-processing

In [3]:
# Correct the class imbalance by downsampling the majority (sincere, target == 0)
# class to the size of the insincere (target == 1) class, without replacement.
# NOTE: `train` is rebound to the balanced frame; the full data set is no longer
# reachable under that name after this cell runs.
from sklearn.utils import resample

sincere = train[train.target == 0]
insincere = train[train.target == 1]

# random_state pins the sample explicitly (same value as the notebook-level
# np.random.seed) so the rows drawn do not depend on how much of the global
# RNG stream earlier cells happened to consume.
train = pd.concat([resample(sincere,
                     replace = False,
                     n_samples = len(insincere),
                     random_state = 27), insincere])

In [4]:
# Lookup table mapping an English contraction to its expanded form.
# Keys are matched case-sensitively; only the "I..." forms have both a
# capitalised and a lowercase entry.
contractions = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

# Regex alternation takes the FIRST alternative that matches, not the longest,
# so longer contractions must be listed before their own prefixes
# ("can't've" before "can't"); otherwise "can't've" expands to "cannot've".
# Keys are escaped so they are always treated as literal text.
c_re = re.compile('(%s)' % '|'.join(
    re.escape(key) for key in sorted(contractions, key=len, reverse=True)))

def expandContractions(text, c_re=c_re):
    """Replace every contraction found in ``text`` with its expanded form.

    Parameters
    ----------
    text : str
        Input string to scan.
    c_re : compiled regex, optional
        Pattern whose matches are looked up in ``contractions``. Defaults to
        the module-level pattern built from the dictionary keys.

    Returns
    -------
    str
        ``text`` with each match substituted by ``contractions[match]``.

    Notes
    -----
    Matching is case-sensitive and is not anchored to word boundaries, so a
    capitalised form such as "Don't" is left untouched unless it is a key.
    A custom ``c_re`` that matches text missing from ``contractions`` raises
    ``KeyError``.
    """
    def replace(match):
        return contractions[match.group(0)]
    return c_re.sub(replace, text)

In [5]:
# function to clean and lemmatize text and remove stopwords
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_numeric
from gensim.parsing.preprocessing import strip_multiple_whitespaces, strip_non_alphanum, remove_stopwords, strip_short

# Ordered chain of string filters handed to gensim's preprocess_string;
# each filter receives the output of the previous one.
CUSTOM_FILTERS = [
    str.lower,                   # lowercase
    strip_tags,                  # remove html tags
    strip_punctuation,           # replace punctuation with space
    strip_multiple_whitespaces,  # remove repeating whitespaces
    strip_non_alphanum,          # remove non-alphanumeric characters
    strip_numeric,               # remove numbers
    remove_stopwords,            # remove stopwords
    strip_short,                 # remove words less than minsize=3 characters long
]

# spaCy English pipeline, used for lemmatization
nlp = spacy.load('en')

def gensim_preprocess(docs, logging=True):
    """Clean, tokenize and lemmatize a collection of raw text documents.

    Each document has its contractions expanded, is passed through
    ``CUSTOM_FILTERS`` via gensim's ``preprocess_string``, and the surviving
    tokens are re-joined and lemmatized with the spaCy pipeline ``nlp``.
    Tokens whose lemma is ``'-PRON-'`` are dropped.

    Parameters
    ----------
    docs : iterable of str
        Raw documents to process.
    logging : bool, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    pandas.Series
        One list of lemma strings per input document, in input order, with a
        fresh default integer index (the index of ``docs`` is not carried over).
    """
    # Lazily build the cleaned, space-joined text for every document.
    cleaned = (
        " ".join(preprocess_string(expandContractions(doc), CUSTOM_FILTERS))
        for doc in docs
    )
    # disable parts of the language processing pipeline we don't need here to speed up processing
    disabled = ['ner',      # named entity recognition
                'tagger',   # part-of-speech tagger
                'textcat',  # document label categorizer
               ]
    # https://spacy.io/usage/processing-pipelines
    # nlp.pipe processes the texts as a batched stream instead of paying the
    # per-call overhead of nlp(text) once for every document.
    texts_out = [
        [tok.lemma_ for tok in doc if tok.lemma_ != '-PRON-']
        for doc in nlp.pipe(cleaned, disable=disabled)
    ]
    return pd.Series(texts_out)

# spot-check the preprocessing on five training questions (positions 10-14)
gensim_preprocess(train['question_text'].iloc[10:15])

0                  [invent, modern, portfolio, theory]
1    [place, sleep, amsterdam, airport, night, arrive]
2                      [expansion, general, kth, term]
3                        [accomodation, month, berlin]
4    [buddhism, popular, religion, india, religion,...
dtype: object

In [6]:
# Run the full preprocessing pipeline over every question in the (downsampled)
# training frame. %time reports the cost of this single statement; the recorded
# run took roughly 13 minutes of wall time, so this is the slow cell.
%time train_corpus = gensim_preprocess(train.question_text)

CPU times: user 32min 21s, sys: 26min 32s, total: 58min 53s
Wall time: 12min 50s


In [7]:
# learn multi-word phrases from the preprocessed corpus, then freeze the
# learned model into a lighter Phraser for applying it
ngram_phraser = models.Phrases(train_corpus, threshold=1)
ngram = models.phrases.Phraser(ngram_phraser)
# sanity check: phrase-tagged tokens of the first document
print(ngram[train_corpus.iloc[0]])

# run every tokenized document through the phraser
texts = [ngram[doc_tokens] for doc_tokens in train_corpus]

['well', 'learn', 'develope', 'app', 'linux']


In [8]:
# collapse each token list into one space-delimited string and store it
# alongside the original question text
texts = list(map(' '.join, texts))
train['ngrams'] = texts
train.head()

Unnamed: 0,qid,question_text,target,ngrams
243613,2fa4ba03c124bf94fdd8,What is the best to learn developing apps on L...,0,well learn develope app linux
383126,4b16b43ccee0d0375d73,What is the indirect competitors of Starbucks?,0,indirect competitor starbucks
995356,c30ee49476e64839b1c9,How are high level nuclear waste being dispose...,0,high_level nuclear_waste dispose worldwide
737244,9063754fa48fa7c9eed4,What is so great about Google's Associate Prod...,0,great google associate product_manager http_ww...
44857,08c89ffca45ed16018f6,"Which IIT to choose for mtech CSE, IIT Kanpur,...",0,iit choose mtech cse_iit kanpur iit kgp iit_ma...


In [9]:
# bag-of-words representation: learn the vocabulary from the phrase-joined text
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer().fit(train['ngrams'])
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

### Logistic Regression Baseline Model

In [10]:
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for evaluation.
# random_state pins the split explicitly (same value as the notebook-level
# np.random.seed) so it does not depend on which cells ran before this one.
# NOTE: `train` is the class-balanced, downsampled frame at this point, so the
# hold-out set is balanced as well, and `vectorizer` was fitted on all of
# train.ngrams, i.e. its vocabulary also covers the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    train.ngrams, train.target, test_size=0.2, random_state=27)

# fit the baseline classifier on bag-of-words counts
lr = LogisticRegression()
lr.fit(vectorizer.transform(X_train), y_train)

# mean accuracy on the held-out split
print('Logistic Regression Score: ', lr.score(vectorizer.transform(X_test), y_test))



Logistic Regression Score:  0.8692921668110383


In [11]:
from sklearn.metrics import classification_report

# logistic-regression predictions for the held-out questions
y_ = lr.predict(vectorizer.transform(X_test))

# per-class precision / recall / F1 summary
print(classification_report(y_true=y_test, y_pred=y_))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87     16288
           1       0.89      0.85      0.87     16036

   micro avg       0.87      0.87      0.87     32324
   macro avg       0.87      0.87      0.87     32324
weighted avg       0.87      0.87      0.87     32324



In [14]:
from sklearn.metrics import confusion_matrix
# logistic regression: rows are true classes, columns are predicted classes
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_))

Unnamed: 0,0,1
0,14528,1760
1,2465,13571


In [13]:
# # preprocessing/lemmatizing/stemming test data
# %time test_corpus = gensim_preprocess(test.question_text)
# test_texts = [ngram[token] for token in test_corpus]

# test_texts = [' '.join(text) for text in test_texts]
# test['ngrams'] = test_texts
# test.head()

Our logistic regression baseline model's F1 score is 0.483! (Note that this differs from the 0.87 F1 reported above on the balanced hold-out split.)

In [15]:
from sklearn.naive_bayes import BernoulliNB

# Second baseline: Bernoulli naive Bayes fitted on the same bag-of-words
# features and the same train split as the logistic regression.
bnb = BernoulliNB()
# %time reports how long vectorizing X_train plus fitting takes
%time bnb.fit(vectorizer.transform(X_train), y_train)

# mean accuracy on the held-out split
print('Naive Bayes Score: ', bnb.score(vectorizer.transform(X_test), y_test))

CPU times: user 1.24 s, sys: 76 ms, total: 1.32 s
Wall time: 1.32 s
Naive Bayes Score:  0.8646825887885162


In [16]:
# naive Bayes predictions for the held-out questions; %time reports the cost of
# vectorizing X_test plus predicting
%time bnb_y_ = bnb.predict(vectorizer.transform(X_test))

# per-class precision / recall / F1 summary for the naive Bayes model
print(classification_report(y_test, bnb_y_))

CPU times: user 321 ms, sys: 7.33 ms, total: 328 ms
Wall time: 326 ms
              precision    recall  f1-score   support

           0       0.88      0.85      0.86     16288
           1       0.85      0.88      0.87     16036

   micro avg       0.86      0.86      0.86     32324
   macro avg       0.87      0.86      0.86     32324
weighted avg       0.87      0.86      0.86     32324



In [17]:
# naive Bayes: rows are true classes, columns are predicted classes
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=bnb_y_))

Unnamed: 0,0,1
0,13783,2505
1,1869,14167
