In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
trdata=pd.read_csv("train.csv")

In [5]:
tesdata=pd.read_csv("test.csv")

In [8]:
%%time
def clean_text(txt):
    """""
    cleans the input text in the following steps
    1- replace contractions
    2- removing punctuation
    3- spliting into words
    4- removing stopwords
    5- removing leftover punctuations
    """""
    contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
    def _get_contractions(contraction_dict):
        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

    def replace_contractions(text):
        contractions, contractions_re = _get_contractions(contraction_dict)
        def replace(match):
            return contractions[match.group(0)]
        return contractions_re.sub(replace, text)

    # replace contractions
    txt = replace_contractions(txt)
    
    #remove punctuations
    txt  = "".join([char for char in txt if char not in string.punctuation])
    txt = re.sub('[0-9]+', '', txt)
    
    # split into words
    words = word_tokenize(txt)
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    
    # removing leftover punctuations
    words = [word for word in words if word.isalpha()]
    
    cleaned_text = ' '.join(words)
    return cleaned_text
    
trdata['data_cleaned'] = trdata['question_text'].apply(lambda txt: clean_text(txt))

Wall time: 8min 30s


In [9]:
trdata.head()

Unnamed: 0,qid,question_text,target,data_cleaned
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,How Quebec nationalists see province nation
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,Do adopted dog would encourage people adopt shop
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,Why velocity affect time Does velocity affect ...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,How Otto von Guericke used Magdeburg hemispheres
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,Can I convert montra helicon D mountain bike c...


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
bow_transformer=CountVectorizer(analyzer=clean_text).fit(trdata['data_cleaned'])

In [12]:
tr_bow=bow_transformer.transform(trdata['data_cleaned'])

In [14]:
tr_bow.shape

(1306122, 1313)

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer

In [16]:
tfidf_transformer=TfidfTransformer().fit(tr_bow)

In [17]:
tr_tfidf=tfidf_transformer.transform(tr_bow)

In [18]:
tr_tfidf.shape

(1306122, 1313)

In [19]:
from sklearn.naive_bayes import MultinomialNB

In [20]:
NBmodel=MultinomialNB().fit(tr_tfidf,trdata['target'])

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
xtrain, xtest, ytrain, ytest = train_test_split(trdata['data_cleaned'], trdata['target'].values, shuffle=True, test_size=0.2)

In [23]:
from sklearn.pipeline import Pipeline

In [24]:
pipeline= Pipeline([
    ('bow',CountVectorizer(analyzer=clean_text)),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB())
])

In [25]:
pipeline.fit(xtrain,ytrain)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function clean_text at 0x0000022A5DAF6A60>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [26]:
predictions=pipeline.predict(xtest)

In [27]:
from sklearn.metrics import classification_report

In [28]:
print(classification_report(ytest,predictions))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97    245010
           1       0.03      0.00      0.00     16215

    accuracy                           0.94    261225
   macro avg       0.48      0.50      0.48    261225
weighted avg       0.88      0.94      0.91    261225



In [29]:
#Confusion Matrix
from  sklearn.metrics import confusion_matrix 
cM = confusion_matrix(ytest,predictions)
print(cM)
#generic matrix
tp, fp, fn, tn = cM.ravel()
print(tp,fp,fn,tn)
recall = tp/(tp+fn)
precision=tp/(tp+fp)
print("Recall = Sensitivity = ",tp/(tp+fn))
print("Specificity =", tn/(tn+fp))
print("Accuracy=",(tp+tn)/(tp+tn+fp+fn))
print("Precision=",tp/(tp+fp))
f1score= 2 *(recall*precision)/(precision+recall)
print("f1 score=", f1score)

[[244973     37]
 [ 16214      1]]
244973 37 16214 1
Recall = Sensitivity =  0.9379218720686711
Specificity = 0.02631578947368421
Accuracy= 0.9377892621303474
Precision= 0.9998489857556835
f1 score= 0.9678958982372475
