# Baseline Accuracy Establishment

Before stepping into the workflow described in the proposal, this notebook makes a simple estimate of the accuracy that can be expected from traditional machine learning approaches, without any parameter tuning or data wrangling.

In [1]:
import datetime, time, psutil, os
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Control parameters
# Fraction of the leading rows of each CSV that is kept for this run.
size = 0.1
# True -> feed TF-IDF features to the classifiers; False -> feed raw term counts.
tfTransform = True

In [3]:
# Load the train and test splits from the local data/ directory
# (paths are relative to the notebook's working directory).
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [4]:
# Keep only the leading `size` fraction of each split so the baselines run quickly.
# NOTE: this cell rebinds train_data/test_data, so re-running it without
# re-loading the CSVs shrinks the data again.
# .copy() detaches the subset from the full frame; without it, adding a column
# to train_data afterwards can raise pandas' SettingWithCopyWarning.
train_data = train_data.iloc[:int(len(train_data) * size)].copy()
test_data = test_data.iloc[:int(len(test_data) * size)].copy()
train_data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [5]:
# Class balance of the target column in the sampled training set.
train_data['target'].value_counts()

0    122567
1      8045
Name: target, dtype: int64

In [6]:
# Whitespace-delimited token count per question; str() guards against
# non-string cells (e.g. missing values) before splitting.
train_data['num_words'] = train_data['question_text'].map(
    lambda question: len(str(question).split())
)

In [7]:
# Summary statistics of the per-question word counts.
train_data['num_words'].describe()

count    130612.000000
mean         12.774462
std           7.026658
min           1.000000
25%           8.000000
50%          11.000000
75%          15.000000
max          65.000000
Name: num_words, dtype: float64

In [8]:
%%time

# Raw question text of both splits, plus the training labels.
train_text = train_data['question_text']
test_text = test_data['question_text']
train_target = train_data['target']
# Series.append was removed in pandas 2.0; pd.concat builds the same
# train-then-test series and, like append, keeps the original index labels.
all_text = pd.concat([train_text, test_text])

CPU times: user 8.35 ms, sys: 1.3 ms, total: 9.64 ms
Wall time: 8.37 ms


In [9]:
%%time

# Learn the bag-of-words vocabulary from the combined train + test text, then
# encode each split as a document-term count matrix with that vocabulary.
count_vectorizer = CountVectorizer().fit(all_text)
train_text_features_cv, test_text_features_cv = (
    count_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)

CPU times: user 3.99 s, sys: 88 ms, total: 4.07 s
Wall time: 4.08 s


In [10]:
%%time

# Fit the TF-IDF vocabulary and IDF weights on the combined train + test text,
# then encode each split with the fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer().fit(all_text)
train_text_features_tf, test_text_features_tf = (
    tfidf_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)

CPU times: user 4.73 s, sys: 96.5 ms, total: 4.83 s
Wall time: 4.26 s


In [11]:
# Pick the representation that feeds the classifiers: TF-IDF when tfTransform
# is truthy, raw term counts otherwise.
train_text_features, test_text_features = (
    (train_text_features_tf, test_text_features_tf)
    if tfTransform
    else (train_text_features_cv, test_text_features_cv)
)

print(train_text_features.shape)

(130612, 67139)


In [12]:
%%time

# BernoulliNB baseline: 5-fold out-of-fold (OOF) predictions scored with F1.

kfold = KFold(n_splits=5, shuffle=True, random_state=1)
# Each fold's model contributes equally to the averaged test prediction;
# deriving the weight from the splitter keeps it correct if n_splits changes.
fold_weight = 1.0 / kfold.get_n_splits()
test_preds = 0
oof_preds = np.zeros(train_data.shape[0])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features[train_idx, :], train_text_features[valid_idx, :]
    # KFold yields positional indices, so select labels by position (.iloc)
    # rather than by index label.
    y_train = train_target.iloc[train_idx]
    classifier = BernoulliNB()
    classifier.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    oof_preds[valid_idx] = classifier.predict_proba(x_valid)[:, 1]
    test_preds += fold_weight * classifier.predict_proba(test_text_features)[:, 1]

# Fixed decision threshold of 0.25 on the OOF probabilities.
# np.int was removed in NumPy 1.24; the builtin int yields the same integer labels.
pred_train = (oof_preds > 0.25).astype(int)
# NOTE: despite the variable name, this value is the F1 score, not accuracy.
accuracy = f1_score(train_target, pred_train)
print(accuracy)

0.42209844949570974
CPU times: user 1.41 s, sys: 71.5 ms, total: 1.48 s
Wall time: 415 ms


In [13]:
%%time

# LogisticRegression baseline: 5-fold out-of-fold (OOF) predictions scored with F1.

kfold = KFold(n_splits=5, shuffle=True, random_state=1)
# Each fold's model contributes equally to the averaged test prediction;
# deriving the weight from the splitter keeps it correct if n_splits changes.
fold_weight = 1.0 / kfold.get_n_splits()
test_preds = 0
oof_preds = np.zeros(train_data.shape[0])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features[train_idx, :], train_text_features[valid_idx, :]
    # KFold yields positional indices, so select labels by position (.iloc)
    # rather than by index label.
    y_train = train_target.iloc[train_idx]
    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    oof_preds[valid_idx] = classifier.predict_proba(x_valid)[:, 1]
    test_preds += fold_weight * classifier.predict_proba(test_text_features)[:, 1]

# Fixed decision threshold of 0.25 on the OOF probabilities.
# np.int was removed in NumPy 1.24; the builtin int yields the same integer labels.
pred_train = (oof_preds > 0.25).astype(int)
# NOTE: despite the variable name, this value is the F1 score, not accuracy.
accuracy = f1_score(train_target, pred_train)
print(accuracy)

0.5603949259513841
CPU times: user 10.2 s, sys: 124 ms, total: 10.3 s
Wall time: 2.59 s


In [14]:
%%time

# GradientBoostingClassifier baseline: 5-fold out-of-fold (OOF) predictions
# scored with F1. This is by far the slowest cell of the three baselines.

kfold = KFold(n_splits=5, shuffle=True, random_state=1)
# Each fold's model contributes equally to the averaged test prediction;
# deriving the weight from the splitter keeps it correct if n_splits changes.
fold_weight = 1.0 / kfold.get_n_splits()
test_preds = 0
oof_preds = np.zeros(train_data.shape[0])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features[train_idx, :], train_text_features[valid_idx, :]
    # KFold yields positional indices, so select labels by position (.iloc)
    # rather than by index label.
    y_train = train_target.iloc[train_idx]
    classifier = GradientBoostingClassifier(random_state=1)
    classifier.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    oof_preds[valid_idx] = classifier.predict_proba(x_valid)[:, 1]
    test_preds += fold_weight * classifier.predict_proba(test_text_features)[:, 1]

# Fixed decision threshold of 0.25 on the OOF probabilities.
# np.int was removed in NumPy 1.24; the builtin int yields the same integer labels.
pred_train = (oof_preds > 0.25).astype(int)
# NOTE: despite the variable name, this value is the F1 score, not accuracy.
accuracy = f1_score(train_target, pred_train)
print(accuracy)

0.485589519650655
CPU times: user 1h 7min 27s, sys: 6.95 s, total: 1h 7min 34s
Wall time: 1h 2min 37s
