In [8]:
import pandas as pd
from pandas import DataFrame
import nltk
from contextlib import contextmanager
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk.tokenize import word_tokenize
import time
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [2]:
@contextmanager
def timer(task_name="timer"):
    print("----{} started".format(task_name))
    t0 = time.time()
    yield
    print("----{} done in {:.0f} seconds".format(task_name, time.time() - t0))

In [4]:
with timer('read data'):
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')

----read data started
----read data done in 3 seconds


**exploration**

In [49]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [54]:
print(len(train), len(test))
print(len(train) + len(test))

1306122 375806
1681928


In [55]:
print('target = 1:', len(df.loc[train['target'] == 1]))
print('target = 0:', len(df.loc[train['target'] == 0]))

target = 1: 80810
target = 0: 1225312


In [56]:
df1 = train[['question_text']]
print('--- taret == 1 ---')
print(df1.loc[train['target'] == 1][:5])
print('--- taret == 0 ---')
print(df1.loc[train['target'] == 0][:5])

--- taret == 1 ---
                                         question_text
22   Has the United States become the largest dicta...
30   Which babies are more sweeter to their parents...
110  If blacks support school choice and mandatory ...
114  I am gay boy and I love my cousin (boy). He is...
115               Which races have the smallest penis?
--- taret == 0 ---
                                       question_text
0  How did Quebec nationalists see their province...
1  Do you have an adopted dog, how would you enco...
2  Why does velocity affect time? Does velocity a...
3  How did Otto von Guericke used the Magdeburg h...
4  Can I convert montra helicon D to a mountain b...


In [20]:
print(df.question_text[0])
print(df.question_text[22])

How did Quebec nationalists see their province as a nation in the 1960s?
Has the United States become the largest dictatorship in the world?


**tfidf**

* tokenize
* reserve all alphabetic words
* convert all words to lowcase forms
* remove stopwords(sklearn)
* stemming

In [5]:
import Stemmer

def tokenize(raw):
    return [w.lower() for w in word_tokenize(raw) if w.isalpha()]

class StemmedTfidfVectorizer(TfidfVectorizer):
    en_stemmer = Stemmer.Stemmer('en')
    
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: StemmedTfidfVectorizer.en_stemmer.stemWords(analyzer(doc))

In [6]:
tfidf = StemmedTfidfVectorizer(
    tokenizer=tokenize, 
    analyzer="word", 
    stop_words='english', 
    ngram_range=(1,1), 
    min_df=3    # limit of minimum number of counts: 3
)

In [18]:
with timer('tfidf train'):
    txt_all = pd.concat([train.question_text, test.question_text])
    tfidf.fit(txt_all)

----tfidf train started
----tfidf train done in 256 seconds


In [19]:
len(tfidf.vocabulary_) # without setting min_df: 153886

51771

In [21]:
with timer('construct training dataset'):
    X = tfidf.transform(train.question_text)
    y = train.target.values

In [22]:
# save the result
with open('X_train', 'wb') as f:
    pickle.dump(X, f)
with open('y_train', 'wb') as f:
    pickle.dump(y, f)

# load data
'''
X = pickle.load(open('X_train', 'rb'))
y = pickle.load(open('y_train', 'rb'))
'''

In [23]:
print(type(X))

<class 'scipy.sparse.csr.csr_matrix'>


In [7]:
skf = StratifiedKFold(n_splits=10)
print('number of folds:', skf.get_n_splits(X, y))

f1_list = []
for train_index, test_index in skf.split(X, y):
    clf = MultinomialNB().fit(X[train_index], y[train_index])
    y_pred = clf.predict(X[test_index])
    f1_list.append(f1_score(y[test_index], y_pred, average='micro'))

print('average f1_score:', np.mean(f1_list))

number of folds: 10
average f1_score: 0.9440963399324996


**classification report example**

In [10]:
skf = StratifiedKFold(n_splits=10)

target_names = ['0', '1']

for train_index, test_index in skf.split(X, y):
    clf = MultinomialNB().fit(X[train_index], y[train_index])
    y_pred = clf.predict(X[test_index])
    y_true = y[test_index]
    print(classification_report(y_true, y_pred, target_names=target_names))
    break

              precision    recall  f1-score   support

           0       0.95      0.99      0.97    122532
           1       0.70      0.18      0.28      8081

   micro avg       0.94      0.94      0.94    130613
   macro avg       0.82      0.59      0.63    130613
weighted avg       0.93      0.94      0.93    130613



**helper**

In [3]:
import pickle

X = pickle.load(open('X_train', 'rb'))
y = pickle.load(open('y_train', 'rb'))