In [1]:
import datetime
import numpy as np
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
#from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import GaussianNB,MultinomialNB,ComplementNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Log a wall-clock timestamp marking the start of the run, then show which
# files are present in the local "data" directory.
started_at = datetime.datetime.now()
print("{}   start of prototype".format(started_at))
data_dir_entries = os.listdir("data")
print(data_dir_entries)

2019-04-07 18:58:13.693639   start of prototype
['.DS_Store', 'test.csv', 'README.md', 'train.csv', 'sample_submission.csv']


In [3]:
# Read the training and test CSV files into DataFrames.
train_csv_path = 'data/train.csv'
test_csv_path = 'data/test.csv'

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [4]:
# Display the (rows, columns) shape of the training DataFrame.
train_data.shape

(1306122, 3)

In [None]:
# Preview the first rows of the training DataFrame.
train_data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [None]:
# Count how many rows carry each value of the 'target' column (class balance).
train_data['target'].value_counts()

0    1225312
1      80810
Name: target, dtype: int64

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    tokens = str(text).split()
    return len(tokens)

# Add a per-question word count column derived from the question text.
train_data['num_words'] = train_data['question_text'].apply(_count_words)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the
# whitespace-token counts stored in the 'num_words' column.
train_data['num_words'].describe()

count    1.306122e+06
mean     1.280361e+01
std      7.052437e+00
min      1.000000e+00
25%      8.000000e+00
50%      1.100000e+01
75%      1.500000e+01
max      1.340000e+02
Name: num_words, dtype: float64

In [None]:
# Pull out the text and label columns used by every model cell below.
train_text = train_data['question_text']
test_text = test_data['question_text']
train_target = train_data['target']

# Series.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and keeps the original indices.
# NOTE(review): the vocabulary is learned from train AND test text together;
# confirm this is intended, since it lets test-set tokens shape the features.
all_text = pd.concat([train_text, test_text])

# Learn the bag-of-words vocabulary over the combined corpus.
count_vectorizer = CountVectorizer()
count_vectorizer.fit(all_text)
print(str(datetime.datetime.now()) + "   vectorization")

2019-04-07 18:58:39.675580   vectorization


In [None]:
# Turn both text columns into sparse token-count matrices using the
# vocabulary already learned by count_vectorizer.
train_text_features_cv, test_text_features_cv = [
    count_vectorizer.transform(corpus) for corpus in (train_text, test_text)
]

In [None]:
# Learn the TF-IDF vocabulary and IDF weights over the combined corpus
# (fit() returns the fitted vectorizer itself, so it can be chained).
tfidf_vectorizer = TfidfVectorizer().fit(all_text)
tfidf_done_at = datetime.datetime.now()
print("{}   TF-IDF".format(tfidf_done_at))

2019-04-07 18:59:19.154847   TF-IDF


In [None]:
# Turn both text columns into sparse TF-IDF matrices using the fitted
# tfidf_vectorizer.
train_text_features_tf, test_text_features_tf = [
    tfidf_vectorizer.transform(corpus) for corpus in (train_text, test_text)
]

In [None]:
#MultinomialNB
# 5-fold cross-validation of multinomial naive Bayes on the token-count
# features. Produces:
#   oof_preds1  - out-of-fold class-1 probability for every training row
#   test_preds1 - class-1 probability on the test set, averaged over folds
#   pred_train1 - oof_preds1 binarised at a 0.3 threshold
# The cell's displayed value is the F1 score of pred_train1 vs. the labels.
print(str(datetime.datetime.now()) + "   MultinomialNB... start")
kfold = KFold(n_splits = 5, shuffle = True, random_state = 2018)
# Each fold contributes an equal share to the averaged test prediction;
# derived from the splitter so it stays correct if n_splits changes.
fold_weight = 1.0 / kfold.get_n_splits()
test_preds1 = 0
oof_preds1 = np.zeros([train_data.shape[0],])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features_cv[train_idx,:], train_text_features_cv[valid_idx,:]
    # KFold yields positional indices, so select labels positionally.
    y_train = train_target.iloc[train_idx]
    classifier1 = MultinomialNB()
    classifier1.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class (target == 1).
    oof_preds1[valid_idx] = classifier1.predict_proba(x_valid)[:,1]
    test_preds1 += fold_weight*classifier1.predict_proba(test_text_features_cv)[:,1]

# The np.int alias was removed in NumPy 1.24; builtin int is the same dtype.
pred_train1 = (oof_preds1 > .3).astype(int)
print(str(datetime.datetime.now()) + "   MultinomialNB... end")
f1_score(train_target, pred_train1)


2019-04-07 18:59:39.658414   MultinomialNB... start
2019-04-07 18:59:44.724745   MultinomialNB... end


0.5336924762600438

In [None]:
#BernoulliNB
# 5-fold cross-validation of Bernoulli naive Bayes on the token-count
# features. Produces:
#   oof_preds2  - out-of-fold class-1 probability for every training row
#   test_preds2 - class-1 probability on the test set, averaged over folds
#   pred_train2 - oof_preds2 binarised at a 0.3 threshold
# The cell's displayed value is the F1 score of pred_train2 vs. the labels.
print(str(datetime.datetime.now()) + "   BernoulliNB...start")
kfold = KFold(n_splits = 5, shuffle = True, random_state = 2018)
# Each fold contributes an equal share to the averaged test prediction;
# derived from the splitter so it stays correct if n_splits changes.
fold_weight = 1.0 / kfold.get_n_splits()
test_preds2 = 0
oof_preds2 = np.zeros([train_data.shape[0],])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features_cv[train_idx,:], train_text_features_cv[valid_idx,:]
    # KFold yields positional indices, so select labels positionally.
    y_train = train_target.iloc[train_idx]
    classifier2 = BernoulliNB()
    classifier2.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class (target == 1).
    oof_preds2[valid_idx] = classifier2.predict_proba(x_valid)[:,1]
    test_preds2 += fold_weight*classifier2.predict_proba(test_text_features_cv)[:,1]

# The np.int alias was removed in NumPy 1.24; builtin int is the same dtype.
pred_train2 = (oof_preds2 > .3).astype(int)
print(str(datetime.datetime.now()) + "   BernoulliNB...end")
f1_score(train_target, pred_train2)


2019-04-07 18:59:44.964762   BernoulliNB...start
2019-04-07 18:59:50.654547   BernoulliNB...end


0.523509358390958

In [None]:
#LogisticRegression
# 5-fold cross-validation of logistic regression (liblinear solver) on the
# token-count features. Produces:
#   oof_preds3  - out-of-fold class-1 probability for every training row
#   test_preds3 - class-1 probability on the test set, averaged over folds
#   pred_train3 - oof_preds3 binarised at a 0.3 threshold
# The cell's displayed value is the F1 score of pred_train3 vs. the labels.
print(str(datetime.datetime.now()) + "    LogisticRegression... start")
kfold = KFold(n_splits = 5, shuffle = True, random_state = 2018)
# Each fold contributes an equal share to the averaged test prediction;
# derived from the splitter so it stays correct if n_splits changes.
fold_weight = 1.0 / kfold.get_n_splits()
test_preds3 = 0
oof_preds3 = np.zeros([train_data.shape[0],])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features_cv[train_idx,:], train_text_features_cv[valid_idx,:]
    # KFold yields positional indices, so select labels positionally.
    y_train = train_target.iloc[train_idx]
    classifier3 = LogisticRegression(solver='liblinear')
    classifier3.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class (target == 1).
    oof_preds3[valid_idx] = classifier3.predict_proba(x_valid)[:,1]
    # The model is fitted on count features, so the test set must be scored
    # on count features too. Scoring the TF-IDF matrix raises no error (same
    # column count) but feeds the model values on a different scale.
    test_preds3 += fold_weight*classifier3.predict_proba(test_text_features_cv)[:,1]

# The np.int alias was removed in NumPy 1.24; builtin int is the same dtype.
pred_train3 = (oof_preds3 > .3).astype(int)
print(str(datetime.datetime.now()) + "    LogisticRegression... end")
f1_score(train_target, pred_train3)


2019-04-07 18:59:50.875276    LogisticRegression... start
2019-04-07 19:04:26.772517    LogisticRegression... end


0.614412941315975

In [None]:
#Support Vector Machines
# 5-fold cross-validation of a kernel SVM on the token-count features.
# Produces:
#   oof_preds4  - out-of-fold class-1 probability for every training row
#   test_preds4 - class-1 probability on the test set, averaged over folds
#   pred_train4 - oof_preds4 binarised at a 0.3 threshold
# The cell's displayed value is the F1 score of pred_train4 vs. the labels.
# NOTE(review): the recorded run logged a start timestamp for this cell but
# no end timestamp; kernel SVC fit time grows at least quadratically with the
# number of samples, so consider subsampling or a linear model at this scale.
print(str(datetime.datetime.now())+ "   Support Vector Machines...start")
kfold = KFold(n_splits = 5, shuffle = True, random_state = 2018)
# Each fold contributes an equal share to the averaged test prediction;
# derived from the splitter so it stays correct if n_splits changes.
fold_weight = 1.0 / kfold.get_n_splits()
test_preds4 = 0
oof_preds4 = np.zeros([train_data.shape[0],])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features_cv[train_idx,:], train_text_features_cv[valid_idx,:]
    # KFold yields positional indices, so select labels positionally.
    y_train = train_target.iloc[train_idx]
    # predict_proba is only available when SVC is built with probability=True;
    # without it the calls below raise instead of returning probabilities.
    classifier4 = svm.SVC(gamma='scale', probability=True)
    classifier4.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class (target == 1).
    oof_preds4[valid_idx] = classifier4.predict_proba(x_valid)[:,1]
    # Score the test set on the same (count) feature space used for fitting,
    # not on the TF-IDF matrix.
    test_preds4 += fold_weight*classifier4.predict_proba(test_text_features_cv)[:,1]

# The np.int alias was removed in NumPy 1.24; builtin int is the same dtype.
pred_train4 = (oof_preds4 > .3).astype(int)
print(str(datetime.datetime.now()) + "   Support Vector Machines...end")
f1_score(train_target, pred_train4)


2019-04-07 19:04:27.108257   Support Vector Machines...start


In [None]:
#RandomForestClassifier
# 5-fold cross-validation of a shallow random forest (100 trees, depth 2) on
# the token-count features. Produces:
#   oof_preds5  - out-of-fold class-1 probability for every training row
#   test_preds5 - class-1 probability on the test set, averaged over folds
#   pred_train5 - oof_preds5 binarised at a 0.3 threshold
# The cell's displayed value is the F1 score of pred_train5 vs. the labels.
print(str(datetime.datetime.now()) + "   RandomForestClassifier...start")
kfold = KFold(n_splits = 5, shuffle = True, random_state = 2018)
# Each fold contributes an equal share to the averaged test prediction;
# derived from the splitter so it stays correct if n_splits changes.
fold_weight = 1.0 / kfold.get_n_splits()
test_preds5 = 0
oof_preds5 = np.zeros([train_data.shape[0],])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features_cv[train_idx,:], train_text_features_cv[valid_idx,:]
    # KFold yields positional indices, so select labels positionally.
    y_train = train_target.iloc[train_idx]
    classifier5 = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=2018)
    classifier5.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class (target == 1).
    oof_preds5[valid_idx] = classifier5.predict_proba(x_valid)[:,1]
    # Score the test set on the same (count) feature space used for fitting,
    # not on the TF-IDF matrix.
    test_preds5 += fold_weight*classifier5.predict_proba(test_text_features_cv)[:,1]

# The np.int alias was removed in NumPy 1.24; builtin int is the same dtype.
pred_train5 = (oof_preds5 > .3).astype(int)
print(str(datetime.datetime.now()) + "   RandomForestClassifier...end")
f1_score(train_target, pred_train5)


In [None]:
#GradientBoostingClassifier
# 5-fold cross-validation of gradient-boosted trees on the token-count
# features. Produces:
#   oof_preds6  - out-of-fold class-1 probability for every training row
#   test_preds6 - class-1 probability on the test set, averaged over folds
#   pred_train6 - oof_preds6 binarised at a 0.3 threshold
# The cell's displayed value is the F1 score of pred_train6 vs. the labels.
print(str(datetime.datetime.now()) + "   GradientBoostingClassifier...start")
kfold = KFold(n_splits = 5, shuffle = True, random_state = 2018)
# Each fold contributes an equal share to the averaged test prediction;
# derived from the splitter so it stays correct if n_splits changes.
fold_weight = 1.0 / kfold.get_n_splits()
test_preds6 = 0
oof_preds6 = np.zeros([train_data.shape[0],])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features_cv[train_idx,:], train_text_features_cv[valid_idx,:]
    # KFold yields positional indices, so select labels positionally.
    y_train = train_target.iloc[train_idx]
    classifier6 = GradientBoostingClassifier(random_state=2018)
    classifier6.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class (target == 1).
    oof_preds6[valid_idx] = classifier6.predict_proba(x_valid)[:,1]
    # Score the test set on the same (count) feature space used for fitting,
    # not on the TF-IDF matrix.
    test_preds6 += fold_weight*classifier6.predict_proba(test_text_features_cv)[:,1]

# Threshold THIS model's out-of-fold predictions (oof_preds6). The original
# thresholded oof_preds5, which reported the random forest's score here.
# The np.int alias was removed in NumPy 1.24; builtin int is the same dtype.
pred_train6 = (oof_preds6 > .3).astype(int)
print(str(datetime.datetime.now())+ "   GradientBoostingClassifier...end")
f1_score(train_target, pred_train6)


In [None]:
# Log a wall-clock timestamp marking the end of the run.
finished_at = datetime.datetime.now()
print("{}   end of prototype".format(finished_at))