In [56]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, f1_score

In [9]:
HOME = Path('/home/kdang/dataset/quora-insincere-questions-classification/')
TRAIN = HOME / 'train.csv'
TEST = HOME / 'test.csv'

In [11]:
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)

In [14]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [17]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.question_text, train.target, 
                                                  stratify=train.target, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)


In [25]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

In [43]:
# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)



In [31]:
pred = np.argmax(predictions, axis=1)

In [57]:
print('f1 score', f1_score(pred, yvalid))

f1 score 0.4765478424015009


In [59]:
test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [65]:
xtest = tfv.transform(list(test.question_text))

In [67]:
ytest_pred = clf.predict_proba(xtest)
ytest = np.argmax(ytest_pred, axis=1)

In [68]:
ytest

array([0, 0, 0, ..., 0, 0, 0])

In [78]:
submission = test.copy()
submission['prediction']= ytest
submission = submission.drop('question_text', axis=1)
submission.head()
submission.to_csv('submission.csv', index=False)

In [None]:
submission