In [224]:
import os
import re

from scipy.sparse import csr_matrix
from scipy.sparse import hstack
import numpy as np
import pandas

from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import nltk
import urlextract
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

In [225]:
import warnings
warnings.filterwarnings('ignore')

In [226]:
stemmer = nltk.PorterStemmer()
url_extractor = urlextract.URLExtract()

In [227]:
def fetch_data(data_path='DATA', instances=10000):
    
    data_train = pandas.read_csv(os.path.join(data_path, 'train.csv'))
    
    data_negative = data_train.loc[data_train['target'] == 0][:instances//2]
    data_positive = data_train.loc[data_train['target'] == 1][:instances//2]
    
    data_train = pandas.concat([data_negative, data_positive]).sample(frac=1)
    
    """
    data_test = pandas.read_csv(os.path.join(data_path, 'test.csv'))
    data_test_labels = pandas.read_csv(os.path.join(data_path, 'sample_submission.csv'))
    data_test = data_test.join(data_test_labels.set_index('qid'), on='qid')[:1000]
    """
    
    X = data_train.drop(['qid', 'target'], axis=1).values
    y = data_train['target'].values
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    
    return (X_train, y_train, X_test, y_test, \
                data_train, data_test)

In [228]:
(X_train, y_train, X_test, y_test, \
         data_train, data_test) = fetch_data(instances=100000)

In [229]:
data_train.head()

Unnamed: 0,qid,question_text,target
42209,0841b659140bfac55067,On what basis is the hypothetical graviton sai...,0
708877,8ac8f21a30f4ad02d803,Is the fact that many people do not understand...,1
13742,02b36e9776859470ca05,Why do people say that I'm attractive but in r...,0
42846,086231e80375c76d8bcd,What percentage of Quora viewers actually vote...,0
6967,015b7142c333c5806952,Which molecule do not contain ribose group?,0


In [230]:
class MessageToWordCounterTransform(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for message in X:
            text = message[0] or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text.replace(url, ' URL ')
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [231]:
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [232]:
preprocess_pipeline = Pipeline([
    ("email_to_wordcount", MessageToWordCounterTransform()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [233]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=kf, verbose=3)
score.mean()

[CV]  ................................................................
[CV] ................................ , score=0.8669375, total=   0.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] ................................. , score=0.869125, total=   0.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV] ................................ , score=0.8703125, total=   0.6s
[CV]  ................................................................
[CV] ................................ , score=0.8654375, total=   0.6s
[CV]  ................................................................
[CV] ................................ , score=0.8601875, total=   0.6s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.2s finished


0.8664

In [234]:
X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

In [235]:
print('Accuracy: ', accuracy_score(y_pred, y_test))
print('Precision: ', precision_score(y_pred, y_test))
print('Recall: ', recall_score(y_pred, y_test))
print('F1 Score: ', f1_score(y_pred, y_test))

Accuracy:  0.8634
Precision:  0.8434879256340305
Recall:  0.8758787115727625
F1 Score:  0.8593782170063825
