In [12]:
# All necessary imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import re
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, auc, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from capstone_utils import *
# Inline plotting
%matplotlib inline

In [2]:
def tokenize(text):
    """
    Tokenize input text, including lemmatization and stop-word removal. Only used for logistic regression.

    The text is lower-cased and every non-alphabetic character is replaced by a
    space before it is split into word tokens.

    Args:
    text (str) -- a string to tokenize

    Returns:
    tokens (list) -- a list of lemmatized words from text, with English stop words removed.
    """
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())

    # tokenize text
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    # Use a set for membership tests: the list returned by NLTK would be
    # scanned linearly once per token.
    stop_words = set(stopwords.words("english"))
    # remove stop words, then lemmatize the remaining tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return tokens

In [4]:
def build_model(max_ngram=1):
    """
    Build an unfitted bag-of-words classification pipeline.

    Token counts (n-grams of size 1 up to max_ngram, produced with `tokenize`)
    are fed into a logistic regression with balanced class weights.

    Args:
    max_ngram (int) -- largest n-gram size to count; 1 means single words only

    Returns:
    pipeline (Pipeline) -- the unfitted scikit-learn pipeline
    """
    vectorizer = CountVectorizer(tokenizer=tokenize, ngram_range=(1, max_ngram))
    classifier = LogisticRegression(class_weight='balanced')

    steps = [('vect', vectorizer), ('clf', classifier)]
    return Pipeline(steps)

In [5]:
# Load the raw train/test questions and normalise their text.
# clean_text is not defined in this notebook (presumably provided by capstone_utils).
path = '../data/'
print('Loading in comments...')
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

print('Cleaning train')
train['question_text'] = train['question_text'].map(clean_text)

print('Cleaning test')
test['question_text'] = test['question_text'].map(clean_text)

Loading in comments...
Cleaning train
Cleaning test


In [6]:
# Hold out 30% of the labelled data as a validation set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    train['question_text'],
    train['target'],
    test_size=0.3,
    random_state=42,
)

In [7]:
# Build the default (unigram) pipeline and fit it on the training split
model = build_model(max_ngram=1)
print('Training model...')
model.fit(X_train, y_train)

Training model...


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [8]:
# Score the validation split twice: class probabilities for the threshold search,
# and hard labels from the model's default decision rule for the baseline F1 score.
# (The labels could equally be derived from the probabilities with a 0.5 cutoff.)
predictions_proba = model.predict_proba(X_test)
predictions = model.predict(X_test)

In [9]:
# Validation F1 score of the default (un-tuned) predictions
print(f1_score(y_true=y_test, y_pred=predictions))

0.5204351508045078


In [24]:
# Search for the best threshold
# threshold_search is not defined in this notebook (presumably provided by capstone_utils — confirm).
# Its result is later read with .get('threshold'), so it is expected to be dict-like.
# NOTE(review): predictions_proba is the full predict_proba output (one column per class);
# confirm that threshold_search selects the positive-class column itself.
best_threshold = threshold_search(y_test, predictions_proba)

F1 score at threshold 0.1 is 0.2712944884853332
F1 score at threshold 0.11 is 0.28311165547506356
F1 score at threshold 0.12 is 0.29395433532312154
F1 score at threshold 0.13 is 0.3043866309661778
F1 score at threshold 0.14 is 0.3144051775290822
F1 score at threshold 0.15 is 0.32404850093847454
F1 score at threshold 0.16 is 0.33329406220546653
F1 score at threshold 0.17 is 0.34199295000568547
F1 score at threshold 0.18 is 0.3501580726043824
F1 score at threshold 0.19 is 0.35817818617658814
F1 score at threshold 0.2 is 0.36625140264229145
F1 score at threshold 0.21 is 0.37360867632380207
F1 score at threshold 0.22 is 0.38119687473169056
F1 score at threshold 0.23 is 0.3878451778208717
F1 score at threshold 0.24 is 0.39459047960234656
F1 score at threshold 0.25 is 0.40131879372385704
F1 score at threshold 0.26 is 0.4076863635940453
F1 score at threshold 0.27 is 0.41409265493859365
F1 score at threshold 0.28 is 0.42032230049213554
F1 score at threshold 0.29 is 0.426101724865751
F1 score a

In [25]:
# Create predictions on the test-set based on the best threshold found above.
# The threshold has to be applied to probabilities: model.predict() returns hard
# class labels, and comparing those against the threshold would simply reproduce
# the default labels and ignore the tuned value.
# Column 1 of predict_proba is the second entry of model.classes_, i.e. the
# positive class for a 0/1 target.
benchmark_submission_predictions = model.predict_proba(test['question_text'])[:, 1]
benchmark_predictions = (benchmark_submission_predictions > best_threshold.get('threshold')).astype(int)

In [27]:
# Assemble the submission frame (question id + predicted label) and write it to disk
submission = pd.DataFrame({"qid": test["qid"].values}).assign(prediction=benchmark_predictions)
submission.to_csv("submission.csv", index=False)

In [22]:
# Use this to avoid restarting the kernel if making changes to capstone_utils.
# Order matters: the module is reloaded first so that the star-import below
# rebinds the notebook's names to the freshly loaded definitions.
import importlib
import capstone_utils
importlib.reload(capstone_utils)
from capstone_utils import *

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading WordNet: Package 'WordNet' not found in
[nltk_data]     index


<module 'capstone_utils' from '/home/ubuntu/jupyter_notebooks/data-science/capstone/submit/capstone_utils.py'>