In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# Read the train and test CSVs from the kernel's input folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

# Sanity check on what was loaded: (rows, columns) of each frame.
for label, frame in (('Train shape : ', train_df), ('Test shape : ', test_df)):
    print(label, frame.shape)

## Text vectorisation
To apply machine learning/predictive models to text data, we need to transform the unstructured data into a structured form.

From 'Applied Text Analysis with Python' by Tony Ojeda, Rebecca Bilbro, Benjamin Bengfort:

*Machine learning algorithms operate on a numeric feature space, expecting input as a two-dimensional array where rows are instances and columns are features. In order to perform machine learning on text, we need to transform our documents into vector representations such that we can apply numeric machine learning. This process is called feature extraction or more simply, vectorization, and is an essential first step toward language-aware analysis.*

There are a number of vectorisation methods that we will go through below.

#### Bag of words
The basic approach to text vectorisation is the 'bag of words' method. First, a fixed-length vector is defined in which each entry corresponds to one word in a pre-defined dictionary, so the vector is the same size as the dictionary.

The entry for each word in the vector is the number of times that word appears in the text. For example, if our dictionary contains the words {MonkeyLearn, is, the, not, great}, and we want to vectorize the text 'MonkeyLearn is great', we would have the following vector: (1, 1, 0, 0, 1). 

#### Term frequency-inverse document frequency (TFIDF)
A limitation of the bag of words approach is that the technique doesn't capture the meaning of the text, or the context in which the words appear. A single word occurring in a document may be very important, but the noise of frequently occurring words does not allow the word to have an appropriate weighting. TFIDF is a technique for weighting the relative importance of a single word or n-gram.

From Wikipedia:

In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. Tf–idf is one of the most popular term-weighting schemes today; 83% of text-based recommender systems in digital libraries use tf–idf.

The output of TFIDF is a vector of numbers per document, where the number for each word is that word's TFIDF weighting.

In [None]:
from sklearn import model_selection, preprocessing, metrics, ensemble, naive_bayes, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TF-IDF over unigrams, bigrams and trigrams, with English stop words removed.
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))

# Materialise each corpus once so the same lists feed both fit and transform.
train_texts = train_df['question_text'].values.tolist()
test_texts = test_df['question_text'].values.tolist()

# Learn the vocabulary and idf weights from the train and test questions
# combined. `fit` is used instead of `fit_transform`: the combined
# term-document matrix was never kept, so building it only cost time and memory.
tfidf_vec.fit(train_texts + test_texts)

# Project each split onto the shared vocabulary.
train_tfidf = tfidf_vec.transform(train_texts)
test_tfidf = tfidf_vec.transform(test_texts)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
import numpy as np
# Target column as an array, so it can be sliced with the integer index
# arrays that the splitter yields.
train_y = train_df['target'].values

# Shuffled 5-fold splitter; the fixed seed keeps fold membership repeatable.
kf = KFold(shuffle=True, n_splits=5, random_state=2017)

def runModel(train_X, train_y, test_X, test_y, test_X2, random_state=2017):
    """Fit a logistic regression and score two feature matrices with it.

    Parameters
    ----------
    train_X, train_y
        Features and labels the model is fitted on.
    test_X
        First feature matrix to score (the validation fold in this notebook).
    test_y
        Labels for ``test_X``. Accepted so call sites can pass both halves of
        a split, but not used inside this function.
    test_X2
        Second feature matrix to score (the competition test set in this
        notebook).
    random_state
        Seed forwarded to ``LogisticRegression``. The 'sag' solver shuffles
        the data while optimising, so without a seed the fitted model and
        its predictions change from run to run.

    Returns
    -------
    tuple
        ``(pred_test_y, pred_test_y2, model)``: the positive-class
        probabilities for ``test_X`` and ``test_X2``, and the fitted model.
    """
    # sag solver better for large datasets
    model = LogisticRegression(C=5, solver='sag', random_state=random_state)
    model.fit(train_X, train_y)
    # Column 1 of predict_proba is the probability of the positive class.
    pred_test_y = model.predict_proba(test_X)[:, 1]
    pred_test_y2 = model.predict_proba(test_X2)[:, 1]
    return pred_test_y, pred_test_y2, model

# Each fold splits the training set into a dev index (rows used to train the
# model) and a val index (rows used to validate/test it).

# How many of the folds to actually train. The notebook only ever ran the
# first fold; raise this (up to the splitter's n_splits) for a full CV run.
MAX_FOLDS = 1

print("Building model.")
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train_df.shape[0]])
folds_run = 0
# The index splits are then applied to the training TFIDF matrix
for dev_index, val_index in kf.split(train_df):
    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = runModel(dev_X, dev_y, val_X, val_y, test_tfidf)
    # Accumulate test-set probabilities; they are averaged after the loop.
    pred_full_test = pred_full_test + pred_test_y
    # Out-of-fold predictions: only rows of folds that were run get filled in,
    # the rest stay at zero.
    pred_train[val_index] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    folds_run += 1
    if folds_run >= MAX_FOLDS:
        break

# Turn the running sum into a mean over the folds that were run, so the
# probabilities stay in [0, 1] whatever MAX_FOLDS is set to. With a single
# fold this leaves the values unchanged.
pred_full_test = pred_full_test / folds_run

In [None]:
# Message template and scorer for each metric reported per threshold.
threshold_reports = (
    ('F1 score at threshold {0} is {1}', metrics.f1_score),
    ('Precision score at threshold {0} is {1}', metrics.precision_score),
    ('Recall score at threshold {0} is {1}', metrics.recall_score),
)

# Sweep candidate cut-offs over the validation-fold probabilities.
for thresh in np.arange(0.12, 0.20, 0.01):
    # Round away floating-point noise from arange so the value prints cleanly.
    thresh = np.round(thresh, 2)
    # Binarise once per threshold and reuse the labels for every metric.
    val_labels = (pred_val_y > thresh).astype(int)
    for template, scorer in threshold_reports:
        print(template.format(thresh, scorer(val_y, val_labels)))
    print()

In [None]:
# Probability cut-off above which a test question is labelled 1.
threshold = 0.17

# One row per test question: its qid and the thresholded 0/1 prediction.
out_df = pd.DataFrame({
    "qid": test_df["qid"].values,
    "prediction": np.where(pred_full_test > threshold, 1, 0),
})
out_df.to_csv("submission.csv", index=False)