# Quora Question pairs


In this session we will prepare NLP's Deliverable 1.


The goal is to build a solution for:

https://www.kaggle.com/c/quora-question-pairs/overview

In [1]:
import pandas as pd
import scipy
import sklearn
from sklearn import *
import numpy as np
import os

In [2]:
# Instantiate an (unfitted) Perceptron linear classifier.
# NOTE(review): `cl` is not referenced again in the visible part of the notebook.
cl = sklearn.linear_model.Perceptron()

In [None]:
# List the working directory -- presumably IPython's `ls` automagic (this is
# not plain Python), e.g. to check that the data file read below is present.
ls

In [None]:
# Create training, validation and test partitions.
# 5% of the data is held out for test, then 5% of the remainder for validation.
# Both splits are seeded so that all three partitions are reproducible across
# runs (previously only the test split was seeded).
quora_df = pd.read_csv("./quora_data.csv")
A_df, test_df = sklearn.model_selection.train_test_split(
    quora_df, test_size=0.05, random_state=123
)
train_df, val_df = sklearn.model_selection.train_test_split(
    A_df, test_size=0.05, random_state=123
)
print('train_df.shape=', train_df.shape)
print('val_df.shape=', val_df.shape)
print('test_df.shape=', test_df.shape)

In [None]:
# Preview ten training rows (positions 10 to 19).
train_df.iloc[10:20]

Let us train a first model....

In [None]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1, 1))

# Collect the raw (un-cast) question columns across the three partitions,
# in the order train, validation, test.
partitions = (train_df, val_df, test_df)
all_q1 = [question for partition in partitions for question in partition["question1"]]
all_q2 = [question for partition in partitions for question in partition["question2"]]
all_questions = all_q1 + all_q2

len(all_questions)

The following code will not work

```python
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(all_questions)
```

In [None]:
# Expected to fail: `all_questions` still holds the raw column values, and the
# error reported for this cell says np.nan is an invalid document.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(all_questions)

We got an error. What should we do?

```
ValueError: np.nan is an invalid document, expected byte or unicode string.
```


#####  Exercise 1:  `cast_list_as_strings`

Build a function  **`cast_list_as_strings`** that casts each element in the input list to a string.


In [None]:
import numpy as np

# Which Python types occur among the raw question1 values?
type_names = [type(question).__name__ for question in all_q1]
print(np.unique(type_names))

# Print every position whose value is a plain float rather than text.
for position, question in enumerate(all_q1):
    if type(question) is float:
        print(position, question)

In [16]:
def cast_list_as_strings(mylist):
    """
    Convert every element of `mylist` to its `str()` representation.

    Non-string entries are stringified rather than dropped, so a missing
    value such as `float("nan")` becomes the literal text "nan".
    A new list is returned; the input is left untouched.
    """
    return [str(item) for item in mylist]

In [21]:
# Cast both question columns of every partition to lists of plain strings.
question_columns = ("question1", "question2")
q1_train, q2_train = (cast_list_as_strings(list(train_df[column])) for column in question_columns)
q1_val, q2_val = (cast_list_as_strings(list(val_df[column])) for column in question_columns)
q1_test, q2_test = (cast_list_as_strings(list(test_df[column])) for column in question_columns)

In [None]:
import numpy as np

# Sanity check: list the distinct type names found among the casted questions.
type_names = [type(question).__name__ for question in q2_train]
np.unique(type_names)

In [None]:
# Peek at one casted training pair (position 1 of each list).
q1_train[1], q2_train[1]

Use all the questions in the train partition to build a single list `all_questions` to fit the `count_vectorizer`.

In [24]:
# Training questions only: question1 entries first, then question2 entries.
all_questions = [*q1_train, *q2_train]

In [None]:
# Learn the unigram vocabulary from `all_questions`; `fit` returns the
# vectorizer itself, which is echoed as the cell output.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(1, 1)
).fit(all_questions)
count_vectorizer

#####  Exercise 2:  `get_features_from_df`

Make a function `get_features_from_df` that, given a dataframe in the same format as the training data,
returns a scipy sparse matrix with the features from question 1 and question 2.

In [26]:

def get_features_from_df(df, count_vectorizer):
    """
    Build one sparse feature row per question pair in `df`.

    The "question1" and "question2" columns are cast to strings and each is
    passed through `count_vectorizer.transform`; the two resulting matrices
    are stacked horizontally, the question1 block first, then question2.
    """
    # Example pair:
    #   q1: what is kaggle
    #   q2: What is the kaggle platform
    per_question_features = [
        count_vectorizer.transform(cast_list_as_strings(list(df[column])))
        for column in ("question1", "question2")
    ]
    return scipy.sparse.hstack(per_question_features)

In [None]:
# Try the helper on the training partition; the returned sparse matrix is
# echoed as the cell output.
get_features_from_df(train_df, count_vectorizer)

In [None]:
# Vectorise the train and test partitions with the same fitted vectorizer.
X_tr_q1q2, X_te_q1q2 = (
    get_features_from_df(partition, count_vectorizer)
    for partition in (train_df, test_df)
)

X_tr_q1q2.shape, train_df.shape, test_df.shape, X_te_q1q2.shape

Now you can use this representation `X_tr_q1q2` to fit a model.

In [None]:
# Targets first, then the seeded liblinear logistic regression fitted on the
# stacked question1/question2 features.
y_train = train_df["is_duplicate"].values
logistic = sklearn.linear_model.LogisticRegression(random_state=123, solver="liblinear")
logistic.fit(X_tr_q1q2, y_train)

## Examine mistakes


#####  Exercise 3:  `get_mistakes`

Make a function `get_mistakes` that, given a model `clf`, the features `X_q1q2` and the target labels `y`, returns:


- incorrect_indices: coordinates where the model made a mistake
- predictions: predictions made by the model


In [30]:
def get_mistakes(clf, X_q1q2, y):
    """
    Compare the predictions of `clf` on `X_q1q2` against the true labels `y`.

    Parameters
    ----------
    clf : object exposing a `predict(X)` method.
    X_q1q2 : features, passed unchanged to `clf.predict`.
    y : true labels, compared element-wise with the predictions.

    Returns
    -------
    incorrect_indices : 1-D integer array with the positions where the
        prediction differs from the label (empty if there are no mistakes).
    predictions : the value returned by `clf.predict(X_q1q2)`.
    """
    predictions = clf.predict(X_q1q2)
    # np.asarray keeps the comparison element-wise and positional even when
    # `y` is a plain list or a pandas Series.
    incorrect_predictions = predictions != np.asarray(y)
    (incorrect_indices,) = np.where(incorrect_predictions)

    if incorrect_indices.size == 0:
        print("no mistakes in this df")

    # Always return the pair: returning None in the no-mistake case would make
    # `indices, predictions = get_mistakes(...)` fail with a TypeError.
    return incorrect_indices, predictions

In [31]:
# Re-derive the training labels and collect the model's training-set mistakes.
y_train = train_df["is_duplicate"].values

mistake_indices, predictions = get_mistakes(clf=logistic, X_q1q2=X_tr_q1q2, y=y_train)

In [33]:
def print_mistake_k(k, mistake_indices, predictions, df=None):
    """
    Print the k-th misclassified question pair with its true and predicted class.

    Parameters
    ----------
    k : position inside `mistake_indices` of the mistake to show.
    mistake_indices : positional row indices of the misclassified samples.
    predictions : model predictions, indexed by the same row positions.
    df : dataframe with "question1", "question2" and "is_duplicate" columns
        that the indices refer to. Defaults to the notebook-level `train_df`,
        which keeps existing three-argument calls working unchanged.
    """
    if df is None:
        # Backward-compatible default: the global training partition.
        df = train_df
    position = mistake_indices[k]
    row = df.iloc[position]  # look the row up once instead of three times
    print(row.question1)
    print(row.question2)
    print("true class:", row.is_duplicate)
    print("prediction:", predictions[position])

In [None]:
# Show the training mistake stored at position 100 of `mistake_indices`.
print_mistake_k(100, mistake_indices, predictions)

In [None]:
# Show the training mistake stored at position 12 of `mistake_indices`.
print_mistake_k(12, mistake_indices, predictions)

In [None]:
# w1 w2 ... wn
# v1 v2 ... vm

In [None]:
# w1vec = word2vec.transform(w1)
# len(w1vec) = 256

# w2vec = word2vec.transform(w2)
# len(w2vec) = 256

# EmbA  = w1vec + ... + wnvec -> 256
# EmbB  = v1vec + ... + vmvec -> 256
# concat(256, 256)            -> 512
# word2vec feature: d(EmbA, EmbB)

# Improving feature vectors:

Build a similarity metric between documents using the tf-idf vectors of the documents.

Build a class from scratch that can learn a tf-idf feature representation.