# Google Quest QA Labeling

## 1. Data retrieval

### 1.1. Import modules

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from matplotlib_venn import venn2


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelBinarizer

# TODO in the future
# from sklearn.linear_model import MultiTaskElasticNet

import json
import requests
import sys

# import tensorflow_hub as hub
# from transformers import DistilBertTokenizer, DistilBertModel
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

from sklearn.model_selection import KFold
from scipy.stats import spearmanr
from sklearn.linear_model import MultiTaskElasticNet
import tensorflow as tf
import torch
from keras.callbacks import Callback
from keras.optimizers import Adam
from keras.models import Model, Sequential
from keras.layers import LSTM, Input, Dense, Dropout


%matplotlib inline

#### Offline models download

In [None]:
# Install sacremoses and transformers from the source trees attached under ../input
# (local paths are passed to pip instead of package names), then list the attached inputs.
# NOTE(review): the import cell above already imports `transformers`; confirm the
# intended execution order of these two cells.
!pip install ../input/sacremoses/sacremoses-master/
!pip install ../input/transformers/transformers-master/
!ls ../input

# Also add bert-base-uncased (pytorch_model.bin, vocab.txt, config.json) to the input dir (Add data).
# NOTE(review): Feature_Engineering loads ../input/bertlargeuncased/bert-large-uncased/ —
# confirm which checkpoint actually has to be attached.
# Put the attached transformers sources first on the module search path.
sys.path.insert(0, "../input/transformers/transformers-master/")

### 1.2. Reading Data

In [None]:
# Load the train table, the test table and the sample submission from the
# google-quest-challenge input directory.
train_set = pd.read_csv('../input/google-quest-challenge/train.csv')
X_test = pd.read_csv('../input/google-quest-challenge/test.csv')
sample_submission = pd.read_csv('../input/google-quest-challenge/sample_submission.csv')
# Show the test frame as the cell output.
X_test

In [None]:
# Split the training table column-wise at the first target column:
# everything before 'question_asker_intent_understanding' is a feature,
# that column and everything after it is a target.
# Explicit iloc slices are used instead of np.split, which on a DataFrame goes
# through the deprecated DataFrame.swapaxes. The .copy() calls make both parts
# independent frames so later column assignments do not act on a slice.
target_start = train_set.columns.get_loc('question_asker_intent_understanding')
train_set = [
    train_set.iloc[:, :target_start].copy(),
    train_set.iloc[:, target_start:].copy(),
]
X_train = train_set[0]
X_train

In [None]:
# Targets: the 'qa_id' column of X_train followed by the columns split off into train_set[1].
# NOTE(review): 'qa_id' stays inside Y_train, and create_model() and both training loops
# size/fit against every Y_train column, so the id appears to be learned as a target —
# confirm whether that is intended.
Y_train = pd.concat([X_train['qa_id'], train_set[1]], axis=1)
Y_train

## 2. Exploratory Data Analysis

### 2.1. Data Description

In [None]:
X_train.info()

In [None]:
X_train.describe()

In [None]:
X_test.info()

In [None]:
X_test.describe()

In [None]:
# Report the (rows, columns) shape of each frame, one line per frame.
for frame_name, frame in (('X_train', X_train), ('Y_train', Y_train), ('X_test', X_test)):
    print(f'Size of {frame_name}', frame.shape)

### 2.2. Host distribution visualization

In [None]:
# Number of rows per distinct 'host' value, computed separately for train and test.
X_train_host_dist = X_train["host"].value_counts()
X_test_host_dist = X_test["host"].value_counts()

In [None]:
def host_distribution(distribution, title):
    '''
    Show a pie chart of a value-count series.

    distribution: series whose index supplies the slice names and whose
                  values supply the slice sizes.
    title:        chart title.
    '''
    pie_options = dict(
        names=distribution.index,
        values=distribution.values,
        title=title,
        width=800,
        height=800,
    )
    chart = px.pie(**pie_options)

    # Put percentage and label inside each slice and keep the legend visible.
    chart.update_traces(textposition='inside', textinfo='percent+label')
    chart.update_layout(showlegend=True)
    chart.show()
    
host_distribution(X_train_host_dist, 'Host data distribution on train data')

In [None]:
host_distribution(X_test_host_dist, 'Host data distribution on test data')

### 2.3. Categories distribution

In [None]:
def categories_distribution(categories, title):
    '''
    Show a bar plot with the percentage share of every category.

    categories: series of category values (missing values are not counted).
    title:      plot title.
    '''
    # The axis is labelled "share in %", so convert the 0-1 fractions to
    # percentages instead of plotting raw fractions under a percent label.
    category_share = pd.DataFrame({'share': categories.value_counts(normalize=True) * 100})
    category_share['category'] = category_share.index

    fig = px.bar(category_share, x='category', y='share',
            labels={'share':'share in %'},
            title=title)

    fig.show()

In [None]:
categories_distribution(X_train['category'], 'X_train categories distribution barplot')

In [None]:
categories_distribution(X_test['category'], 'X_test categories distribution barplot')

### 2.4. Common features in training and testing set (Venn diagrams)

In [None]:
def venn_diagrams(columns, plot_num, title):
    '''
    Draw a two-set Venn diagram of the distinct values of two columns.

    columns:  sequence whose first element is the train column and whose
              second element is the test column.
    plot_num: subplot position code handed to plt.subplot.
    title:    plot title.
    '''
    plt.subplot(plot_num)
    train_values = set(columns[0].unique())
    test_values = set(columns[1].unique())
    venn2([train_values, test_values], set_labels = ('Train set', 'Test set'))
    plt.title(title)
    plt.show()
    
# Overlap of user names and question titles between the train and test sets.
# NOTE(review): venn_diagrams calls plt.show() after every diagram, so the codes
# 121 and 131 presumably place the 2nd/3rd diagram in the first cell of a 1x2 / 1x3
# grid of a fresh figure rather than side by side — confirm whether 111 was intended.
venn_diagrams([X_train['question_user_name'], X_test['question_user_name']], 111, 'Common question_user_name in training and test data')
venn_diagrams([X_train['answer_user_name'], X_test['answer_user_name']], 121, 'Common answer_user_name in training and test data')
venn_diagrams([X_train['question_title'], X_test['question_title']], 131, 'Common question_title in training and test data')

### 2.5. Length of contents

In [None]:
def distribution_imposition(first_col, second_col, title, labels=('Train set', 'Test set')):
    '''
    Overlay the string-length distributions of two text columns on one plot.

    first_col:  first text column (a series supporting .str.len()).
    second_col: second text column.
    title:      plot title.
    labels:     legend entries for first_col and second_col; the default
                matches the set labels used by the Venn diagrams.
    '''
    plt.figure(figsize=(20, 6))
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # switch to histplot/displot once the environment's seaborn supports them.
    sns.distplot(first_col.str.len(), label=labels[0])
    sns.distplot(second_col.str.len(), label=labels[1])
    plt.title(title)
    # Without a legend the two overlaid distributions cannot be told apart.
    plt.legend()
    plt.show()
        
distribution_imposition(X_train['question_title'], X_test['question_title'], 'Length of Question Title represented as cumulative distribution plots')

In [None]:
distribution_imposition(X_train['question_body'], X_test['question_body'], 'Length of Question Body represented as cumulative distribution plots')

In [None]:
# Length distribution of the 'answer' column; the title previously repeated
# "Question Body" from the cell above.
distribution_imposition(X_train['answer'], X_test['answer'], 'Length of Answer represented as cumulative distribution plots')

### 2.6. Most popular questions

In [None]:
# Count rows per question_title (via the non-null 'qa_id' count) and show the
# 25 titles with the most rows.
X_train.groupby('question_title').count()['qa_id'].sort_values(ascending=False).head(25)

In [None]:
X_train[X_train['question_title'] == 'What is the best introductory Bayesian statistics textbook?']

## 3. Feature Engineering

In [None]:
class Feature_Engineering(object):
    '''
    Helper class used for feature engineering on a single dataframe.

    It bundles text cleaning (lower-casing, character stripping, shortcut
    expansion), TF-IDF + truncated SVD vectors, label binarization and
    BERT-based vectors. The cleaning and splitting methods write their
    result back into ``self.dataframe``.
    '''
    def __init__(self, dataframe):
        '''
        Keep a reference to ``dataframe`` and build the helper objects.

        Reads ``../input/charset/charset.json`` (its ``CHARS`` and
        ``SHORTCUTS`` entries drive the text cleaning) and loads the BERT
        tokenizer and model from ``../input/bertlargeuncased/bert-large-uncased/``.
        '''
        self.dataframe = dataframe
        with open("../input/charset/charset.json", encoding="utf8") as json_file:
            self.charset = json.load(json_file)

        self.vectorizer = TfidfVectorizer(ngram_range=(1, 3))
        self.tsvd = TruncatedSVD(n_components=128, n_iter=5)

        self.tokenizer = BertTokenizer.from_pretrained('../input/bertlargeuncased/bert-large-uncased/')
        self.model = BertModel.from_pretrained('../input/bertlargeuncased/bert-large-uncased/')

        self.binarizer = LabelBinarizer()

    def _strip_chars(self, text):
        '''Return ``text`` with every entry of ``charset['CHARS']`` removed.'''
        for char in self.charset['CHARS']:
            text = text.replace(char, '')
        return text

    def _expand_shortcuts(self, text):
        '''Return ``text`` with every whitespace-separated word passed through find_and_replace.'''
        return ' '.join(self.find_and_replace(word) for word in text.split())

    def unconstrained_chars(self, column, index):
        '''Return the cell ``dataframe[column][index]`` with the ``CHARS`` entries removed.'''
        return self._strip_chars(self.dataframe[column][index])

    def shortcuts_removal(self, column, index):
        '''Return the cell ``dataframe[column][index]`` with known shortcuts expanded.'''
        return self._expand_shortcuts(self.dataframe[column][index])

    def lower_case(self, column):
        '''Return ``column`` lower-cased (the dataframe itself is not modified).'''
        return self.dataframe[column].str.lower()

    def find_and_replace(self, word):
        '''Return the ``SHORTCUTS`` replacement for ``word``, or ``word`` itself if there is none.'''
        return self.charset['SHORTCUTS'].get(word, word)

    def flow(self, column):
        '''
        Clean ``column`` in place and return it.

        Every value is lower-cased, stripped of the ``CHARS`` entries and has
        its shortcuts expanded, in that order. Missing values are left as is.
        '''
        # Build the cleaned column in one pass and assign it as a whole:
        # cell-by-cell chained assignment (df[col][i] = ...) is not guaranteed
        # to write through to the dataframe and only works on a 0..n-1 index.
        cleaned = self.lower_case(column).map(
            lambda text: self._expand_shortcuts(self._strip_chars(text)),
            na_action='ignore',
        )
        self.dataframe[column] = cleaned
        return self.dataframe[column]

    def tfidf_vec(self, column):
        '''
        Return one truncated-SVD-reduced TF-IDF vector per row of ``column``.

        NOTE(review): vectorizer and SVD are re-fitted on this instance's
        dataframe at every call, so vectors produced by two instances (e.g.
        one per data split) do not share a vocabulary or SVD basis.
        '''
        return list(self.tsvd.fit_transform(self.vectorizer.fit_transform(self.dataframe[column].values)))

    def binarize(self, column, other_df):
        '''
        Return one binarized label vector per row of ``column``.

        The binarizer is fitted on the labels of this dataframe and of
        ``other_df`` together, so that a given position means the same label
        in both frames and both frames get vectors of the same width.
        Fitting each frame on its own and zero-padding the narrower one
        leaves the positions misaligned between the frames.
        '''
        own_labels = self.dataframe[column].values
        self.binarizer.fit(np.concatenate([own_labels, other_df[column].values]))
        return list(self.binarizer.transform(own_labels))

    def bert_separators(self, column):
        '''Replace every string of ``column`` by its list of ``'.'``-separated parts and return the column.'''
        # Whole-column assignment for the same reason as in flow().
        self.dataframe[column] = self.dataframe[column].map(
            lambda text: text.split('.'),
            na_action='ignore',
        )
        return self.dataframe[column]

    def model_conf(self):
        '''Move the BERT model to the CPU.'''
        self.model.cpu()

    def make_vectors(self, column):
        '''
        Return one BERT-based vector per row of ``column``.

        Each value is cut with ``.str.slice(0, 500)``, encoded with the
        tokenizer and run through the model; the vector is the maximum of the
        model's first output over axis 1. Rows the model fails on get an
        all-zero vector of the same shape.

        NOTE(review): when called after bert_separators the values are lists,
        so the slice keeps the first 500 list items (not characters) and the
        tokenizer receives a list — confirm this is the intended input.
        '''
        ids = self.dataframe[column].str.slice(0, 500).apply(self.tokenizer.encode)
        hidden_size = self.model.config.hidden_size
        vectors = []

        for token_ids in tqdm(ids):
            input_ids = torch.tensor(token_ids, dtype=torch.int64).unsqueeze(0)
            try:
                # Inference only: skip building the autograd graph.
                with torch.no_grad():
                    outputs = self.model(input_ids.cpu())
                vectors.append(outputs[0].detach().cpu().numpy().max(axis=1))
            except Exception:
                # Best effort: keep the row with a zero vector. The shape is
                # taken from the model config, not from a previous output
                # that may not exist.
                vectors.append(np.zeros((1, hidden_size), dtype=np.float32))

        return vectors

### 3.1. Data Cleaning

In [None]:
train_features = Feature_Engineering(X_train)
test_features = Feature_Engineering(X_test)

# Clean the three free-text columns, train frame first, then the test frame.
for frame, features in ((X_train, train_features), (X_test, test_features)):
    for text_column in ('question_title', 'question_body', 'answer'):
        frame[text_column] = features.flow(text_column)

# The page/url columns are not used as features.
for frame in (X_train, X_test):
    frame.drop(columns=['question_user_page', 'answer_user_page', 'url'], inplace=True)

### 3.2. TF-IDF vectors of the text columns (reduced with truncated SVD)

In [None]:
# One '<column>_tfidf_vec' column per free-text column, train frame first, then test.
for frame, features in ((X_train, train_features), (X_test, test_features)):
    for text_column in ('question_title', 'question_body', 'answer'):
        frame[f'{text_column}_tfidf_vec'] = features.tfidf_vec(text_column)

### 3.3. Encoding categorical features

In [None]:
# X_train['category_vec'] = list(LabelBinarizer().fit_transform(X_train['category'].values))
# X_train['question_user_name_vec'] = list(LabelBinarizer().fit_transform(X_train['question_user_name'].values))
# X_train['answer_user_name_vec'] = list(LabelBinarizer().fit_transform(X_train['answer_user_name'].values))
# X_train['host_vec'] = list(LabelBinarizer().fit_transform(X_train['host'].values))

# diff = abs(len(X_test['question_user_name'].value_counts()) - len(X_train['question_user_name'].value_counts()))
# d = np.zeros((X_test.shape[0], diff))
# a = list(LabelBinarizer().fit_transform(X_train['question_user_name'].values))
# b = list(LabelBinarizer().fit_transform(X_test['question_user_name'].values))
# a = np.array(a)
# c = np.concatenate([b, d], axis=1)
# print(np.array(a).shape)
# print(np.array(b).shape)
# print(np.array(c).shape)

# One '<column>_vec' column per categorical column; each frame is binarized
# with the opposite frame passed as reference, train frame first.
for frame, features, other_frame in (
    (X_train, train_features, X_test),
    (X_test, test_features, X_train),
):
    for label_column in ('category', 'question_user_name', 'answer_user_name', 'host'):
        frame[f'{label_column}_vec'] = features.binarize(label_column, other_frame)

In [None]:
X_train

In [None]:
X_test

### 3.4. Bidirectional Encoder Representations from Transformers (BERT)

In [None]:
# Place the BERT model of each helper on the device chosen in model_conf().
train_features.model_conf()
test_features.model_conf()

In [None]:
# Split every free-text column on '.', train frame first, then the test frame.
for frame, features in ((X_train, train_features), (X_test, test_features)):
    for text_column in ('question_title', 'question_body', 'answer'):
        frame[text_column] = features.bert_separators(text_column)

In [None]:
# BERT-based vectors (one list entry per row) for each free-text column of the train frame...
question_title_vectors_train = train_features.make_vectors("question_title")
question_body_vectors_train = train_features.make_vectors("question_body")
answer_vectors_train = train_features.make_vectors("answer")

# ...and of the test frame.
question_title_vectors_test = test_features.make_vectors("question_title")
question_body_vectors_test = test_features.make_vectors("question_body")
answer_vectors_test = test_features.make_vectors("answer")

In [None]:

# Replace the X_train dataframe by one 2-D feature matrix: the per-row vectors of
# every feature group are stacked and the groups are joined column-wise (axis=1).
# The BERT vector lists are indexed with [:, 0, :] to drop their singleton middle axis.
# The user-name one-hot groups are currently left out.
X_train = np.concatenate([
                     np.vstack(X_train['category_vec']),
                     np.vstack(X_train['host_vec']),
#                      np.vstack(X_train['question_user_name_vec']),
#                      np.vstack(X_train['answer_user_name_vec']),
                     np.array(question_title_vectors_train)[:,0,:],
                     np.array(question_body_vectors_train)[:,0,:],
                     np.array(answer_vectors_train)[:,0,:],
                     np.vstack(X_train['question_title_tfidf_vec']),
                     np.vstack(X_train['question_body_tfidf_vec']),
                     np.vstack(X_train['answer_tfidf_vec'])
                     ], axis = 1)



In [None]:
                     
# Replace the X_test dataframe by one 2-D feature matrix, using the same feature
# groups in the same order as for X_train; groups are joined column-wise (axis=1).
# The BERT vector lists are indexed with [:, 0, :] to drop their singleton middle axis.
X_test = np.concatenate([
                     np.vstack(X_test['category_vec']),
                     np.vstack(X_test['host_vec']),
#                      np.vstack(X_test['question_user_name_vec']),
#                      np.vstack(X_test['answer_user_name_vec']),
                     np.array(question_title_vectors_test)[:,0,:],
                     np.array(question_body_vectors_test)[:,0,:],
                     np.array(answer_vectors_test)[:,0,:],
                     np.vstack(X_test['question_title_tfidf_vec']),
                     np.vstack(X_test['question_body_tfidf_vec']),
                     np.vstack(X_test['answer_tfidf_vec'])
                     ], axis = 1)

In [None]:
pd.DataFrame(X_train)

In [None]:
pd.DataFrame(X_test)

In [None]:
Y_train

## 4. Modeling and training

In [None]:
class SpearmanCallback(Callback):
    '''
    Keras callback that calculates and displays the mean column-wise
    Spearman rho on a validation set after each epoch.
    '''
    def __init__(self, validation_data, model_name):
        '''
        validation_data: (x_val, y_val) pair used for the metric.
        model_name:      file name kept on the instance (currently only stored).
        '''
        # Let the base class set up the attributes Keras expects on a callback.
        super().__init__()
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

        self.model_name = model_name

    def on_epoch_end(self, epoch, logs=None):
        '''
        Predict on the validation inputs, print and return the mean Spearman
        rho over all target columns.
        '''
        y_pred_val = np.asarray(self.model.predict(self.x_val))

        # Targets live on the LAST axis; axis 1 is the singleton time axis, so
        # iterating over shape[1] would only ever score the first column.
        n_targets = y_pred_val.shape[-1]
        y_true = np.asarray(self.y_val).reshape(-1, n_targets)
        y_pred = y_pred_val.reshape(-1, n_targets)

        # nanmean: spearmanr yields NaN for a constant column, which would
        # otherwise turn the whole metric into NaN.
        rho_val = np.nanmean([
            spearmanr(y_true[:, ind], y_pred[:, ind]).correlation
            for ind in range(n_targets)
        ])

        #self.model.save_weights(self.model_name)
        print('validation rho spearman callback value: {}'.format(rho_val))
        return rho_val

In [None]:
def create_model():
    '''
    Build, compile and summarize the network used for every fold.

    The input is a one-step sequence with X_train.shape[1] features; the
    output has one sigmoid unit per Y_train column. Returns the compiled model.
    '''
    n_features = X_train.shape[1]
    n_outputs = Y_train.shape[1]

    inputs = Input(shape=(1, n_features))
    hidden = LSTM(128, dropout=0.2, return_sequences=True)(inputs)
    hidden = Dense(128, activation='elu')(hidden)
    hidden = Dropout(0.2)(hidden)
    outputs = Dense(n_outputs, activation='sigmoid')(hidden)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(
        optimizer=Adam(), # to check
        loss=['binary_crossentropy'] #'mean_squared_error'
    )
    network.summary()
    return network

In [None]:
all_predictions = []

kf = KFold(n_splits=5, random_state=42, shuffle=True)

# The network takes (samples, 1, features): every row becomes a one-step sequence.
X_test_tmp = X_test.reshape(-1, 1, X_test.shape[1])

for ind, (train, validation) in enumerate(kf.split(X_train)):
    # Slice the fold and add the singleton step axis in one go.
    X_train_part = X_train[train].reshape(-1, 1, X_train.shape[1])
    Y_train_part = np.array(Y_train.iloc[train, :]).reshape(-1, 1, Y_train.shape[1])
    X_validate_part = X_train[validation].reshape(-1, 1, X_train.shape[1])
    Y_validate_part = np.array(Y_train.iloc[validation, :]).reshape(-1, 1, Y_train.shape[1])

    model = create_model()

    fold_validation = (X_validate_part, Y_validate_part)
    spearman_logger = SpearmanCallback(validation_data=fold_validation, model_name=f'best_model_batch{ind}.h5')

    model.fit(
        X_train_part,
        Y_train_part,
        epochs=100,
        batch_size=32,
        validation_data=fold_validation,
        verbose=True,
        callbacks=[spearman_logger],
    )
    # Keep this fold's test predictions for the final averaging.
    all_predictions.append(model.predict(X_test_tmp))

In [None]:
# Second model family: a MultiTaskElasticNet fitted on the training part of each of
# the same 5 folds (same seed as above); it works on the 2-D matrices directly, so
# X_test is used rather than the reshaped X_test_tmp.
# The per-fold test predictions are collected in pred_mt.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

pred_mt = []
# The validation indices (val) are not used in this loop.
for ind, (train, val) in enumerate(kf.split(X_train)):
    X_train_part= X_train[train]
    Y_train_part = Y_train.iloc[train, :]

    model = MultiTaskElasticNet(alpha=0.001)
    model.fit(X_train_part, Y_train_part)
    pred_mt.append(model.predict(X_test))

In [None]:
all_pred_stash = all_predictions
all_pred_stash

In [None]:
# Drop the singleton axis 2 of the Keras fold predictions and join them with the
# MultiTaskElasticNet fold predictions along the first (per-model) axis.
all_predictions = np.concatenate([np.array(all_predictions)[:,:,0,:], np.array(pred_mt)])

### 4.1. Averaging predictions

In [None]:
# Average element-wise over the first axis (one entry per fitted model), giving
# one row per test sample and one column per predicted output.
# The previous version indexed columns with `num * valid_pred`, which picks the
# wrong columns, and wrote into duplicated column labels while still reading
# from them; the mean over the model axis needs no hard-coded model or column count.
final = pd.DataFrame(np.mean(np.asarray(all_predictions), axis=0))

### 4.2. Submitting

In [None]:
# Name the prediction columns after the sample submission's columns (matched by position).
final.columns = sample_submission.columns

In [None]:
# Overwrite the 'qa_id' column with the ids from the sample submission.
final['qa_id'] = sample_submission['qa_id']

In [None]:
final

In [None]:
# Write the submission file without the dataframe index.
final.to_csv('submission.csv', index=False)