https://www.kaggle.com/abhishek/distilbert-use-features-oof

In [1]:
!pip install ../input/sacremoses/sacremoses-master/ > /dev/null

import os
import sys
import glob
import torch

sys.path.insert(0, "../input/transformers/transformers-master/")
import transformers
import numpy as np
import pandas as pd
import math

from tqdm import tqdm

In [2]:
pd.options.display.max_columns=999

In [3]:
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [4]:
DEVICE = torch.device("cuda")
tokenizer = transformers.DistilBertTokenizer.from_pretrained("../input/distilbertbaseuncased/")
model = transformers.DistilBertModel.from_pretrained("../input/distilbertbaseuncased/")
model.to(DEVICE)

'''
class new_model():
    def __init__(self):
        self.basemodel = transformers.DistilBertModel.from_pretrained("../input/distilbertbaseuncased/")
        self.pooling = torch.nn.A
'''

'\nclass new_model():\n    def __init__(self):\n        self.basemodel = transformers.DistilBertModel.from_pretrained("../input/distilbertbaseuncased/")\n        self.pooling = torch.nn.A\n'

In [5]:
sample_string = "What am I losing when using extension tubes instead of a macro lens?"
tokenized = []
x = " ".join(sample_string.strip().split()[:300])
tok = tokenizer.encode(x, add_special_tokens=True)
tokenized_text = tokenizer.tokenize(x)
tokenized.append(tok[:512])
print (tokenized)
print (tokenized_text)
print (len(tokenized_text), len(tokenized[0]))

[[101, 2054, 2572, 1045, 3974, 2043, 2478, 5331, 10868, 2612, 1997, 1037, 26632, 10014, 1029, 102]]
['what', 'am', 'i', 'losing', 'when', 'using', 'extension', 'tubes', 'instead', 'of', 'a', 'macro', 'lens', '?']
14 16


In [6]:
max_len = 512
padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized])
attention_mask = np.where(padded != 0, 1, 0)
input_ids = torch.tensor(padded).to(DEVICE)
attention_mask = torch.tensor(attention_mask).to(DEVICE)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [7]:
np.hstack([last_hidden_states[0][:,0,:].cpu().numpy(),last_hidden_states[0].cpu().numpy().mean(axis=1)]).shape

(1, 1536)

In [8]:
def fetch_vectors(string_list, batch_size=64):
    # inspired by https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/
    DEVICE = torch.device("cuda")
    tokenizer = transformers.DistilBertTokenizer.from_pretrained("../input/distilbertbaseuncased/")
    model = transformers.DistilBertModel.from_pretrained("../input/distilbertbaseuncased/")
    model.to(DEVICE)

    fin_features = []
    for data in tqdm(chunks(string_list, batch_size)):
        tokenized = []
        all_lengths = []
        for x in data:
            x = " ".join(x.strip().split()[:300])
            tok = tokenizer.encode(x, add_special_tokens=True)
            all_lengths.append(len(tok))
            tokenized.append(tok[:512])

        max_len = 512
        padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized])
        attention_mask = np.where(padded != 0, 1, 0)
        input_ids = torch.tensor(padded).to(DEVICE)
        attention_mask = torch.tensor(attention_mask).to(DEVICE)

        with torch.no_grad():
            last_hidden_states = model(input_ids, attention_mask=attention_mask)

        features1 = last_hidden_states[0][:, 0, :].cpu().numpy()
        features2 = last_hidden_states[0].cpu().numpy().mean(axis=1)
        features3 = np.array([last_hidden_states[0].cpu().numpy()[i,:all_lengths[i],:].mean(axis=0) for i in range(len(all_lengths))])
        features = np.hstack([features1,features2, features3])
        fin_features.append(features)

    fin_features = np.vstack(fin_features)
    return fin_features

In [9]:
df_train = pd.read_csv("../input/google-quest-challenge/train.csv").fillna("none")
df_test = pd.read_csv("../input/google-quest-challenge/test.csv").fillna("none")

sample = pd.read_csv("../input/google-quest-challenge/sample_submission.csv")
target_cols = list(sample.drop("qa_id", axis=1).columns)

train_question_title_dense = fetch_vectors(df_train.question_title.values)
train_question_body_dense = fetch_vectors(df_train.question_body.values)
train_answer_dense = fetch_vectors(df_train.answer.values)

test_question_title_dense = fetch_vectors(df_test.question_title.values)
test_question_body_dense = fetch_vectors(df_test.question_body.values)
test_answer_dense = fetch_vectors(df_test.answer.values)


95it [09:04,  5.73s/it]
95it [09:36,  6.06s/it]
95it [09:33,  6.03s/it]
8it [00:41,  5.20s/it]
8it [00:42,  5.31s/it]
8it [00:42,  5.30s/it]


In [10]:
print (train_question_title_dense.shape, train_question_body_dense.shape, train_answer_dense.shape, test_question_title_dense.shape, test_question_body_dense.shape, test_answer_dense.shape)

(6079, 2304) (6079, 2304) (6079, 2304) (476, 2304) (476, 2304) (476, 2304)


In [11]:
import os
import re
import gc
import pickle  
import random
import keras

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import keras.backend as K

from keras.models import Model
from keras.layers import Dense, Input, Dropout, Lambda, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import Callback, ModelCheckpoint
from scipy.stats import spearmanr, rankdata
from os.path import join as path_join
from numpy.random import seed
from urllib.parse import urlparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import MultiTaskElasticNet

seed(42)
tf.random.set_seed(42)
random.seed(42)

Using TensorFlow backend.


In [12]:
data_dir = '../input/google-quest-challenge/'
train = pd.read_csv(path_join(data_dir, 'train.csv'))
test = pd.read_csv(path_join(data_dir, 'test.csv'))
print(train.shape, test.shape)
train.head()

(6079, 41) (476, 11)


Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,photo.stackexchange.com,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,rpg.stackexchange.com,1.0,1.0,0.0,0.5,1.0,1.0,0.444444,0.444444,0.666667,0.0,0.0,0.666667,0.666667,0.0,0.333333,0.0,0.0,0.0,0.333333,0.0,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,electronics.stackexchange.com,0.888889,0.666667,0.0,1.0,1.0,1.0,0.666667,0.444444,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.333333,0.0,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,judaism.stackexchange.com,0.888889,0.666667,0.666667,1.0,1.0,1.0,0.444444,0.444444,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,graphicdesign.stackexchange.com,1.0,0.666667,0.0,1.0,1.0,1.0,0.666667,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


In [13]:
targets = [
        'question_asker_intent_understanding',
        'question_body_critical',
        'question_conversational',
        'question_expect_short_answer',
        'question_fact_seeking',
        'question_has_commonly_accepted_answer',
        'question_interestingness_others',
        'question_interestingness_self',
        'question_multi_intent',
        'question_not_really_a_question',
        'question_opinion_seeking',
        'question_type_choice',
        'question_type_compare',
        'question_type_consequence',
        'question_type_definition',
        'question_type_entity',
        'question_type_instructions',
        'question_type_procedure',
        'question_type_reason_explanation',
        'question_type_spelling',
        'question_well_written',
        'answer_helpful',
        'answer_level_of_information',
        'answer_plausible',
        'answer_relevance',
        'answer_satisfaction',
        'answer_type_instructions',
        'answer_type_procedure',
        'answer_type_reason_explanation',
        'answer_well_written'    
    ]

input_columns = ['question_title', 'question_body', 'answer']

> # Features

In [14]:
find = re.compile(r"^[^.]*")

train['netloc'] = train['url'].apply(lambda x: re.findall(find, urlparse(x).netloc)[0])
test['netloc'] = test['url'].apply(lambda x: re.findall(find, urlparse(x).netloc)[0])

features = ['netloc', 'category']
merged = pd.concat([train[features], test[features]])
ohe = OneHotEncoder()
ohe.fit(merged)

features_train = ohe.transform(train[features]).toarray()
features_test = ohe.transform(test[features]).toarray()

In [15]:
module_url = "../input/universalsentenceencoderlarge4/"
embed = hub.load(module_url)

In [16]:
embeddings_train = {}
embeddings_test = {}
for text in input_columns:
    print(text)
    train_text = train[text].str.replace('?', '.').str.replace('!', '.').tolist()
    test_text = test[text].str.replace('?', '.').str.replace('!', '.').tolist()
    
    curr_train_emb = []
    curr_test_emb = []
    batch_size = 4
    ind = 0
    while ind*batch_size < len(train_text):
        curr_train_emb.append(embed(train_text[ind*batch_size: (ind + 1)*batch_size])["outputs"].numpy())
        ind += 1
        
    ind = 0
    while ind*batch_size < len(test_text):
        curr_test_emb.append(embed(test_text[ind*batch_size: (ind + 1)*batch_size])["outputs"].numpy())
        ind += 1    
        
    embeddings_train[text + '_embedding'] = np.vstack(curr_train_emb)
    embeddings_test[text + '_embedding'] = np.vstack(curr_test_emb)
    
del embed
K.clear_session()
gc.collect()

question_title
question_body
answer


302250

In [17]:
l2_dist = lambda x, y: np.power(x - y, 2).sum(axis=1)

cos_dist = lambda x, y: (x*y).sum(axis=1)

dist_features_train = np.array([
    l2_dist(embeddings_train['question_title_embedding'], embeddings_train['answer_embedding']),
    l2_dist(embeddings_train['question_body_embedding'], embeddings_train['answer_embedding']),
    l2_dist(embeddings_train['question_body_embedding'], embeddings_train['question_title_embedding']),
    cos_dist(embeddings_train['question_title_embedding'], embeddings_train['answer_embedding']),
    cos_dist(embeddings_train['question_body_embedding'], embeddings_train['answer_embedding']),
    cos_dist(embeddings_train['question_body_embedding'], embeddings_train['question_title_embedding'])
]).T

dist_features_test = np.array([
    l2_dist(embeddings_test['question_title_embedding'], embeddings_test['answer_embedding']),
    l2_dist(embeddings_test['question_body_embedding'], embeddings_test['answer_embedding']),
    l2_dist(embeddings_test['question_body_embedding'], embeddings_test['question_title_embedding']),
    cos_dist(embeddings_test['question_title_embedding'], embeddings_test['answer_embedding']),
    cos_dist(embeddings_test['question_body_embedding'], embeddings_test['answer_embedding']),
    cos_dist(embeddings_test['question_body_embedding'], embeddings_test['question_title_embedding'])
]).T

Generate textual features and topic modelling features

In [18]:
import gensim
from nltk.corpus import stopwords

from scipy.stats import skew, kurtosis, spearmanr
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
stop_words = stopwords.words('english')

from collections import Counter
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer

import string

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
import xgboost as xgb

In [19]:
def wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)


def norm_wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return norm_model.wmdistance(s1, s2)


def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())


In [20]:
data = pd.concat([train.drop(targets,axis=1),test],axis=0)
print (data.shape)

(6555, 12)


In [21]:
data['len_q1'] = data.question_title.apply(lambda x: len(str(x)))
data['len_q2'] = data.question_body.apply(lambda x: len(str(x)))
data['len_a'] = data.answer.apply(lambda x: len(str(x)))
data['diff_len_q'] = data.len_q2 - data.len_q1
data['diff_len_q_frac'] = data['diff_len_q']/data.len_q2

data['diff_len_a1'] = data.len_a - data.len_q1
data['diff_len_a2'] = data.len_a - data.len_q2
data['diff_len_frac_a2'] = data['diff_len_a2']/data['len_a']

data['len_word_q1'] = data.question_title.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question_body.apply(lambda x: len(str(x).split()))
data['len_word_frac_q2'] = data['len_word_q1']/data['len_word_q2']
data['len_word_a'] = data.answer.apply(lambda x: len(str(x).split()))
data['len_word_frac_a'] = data['len_word_q2']/data['len_word_a']

data['common_words_q'] = data.apply(lambda x: len(set(str(x['question_title']).lower().split()).intersection(set(str(x['question_body']).lower().split()))), axis=1)
data['common_words_frac_q'] = data['common_words_q']/data.len_word_q1
data['common_words_frac2_q'] = data['common_words_q']/data.len_word_q2
data['common_words_a1'] = data.apply(lambda x: len(set(str(x['question_title']).lower().split()).intersection(set(str(x['answer']).lower().split()))), axis=1)
data['common_words_a2'] = data.apply(lambda x: len(set(str(x['answer']).lower().split()).intersection(set(str(x['question_body']).lower().split()))), axis=1)
data['common_words_frac_a2'] = data['common_words_a2']/data['len_word_a']
data['common_words_frac2_a2'] = data['common_words_a2']/data['len_word_q2']

In [22]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
st = PorterStemmer()
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

def clean(data):
    tokens = tokenizer.tokenize(data.lower())
    stop_free = " ".join([st.stem(i) for i in tokens if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
    
def processSingleReview(review, d=None):
    """
    Convert a raw review to a string of words
    """
    letters_only = re.sub("[^a-zA-Z]", " ", review)
    words = tokenizer.tokenize(letters_only.lower())
    stops = set(stopwords.words("english"))
    meaningful_words = [st.stem(w) for w in words if w not in stops]
    meaningful_words = [w for w in meaningful_words if pos_tag([w],tagset='universal')[0][1] in ['NOUN','VERB','ADJ']] #
    return(" ".join(meaningful_words))

data['clean_question_title'] = data.apply(lambda row :clean(row['question_title']),axis=1)
data['clean_question_title'] = data.apply(lambda row: re.sub(r'\d+', '',row['clean_question_title']),axis=1)

data['clean_question_body'] = data.apply(lambda row :clean(row['question_body']),axis=1)
data['clean_question_body'] = data.apply(lambda row: re.sub(r'\d+', '',row['clean_question_body']),axis=1)

data['clean_answer'] = data.apply(lambda row :clean(row['answer']),axis=1)
data['clean_answer'] = data.apply(lambda row: re.sub(r'\d+', '',row['clean_answer']),axis=1)

stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
st = PorterStemmer()
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

def clean(data):
    tokens = tokenizer.tokenize(data.lower())
    stop_free = " ".join([st.stem(i) for i in tokens if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
    
def processSingleReview(review, d=None):
    """
    Convert a raw review to a string of words
    """
    letters_only = re.sub("[^a-zA-Z]", " ", review)
    words = tokenizer.tokenize(letters_only.lower())
    stops = set(stopwords.words("english"))
    meaningful_words = [st.stem(w) for w in words if w not in stops]
    meaningful_words = [w for w in meaningful_words if pos_tag([w],tagset='universal')[0][1] in ['NOUN','VERB','ADJ']] #
    return(" ".join(meaningful_words))

data['clean_question_title'] = data.apply(lambda row :clean(row['question_title']),axis=1)
data['clean_question_title'] = data.apply(lambda row: re.sub(r'\d+', '',row['clean_question_title']),axis=1)

data['clean_question_body'] = data.apply(lambda row :clean(row['question_body']),axis=1)
data['clean_question_body'] = data.apply(lambda row: re.sub(r'\d+', '',row['clean_question_body']),axis=1)

data['clean_answer'] = data.apply(lambda row :clean(row['answer']),axis=1)
data['clean_answer'] = data.apply(lambda row: re.sub(r'\d+', '',row['clean_answer']),axis=1)

data['question_title_wordlen'] = data.clean_question_title.apply(lambda x: len(x.split()))
data['question_body_wordlen'] = data.clean_question_body.apply(lambda x: len(x.split()))
data['answer_wordlen'] = data.clean_answer.apply(lambda x: len(x.split()))

In [23]:
from gensim.models.word2vec import Word2Vec

input_word2vec = data.clean_question_title.tolist() + data.clean_question_body.tolist() + data.clean_answer.tolist()
input_word2vec = [i.split() for i in input_word2vec]

print (len(input_word2vec))

model = Word2Vec(min_count=5)
model.build_vocab(input_word2vec)
model.train(input_word2vec,total_examples = model.corpus_count,epochs=15)

question1_vectors = np.zeros((data.shape[0], 100))
error_count = 0

for i, q in tqdm(enumerate(data.question_title.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors  = np.zeros((data.shape[0], 100))
for i, q in tqdm(enumerate(data.question_body.values)):
    question2_vectors[i, :] = sent2vec(q)
    
answer_vectors  = np.zeros((data.shape[0], 100))
for i, q in tqdm(enumerate(data.answer.values)):
    answer_vectors[i, :] = sent2vec(q)
    
data['cosine_distance_q'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['cosine_distance_a1'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(answer_vectors),
                                                          np.nan_to_num(question1_vectors))]

data['cosine_distance_a2'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(answer_vectors),
                                                          np.nan_to_num(question2_vectors))]


data['euclidean_distance_q'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

data['euclidean_distance_a1'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(answer_vectors),
                                                          np.nan_to_num(question1_vectors))]

data['euclidean_distance_a2'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(answer_vectors),
                                                          np.nan_to_num(question2_vectors))]


19665


6555it [00:02, 2594.30it/s]
6555it [00:22, 286.70it/s]
6555it [00:24, 264.82it/s]
  dist = 1.0 - uv / np.sqrt(uu * vv)


In [24]:
def oov_counts(x):
    count = 0
    for word in x.split():
        if word.lower() not in model.wv.vocab and word not in model.wv.vocab:
            count += 1
    return count

data["oov_count_title"] = data.clean_question_title.apply(oov_counts)
data["oov_count_body"] = data.clean_question_body.apply(oov_counts)
data["oov_count_answer"] = data.clean_answer.apply(oov_counts)

data["oov_count_title_frac"] = data["oov_count_title"]/data.question_title_wordlen
data["oov_count_body_frac"] = data["oov_count_body"]/data.question_body_wordlen
data["oov_count_answer_frac"] = data["oov_count_answer"]/data.answer_wordlen

def count_question_words(x):
    count = 0
    count += x.count("?")
    for word in x.lower().split():
        if word.startswith("wh") or word.startswith("how"):
            count += 1
    return count

data["q_count_title"] = data.question_title.apply(count_question_words)
data["q_count_body"] = data.question_body.apply(count_question_words)

data["q_count_title_frac"] = data["q_count_title"]/data.question_title_wordlen
data["q_count_body_frac"] = data["q_count_body"]/data.question_body_wordlen

In [25]:
from sklearn.decomposition import LatentDirichletAllocation
import scipy

lda1 = LatentDirichletAllocation(n_components=20)
lda2 = LatentDirichletAllocation(n_components=20)

cv1 = CountVectorizer(max_df=.7,min_df=5,max_features=50000)

answer_vector = cv1.fit_transform(data.clean_answer)
title_vector = cv1.transform(data.clean_question_title)
body_vector = cv1.transform(data.clean_question_body)

cv2 = CountVectorizer(max_df=.7,min_df=5,max_features=50000)

body_vector2 = cv2.fit_transform(data.clean_question_body)
answer_vector2 = cv2.transform(data.clean_answer)
title_vector2 = cv2.transform(data.clean_question_title)

answer_topics = lda1.fit_transform(answer_vector)
title_topics = lda1.transform(title_vector)
body_topics = lda1.transform(body_vector)

body_topics2 = lda2.fit_transform(body_vector2)
answer_topics2 = lda2.transform(answer_vector2)
title_topics2 = lda2.transform(title_vector2)


title_topic_entropy = scipy.stats.entropy(title_topics.T)
body_topic_entropy = scipy.stats.entropy(body_topics.T)
answer_topic_entropy = scipy.stats.entropy(answer_topics.T)

title_topic_entropy2 = scipy.stats.entropy(title_topics2.T)
body_topic_entropy2 = scipy.stats.entropy(body_topics2.T)
answer_topic_entropy2 = scipy.stats.entropy(answer_topics2.T)

#document_topic_entropy_len_normalized = document_topic_entropy * np.sqrt(word_len/2)
def geometric_mean(x):
    x = [i for i in x if i!=0]
    if len(x) > 0:
        return scipy.stats.mstats.gmean(x)
    else:
        return 0

def popularity(matrix):
    matrix = matrix.toarray()
    idf_matrix = (matrix > 0).astype(int)
    word_freq = idf_matrix.sum(axis=0)
    idf_matrix = idf_matrix * word_freq
    idf_matrix = idf_matrix * 1.0/idf_matrix.shape[0]

    document_popularity = np.array([geometric_mean(x) for x in idf_matrix.tolist()])
    return document_popularity

title_popularity = popularity(title_vector)
body_popularity = popularity(body_vector)
answer_popularity = popularity(answer_vector)

title_popularity2 = popularity(title_vector2)
body_popularity2 = popularity(body_vector2)
answer_popularity2 = popularity(answer_vector2)

data['title_entropy'] = title_topic_entropy
data['body_entropy'] = body_topic_entropy
data['answer_entropy'] = answer_topic_entropy

data['title_entropy2'] = title_topic_entropy2
data['body_entropy2'] = body_topic_entropy2
data['answer_entropy2'] = answer_topic_entropy2

data['title_popularity'] = title_popularity
data['body_popularity'] = body_popularity
data['answer_popularity'] = answer_popularity

data['title_popularity2'] = title_popularity2
data['body_popularity2'] = body_popularity2
data['answer_popularity2'] = answer_popularity2

#from sklearn.metrics.pairwise import manhattan_distances

def manhattan_distance(x,y):
    return np.abs(x-y).sum(axis=1)

def jaccard_distance(x,y):
    return np.abs(x-y).sum(axis=1)*1.0/np.max(np.array([x.sum(1),y.sum(1)]),axis=0)

data['manh_q'] = manhattan_distance(title_vector.toarray(),body_vector.toarray()) #scipy.spatial.distance.cdist(title_vector2.toarray(),body_vector2.toarray(),cityblock)
data['manh_a1'] = manhattan_distance(title_vector.toarray(),answer_vector.toarray())
data['manh_a2'] = manhattan_distance(body_vector.toarray(),answer_vector.toarray())

data['jac_q'] = jaccard_distance(title_vector.toarray(),body_vector.toarray()) #scipy.spatial.distance.cdist(title_vector2.toarray(),body_vector2.toarray(),cityblock)
data['jac_a1'] = jaccard_distance(title_vector.toarray(),answer_vector.toarray())
data['jac_a2'] = jaccard_distance(body_vector.toarray(),answer_vector.toarray())

data['manh_q_2'] = manhattan_distance(title_vector2.toarray(),body_vector2.toarray()) #scipy.spatial.distance.cdist(title_vector2.toarray(),body_vector2.toarray(),cityblock)
data['manh_a1_2'] = manhattan_distance(title_vector2.toarray(),answer_vector2.toarray())
data['manh_a2_2'] = manhattan_distance(body_vector2.toarray(),answer_vector2.toarray())

data['jac_q_2'] = jaccard_distance(title_vector2.toarray(),body_vector2.toarray()) #scipy.spatial.distance.cdist(title_vector2.toarray(),body_vector2.toarray(),cityblock)
data['jac_a1_2'] = jaccard_distance(title_vector2.toarray(),answer_vector2.toarray())
data['jac_a2_2'] = jaccard_distance(body_vector2.toarray(),answer_vector2.toarray())

from sklearn.decomposition import NMF

nmf1 = NMF(n_components=20)
nmf2 = NMF(n_components=20)

nmf_ans = nmf1.fit_transform(answer_vector)
nmf_title = nmf1.transform(title_vector)
nmf_body = nmf1.transform(body_vector)

nmf_body2 = nmf2.fit_transform(body_vector2)
nmf_title2 = nmf2.transform(title_vector2)
nmf_ans2 = nmf2.transform(answer_vector2)

In [26]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [27]:
for col in data:
    if data[col].isna().any():
        print (col)
        if 'popularity' in col or 'frac' in col:
            data[col] = data[col].fillna(0)
        elif 'distance' in col:
            data[col] = data[col].fillna(1)
        else:
            data[col] = data[col].fillna(-99)

len_word_frac_q2
common_words_frac2_q
common_words_frac2_a2
cosine_distance_q
cosine_distance_a1
cosine_distance_a2
oov_count_body_frac
q_count_body_frac


In [28]:
for col in data:
    if data[col].isna().any():
        print (col)

In [29]:
new_features = list(data.columns)[12:]
new_features.remove('clean_question_title')
new_features.remove('clean_question_body')
new_features.remove('clean_answer')
print (new_features)

['len_q1', 'len_q2', 'len_a', 'diff_len_q', 'diff_len_q_frac', 'diff_len_a1', 'diff_len_a2', 'diff_len_frac_a2', 'len_word_q1', 'len_word_q2', 'len_word_frac_q2', 'len_word_a', 'len_word_frac_a', 'common_words_q', 'common_words_frac_q', 'common_words_frac2_q', 'common_words_a1', 'common_words_a2', 'common_words_frac_a2', 'common_words_frac2_a2', 'question_title_wordlen', 'question_body_wordlen', 'answer_wordlen', 'cosine_distance_q', 'cosine_distance_a1', 'cosine_distance_a2', 'euclidean_distance_q', 'euclidean_distance_a1', 'euclidean_distance_a2', 'oov_count_title', 'oov_count_body', 'oov_count_answer', 'oov_count_title_frac', 'oov_count_body_frac', 'oov_count_answer_frac', 'q_count_title', 'q_count_body', 'q_count_title_frac', 'q_count_body_frac', 'title_entropy', 'body_entropy', 'answer_entropy', 'title_entropy2', 'body_entropy2', 'answer_entropy2', 'title_popularity', 'body_popularity', 'answer_popularity', 'title_popularity2', 'body_popularity2', 'answer_popularity2', 'manh_q', '

In [30]:
X_train = np.hstack([item for k, item in embeddings_train.items()] + [features_train, dist_features_train, train_question_title_dense, train_question_body_dense, train_answer_dense])
X_test = np.hstack([item for k, item in embeddings_test.items()] + [features_test, dist_features_test, test_question_title_dense, test_question_body_dense, test_answer_dense])
y_train = train[targets].values
print (X_train.shape, X_test.shape)

(6079, 8518) (476, 8518)


In [31]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

question1_vectors[np.isnan(question1_vectors)] = 0
question2_vectors[np.isnan(question2_vectors)] = 0
answer_vectors[np.isnan(answer_vectors)] = 0

new_X = np.hstack([mm.fit_transform(data[new_features]), question1_vectors, question2_vectors, answer_vectors, nmf_ans, nmf_ans2, nmf_body, nmf_body2, nmf_title, nmf_body2, body_topics, body_topics2, title_topics, title_topics2, answer_topics, answer_topics2])
#new_X_without_data_features = np.hstack([mm.fit_transform(data[new_features]), question1_vectors, question2_vectors, answer_vectors, nmf_ans, nmf_ans2, nmf_body, nmf_body2, nmf_title, nmf_body2, body_topics, body_topics2, title_topics, title_topics2, answer_topics, answer_topics2])

print (new_X.shape)

(6555, 603)


In [32]:
X_train = np.hstack((X_train, new_X[:X_train.shape[0],:]))
X_test = np.hstack((X_test, new_X[X_train.shape[0]:,:]))

print (X_train.shape, X_test.shape)

(6079, 9121) (476, 9121)


In [33]:
y_categorized = train[targets].copy()
categorization_dict = {}
for col in targets:
    keys = np.sort(train[col].unique())
    values = np.arange(len(keys))
    categorization_dict[col] = dict(zip(keys,values))
    y_categorized[col] = train[col].apply(lambda x: categorization_dict[col][x])

# Modeling

In [34]:
# Compatible with tensorflow backend
class SpearmanRhoCallback(Callback):
    def __init__(self, training_data, validation_data, patience, model_name):
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]
        
        self.patience = patience
        self.value = -1
        self.bad_epochs = 0
        self.model_name = model_name

    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred_val = self.model.predict(self.x_val)
        rho_val = np.mean([spearmanr(self.y_val[:, ind], y_pred_val[:, ind] + np.random.normal(0, 1e-7, y_pred_val.shape[0])).correlation for ind in range(y_pred_val.shape[1])])
        '''
        if len(self.y_val) == 2:
            rho_val = np.mean([spearmanr(self.y_val[:, ind], y_pred_val[:, ind] + np.random.normal(0, 1e-7, y_pred_val.shape[0])).correlation for ind in range(y_pred_val.shape[1])])
        else:
            rho_val = np.mean([spearmanr(self.y_val, y_pred_val + np.random.normal(0, 1e-7, y_pred_val.shape[0])).correlation])
        '''    
        if rho_val >= self.value:
            self.value = rho_val
            self.model.save_weights(self.model_name)
            print ("model saved {}".format(self.model_name))
        else:
            self.bad_epochs += 1
        #if self.bad_epochs >= self.patience:
        #    print("Epoch %05d: early stopping Threshold" % epoch)
        #    self.model.stop_training = True
        print('\rval_spearman-rho: %s' % (str(round(self.value, 4))), end=100*' '+'\n')
        return rho_val

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

In [35]:
def create_model():
    inps = Input(shape=(X_train.shape[1],))
    x = Dropout(0.2)(inps)
    x = Dense(512, activation='elu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = Dense(256, activation='elu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = Dense(128, activation='elu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    #x = Dense(64, activation='elu')(x)
    x = Dense(y_train.shape[1], activation='sigmoid')(x) #Dense(y_train.shape[1], activation='sigmoid')(x)
    model = Model(inputs=inps, outputs=x)
    model.compile(
        optimizer=Adam(lr=.0001),
        loss=['binary_crossentropy']
    )
    #model.summary()
    return model

In [36]:
def create_model2():
    inps = Input(shape=(X_train.shape[1],))
    x = Dense(512, activation='elu')(inps)
    x = Dropout(0.2)(x)
    x = Dense(256, activation='elu')(x)
    x = Dropout(0.2)(x)
    x = Dense(1, activation='sigmoid')(x) #Dense(y_train.shape[1], activation='sigmoid')(x)
    model = Model(inputs=inps, outputs=x)
    model.compile(metrics=['mse'],
        optimizer=Adam(lr=.0001),
        loss=['binary_crossentropy']
    )
    #model.summary()
    return model

In [37]:
model = create_model()
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 9121)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 9121)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               4670464   
_________________________________________________________________
batch_normalization_1 (Batch (None, 512)               2048      
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
batch_normalization_2 (Batch (None, 256)               1024

In [38]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [39]:
n_splits = 5

all_predictions1 = np.zeros((n_splits,X_test.shape[0],y_train.shape[1]))
oof_pred1 = np.zeros((y_train.shape[0],y_train.shape[1]))

kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)
for ind, (tr, val) in enumerate(kf.split(X_train)):
    X_tr = X_train[tr]
    y_tr = y_train[tr]
    X_vl = X_train[val]
    y_vl = y_train[val]
    
    model = create_model()
    early = EarlyStopping(monitor='val_loss', patience=20, verbose=1, mode='auto', baseline=None, restore_best_weights=False)
    lr = ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=5, verbose=1, mode='auto', min_lr=0.000001)
    rho = SpearmanRhoCallback(training_data=(X_tr, y_tr), validation_data=(X_vl, y_vl),
                                       patience=15, model_name='weights_{}.hdf5'.format(ind))
    
    model.fit(
        X_tr, y_tr, epochs=100, batch_size=32, validation_data=(X_vl, y_vl), verbose=True, 
        callbacks=[lr,rho,early]
    )
    model.load_weights('weights_{}.hdf5'.format(ind))
    
    oof_pred1[val,:] = model.predict(X_vl)
    all_predictions1[ind,:,:] = model.predict(X_test)

all_predictions1 = all_predictions1.mean(axis=0)

Train on 4863 samples, validate on 1216 samples
Epoch 1/100
model saved weights_0.hdf5
val_spearman-rho: 0.2698                                                                                                    
Epoch 2/100
model saved weights_0.hdf5
val_spearman-rho: 0.2898                                                                                                    
Epoch 3/100
model saved weights_0.hdf5
val_spearman-rho: 0.3051                                                                                                    
Epoch 4/100
model saved weights_0.hdf5
val_spearman-rho: 0.3196                                                                                                    
Epoch 5/100
model saved weights_0.hdf5
val_spearman-rho: 0.3253                                                                                                    
Epoch 6/100
model saved weights_0.hdf5
val_spearman-rho: 0.3318                                                                     

In [40]:
'''
all_predictions1 = np.zeros((X_test.shape[0],y_train.shape[1]))
oof_pred1 = np.zeros((y_train.shape[0],y_train.shape[1]))

 #KFold(n_splits=n_splits, random_state=42, shuffle=True)

for col_ind, col in enumerate(targets):
    
    if train[col].nunique() >= 5:
        n_splits = 5
    else:
        n_splits = train[col].nunique()
    
    kf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    
    temp_all_prediction = np.zeros((n_splits,X_test.shape[0]))
    
    for ind, (tr, val) in enumerate(kf.split(X_train,y_categorized[col])):
        X_tr = X_train[tr]
        y_tr = y_train[tr]
        X_vl = X_train[val]
        y_vl = y_train[val]

        model = create_model2()
        early = EarlyStopping(monitor='val_loss', patience=15, verbose=0, mode='auto', baseline=None, restore_best_weights=False)
        lr = ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=5, verbose=0, mode='auto', min_lr=0.000001)
        checkpointer = ModelCheckpoint(monitor='val_loss',filepath='weights_simple_dnn_{}_{}.hdf5'.format(col,ind), mode='min',verbose=0, save_best_only=True)

        model.fit(
            X_tr, y_tr[:,col_ind], epochs=100, batch_size=32, validation_data=(X_vl, y_vl[:,col_ind]), verbose=0, 
            callbacks=[early,lr,checkpointer]
        )
        model.load_weights('weights_simple_dnn_{}_{}.hdf5'.format(col,ind))

        oof_pred1[val,col_ind] = model.predict(X_vl)[:,0]
        temp_all_prediction[ind,:] = model.predict(X_test)[:,0]
        
    all_predictions1[:,col_ind] = temp_all_prediction.mean(axis=0)
    print ("{} oof spearman correlation {}".format(col, spearmanr(y_train[:,col_ind],oof_pred1[:,col_ind]).correlation))
'''

'\nall_predictions1 = np.zeros((X_test.shape[0],y_train.shape[1]))\noof_pred1 = np.zeros((y_train.shape[0],y_train.shape[1]))\n\n #KFold(n_splits=n_splits, random_state=42, shuffle=True)\n\nfor col_ind, col in enumerate(targets):\n    \n    if train[col].nunique() >= 5:\n        n_splits = 5\n    else:\n        n_splits = train[col].nunique()\n    \n    kf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)\n    \n    temp_all_prediction = np.zeros((n_splits,X_test.shape[0]))\n    \n    for ind, (tr, val) in enumerate(kf.split(X_train,y_categorized[col])):\n        X_tr = X_train[tr]\n        y_tr = y_train[tr]\n        X_vl = X_train[val]\n        y_vl = y_train[val]\n\n        model = create_model2()\n        early = EarlyStopping(monitor=\'val_loss\', patience=15, verbose=0, mode=\'auto\', baseline=None, restore_best_weights=False)\n        lr = ReduceLROnPlateau(monitor=\'val_loss\', factor=0.7, patience=5, verbose=0, mode=\'auto\', min_lr=0.000001)\n        checkpo

In [41]:
from sklearn.linear_model import BayesianRidge, ElasticNet
from sklearn.ensemble import RandomForestRegressor

In [42]:
all_predictions2 = np.zeros((X_test.shape[0],y_train.shape[1]))
oof_pred2 = np.zeros((y_train.shape[0],y_train.shape[1]))
#X_train2 = new_X[:X_train.shape[0],:]
#X_test2 = new_X[X_train.shape[0]:,:]

for col_ind, col in enumerate(targets):
    
    if train[col].nunique() >= 5:
        n_splits = 5
    else:
        n_splits = train[col].nunique()
    
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True) #StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    
    temp_all_prediction = np.zeros((n_splits,X_test.shape[0]))
    
    for ind, (tr, val) in enumerate(kf.split(X_train,y_categorized[col])):
        X_tr = X_train[tr]
        y_tr = y_train[tr]
        X_vl = X_train[val]
        y_vl = y_train[val]

        model = ElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5) #RandomForestRegressor(n_estimators=100,max_features=.4, random_state=123) #BayesianRidge() #MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
        model.fit(X_tr, y_tr[:,col_ind])
        
        oof_pred2[val,col_ind] = model.predict(X_vl).copy()
        temp_all_prediction[ind,:] = model.predict(X_test).copy()
        
    all_predictions2[:,col_ind] = temp_all_prediction.mean(axis=0).copy()
    
    print ("{} oof spearman correlation {}".format(col, spearmanr(y_train[:,col_ind],oof_pred2[:,col_ind]).correlation))


question_asker_intent_understanding oof spearman correlation 0.3730715276704457
question_body_critical oof spearman correlation 0.6698296961033138
question_conversational oof spearman correlation 0.38249373683481447
question_expect_short_answer oof spearman correlation 0.23359051921497417
question_fact_seeking oof spearman correlation 0.3048583449739141
question_has_commonly_accepted_answer oof spearman correlation 0.40775509366903534
question_interestingness_others oof spearman correlation 0.3662207009636641
question_interestingness_self oof spearman correlation 0.5156662155226513
question_multi_intent oof spearman correlation 0.5083096469256448
question_not_really_a_question oof spearman correlation 0.04143603936954794
question_opinion_seeking oof spearman correlation 0.4325142110252646
question_type_choice oof spearman correlation 0.6531958401184711
question_type_compare oof spearman correlation 0.3107138379267106
question_type_consequence oof spearman correlation 0.1236583738389714

In [43]:
oof_pred1 = np.clip(oof_pred1,0.0001,.9999)
all_predictions1 = np.clip(all_predictions1,0.0001,.9999)

oof_pred2 = np.clip(oof_pred2,0.0001,.9999)
all_predictions2 = np.clip(all_predictions2,0.0001,.9999)

In [44]:
score1 = 0
score2 = 0

for i, val in enumerate(targets):
    score1 += spearmanr(y_train[:,i],oof_pred1[:,i]).correlation
    score2 += spearmanr(y_train[:,i],oof_pred2[:,i]).correlation
    print (val,spearmanr(y_train[:,i],oof_pred1[:,i]).correlation, spearmanr(y_train[:,i],oof_pred2[:,i]).correlation) #

print ("Avg scores {}, {}".format(score1/30, score2/30))

question_asker_intent_understanding 0.3469617130492376 0.3730693533470354
question_body_critical 0.6662751459827833 0.6698296961033138
question_conversational 0.4111431713601106 0.38414231089949624
question_expect_short_answer 0.23302461662112858 0.23358259196425132
question_fact_seeking 0.35262325497848024 0.30484451629892223
question_has_commonly_accepted_answer 0.3996983056658262 0.4074688857102291
question_interestingness_others 0.3437042400025455 0.3662207009636641
question_interestingness_self 0.4929949698209517 0.5156662155226513
question_multi_intent 0.5056850916768006 0.5082320168834893
question_not_really_a_question 0.05375430331113943 0.04143605830533567
question_opinion_seeking 0.45541580436233997 0.43250998394591345
question_type_choice 0.6419733407163112 0.653154688282517
question_type_compare 0.3359199284595235 0.3134468608665377
question_type_consequence 0.16153433250898258 0.12340812733756015
question_type_definition 0.34908810825050024 0.3538972210043042
question_type

In [45]:
main_pred1 = all_predictions1.copy() #np.zeros((all_predictions1.shape[0],all_predictions1.shape[1]))
best_oof_pred1 = oof_pred1.copy() #np.zeros((y_train.shape[0],y_train.shape[1]))


for i, val in enumerate(targets):
    if spearmanr(y_train[:,i],oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],oof_pred2[:,i]).correlation:
        best_oof_pred1[:,i] = oof_pred1[:,i]
        main_pred1[:,i] = all_predictions1[:,i]
    else:
        best_oof_pred1[:,i] = oof_pred2[:,i]
        main_pred1[:,i] = all_predictions1[:,i]


In [46]:
'''
model = create_model()
model.fit(X_train, y_train, epochs=33, batch_size=32, verbose=False)
all_predictions.append(model.predict(X_test))
    
model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
model.fit(X_train, y_train)
all_predictions.append(model.predict(X_test))
'''

'\nmodel = create_model()\nmodel.fit(X_train, y_train, epochs=33, batch_size=32, verbose=False)\nall_predictions.append(model.predict(X_test))\n    \nmodel = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)\nmodel.fit(X_train, y_train)\nall_predictions.append(model.predict(X_test))\n'

In [47]:
uniq_numbers = np.unique(y_train.flatten())
print (uniq_numbers)

[0.         0.2        0.26666667 0.3        0.33333333 0.33333333
 0.4        0.44444444 0.46666667 0.5        0.53333333 0.55555556
 0.6        0.66666667 0.66666667 0.7        0.73333333 0.77777778
 0.8        0.83333333 0.86666667 0.88888889 0.9        0.93333333
 1.        ]


In [48]:
def rounder(values):
    def f(x):
        idx = np.argmin(np.abs(values - x))
        return values[idx]
    return np.frompyfunc(f, 1, 1)

In [49]:
rounded_oof_pred1 = np.array([rounder(uniq_numbers)(i) for i in oof_pred1])
#rounded_oof_pred1[:,9] = oof_pred1[:,9]
rounded_oof_pred2 = np.array([rounder(uniq_numbers)(i) for i in oof_pred2])
#rounded_oof_pred2[:,9] = oof_pred2[:,9]

rounded_oof_pred1 = np.clip(rounded_oof_pred1,.0001,.9999)
rounded_oof_pred2 = np.clip(rounded_oof_pred2,.0001,.9999)

rounded_all_prediction1 = np.array([rounder(uniq_numbers)(i) for i in all_predictions1])
#rounded_all_prediction1[:,9] = all_prediction1[:,9]
rounded_all_prediction2 = np.array([rounder(uniq_numbers)(i) for i in all_predictions2])
#rounded_all_prediction2[:,9] = all_prediction2[:,9]

rounded_all_prediction1 = np.clip(rounded_all_prediction1,.0001,.9999)
rounded_all_prediction2 = np.clip(rounded_all_prediction2,.0001,.9999)

In [50]:
score1 = 0
score2 = 0

for i, val in enumerate(targets):
    val1 = spearmanr(y_train[:,i],rounded_oof_pred1[:,i]).correlation
    val2 = spearmanr(y_train[:,i],rounded_oof_pred2[:,i]).correlation

    if pd.notnull(val1) == False:
        val1 = spearmanr(y_train[:,i],oof_pred1[:,i]).correlation
    if pd.notnull(val2) == False:
        val2 = spearmanr(y_train[:,i],oof_pred2[:,i]).correlation
    
    score1 += val1
    score2 += val2
    print (val,val1, val2)
            

print ("Avg scores {}, {}".format(score1/30, score2/30))

question_asker_intent_understanding 0.332018696542939 0.3662668507234609
question_body_critical 0.6642385312477265 0.6694122138881343
question_conversational 0.49307135356507065 0.44393693543088025
question_expect_short_answer 0.23302850657187904 0.23387967545502003
question_fact_seeking 0.35210442544296666 0.30464092502758877
question_has_commonly_accepted_answer 0.40562349924498414 0.40598448205699256
question_interestingness_others 0.3373435498009048 0.3537033912011825
question_interestingness_self 0.49232989490729073 0.5158843222110728
question_multi_intent 0.49812346733427487 0.5025292771173461
question_not_really_a_question 0.05375430331113943 0.04143605830533567
question_opinion_seeking 0.4513018485291257 0.4310858819589078
question_type_choice 0.6418314318406168 0.651241305023619


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


question_type_compare 0.4724780446461082 0.41528496381743224
question_type_consequence 0.11823528064988638 0.12340812733756015
question_type_definition 0.5946577908868264 0.5809641451583457
question_type_entity 0.5397049791879915 0.4695930375832562
question_type_instructions 0.7571368386537496 0.7518912940196588
question_type_procedure 0.2929005097920065 0.26862363999933897
question_type_reason_explanation 0.6107535392704313 0.608780426388559
question_type_spelling 0.30106450177380784 0.004015441206002841
question_well_written 0.5265860360291801 0.5256164658213264
answer_helpful 0.19996383864088219 0.21858677044022753
answer_level_of_information 0.3939750063555886 0.40263667847063056
answer_plausible 0.1284534115225415 0.08708058483724049
answer_relevance 0.15555641875005233 0.15168264549570454
answer_satisfaction 0.3094953658227542 0.2900052177519507
answer_type_instructions 0.7456826371651855 0.7493561093239108
answer_type_procedure 0.22525003281426723 0.2142287936121605
answer_type_

In [51]:
main_pred2 = rounded_all_prediction1.copy() #np.zeros((all_predictions1.shape[0],all_predictions1.shape[1]))
best_oof_pred2 = rounded_oof_pred1.copy() #np.zeros((y_train.shape[0],y_train.shape[1]))

for i, val in enumerate(targets):
    if spearmanr(y_train[:,i],rounded_oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],rounded_oof_pred2[:,i]).correlation:
        best_oof_pred2[:,i] = rounded_oof_pred1[:,i]
        main_pred2[:,i] = rounded_all_prediction1[:,i]
    else:
        best_oof_pred2[:,i] = rounded_oof_pred2[:,i]
        main_pred2[:,i] = rounded_all_prediction2[:,i]


In [52]:
for i, val in enumerate(targets):
    if pd.notnull(spearmanr(y_train[:,i],best_oof_pred2[:,i]).correlation) == False:
        best_oof_pred2[:,i] = best_oof_pred1[:,i].copy()
        main_pred2[:,i] = main_pred1[:,i].copy()

In [53]:
main_pred = main_pred2.copy()
'''
main_pred = np.zeros((all_predictions1.shape[0],all_predictions1.shape[1]))
final_score = 0
for i, val in enumerate(targets):
    if spearmanr(y_train[:,i],best_oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],best_oof_pred2[:,i]).correlation:
        final_score += spearmanr(y_train[:,i],best_oof_pred1[:,i]).correlation
        print (val, spearmanr(y_train[:,i],best_oof_pred1[:,i]).correlation)
        main_pred[:,i] = main_pred1[:,i]
    else:
        final_score += spearmanr(y_train[:,i],best_oof_pred2[:,i]).correlation
        print (val, spearmanr(y_train[:,i],best_oof_pred2[:,i]).correlation)
        main_pred[:,i] = main_pred2[:,i]

print ("Avg scores {}".format(final_score/30))
'''

'\nmain_pred = np.zeros((all_predictions1.shape[0],all_predictions1.shape[1]))\nfinal_score = 0\nfor i, val in enumerate(targets):\n    if spearmanr(y_train[:,i],best_oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],best_oof_pred2[:,i]).correlation:\n        final_score += spearmanr(y_train[:,i],best_oof_pred1[:,i]).correlation\n        print (val, spearmanr(y_train[:,i],best_oof_pred1[:,i]).correlation)\n        main_pred[:,i] = main_pred1[:,i]\n    else:\n        final_score += spearmanr(y_train[:,i],best_oof_pred2[:,i]).correlation\n        print (val, spearmanr(y_train[:,i],best_oof_pred2[:,i]).correlation)\n        main_pred[:,i] = main_pred2[:,i]\n\nprint ("Avg scores {}".format(final_score/30))\n'

In [54]:
'''
test_pred1 = all_predictions1.mean(axis=0)
test_pred2 = all_predictions2.mean(axis=0)
main_pred = np.zeros((test_pred1.shape[0],test_pred1.shape[1]))

for i in range(test_pred1.shape[1]):
    if spearmanr(y_train[:,i],rounded_oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],rounded_oof_pred2[:,i]).correlation:
        main_pred[:,i] = rounder(uniq_numbers)(test_pred1[:,i])
    else:
        main_pred[:,i] = rounder(uniq_numbers)(test_pred2[:,i])
        
for i in range(main_pred.shape[1]):
    if main_pred[:,i].sum() == 0:
        if spearmanr(y_train[:,i],oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],oof_pred2[:,i]).correlation:
            main_pred[:,i] = test_pred1[:,i]
        else:
            main_pred[:,i] = test_pred2[:,i]
            
main_pred = np.clip(main_pred,0.0001,0.9999)
'''

'\ntest_pred1 = all_predictions1.mean(axis=0)\ntest_pred2 = all_predictions2.mean(axis=0)\nmain_pred = np.zeros((test_pred1.shape[0],test_pred1.shape[1]))\n\nfor i in range(test_pred1.shape[1]):\n    if spearmanr(y_train[:,i],rounded_oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],rounded_oof_pred2[:,i]).correlation:\n        main_pred[:,i] = rounder(uniq_numbers)(test_pred1[:,i])\n    else:\n        main_pred[:,i] = rounder(uniq_numbers)(test_pred2[:,i])\n        \nfor i in range(main_pred.shape[1]):\n    if main_pred[:,i].sum() == 0:\n        if spearmanr(y_train[:,i],oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],oof_pred2[:,i]).correlation:\n            main_pred[:,i] = test_pred1[:,i]\n        else:\n            main_pred[:,i] = test_pred2[:,i]\n            \nmain_pred = np.clip(main_pred,0.0001,0.9999)\n'

In [55]:
for i in range(30):
    print (i, y_train[:,i].sum(), main_pred[:,i].sum()) #main_pred[:,i].max(), main_pred[:,i].min()

0 5426.500000000001 420.67637777777844
1 3618.833333333333 273.6999999999994
2 348.3333333333333 10.34359999999994
3 4246.333333333333 333.6990999999997
4 4696.833333333333 388.43163333333405
5 4824.833333333334 398.3824888888896
6 3571.277777777778 275.6111111111099
7 3083.722222222222 232.81111111111167
8 1451.3333333333333 111.68677777777816
9 27.166666666666664 2.2950573642912793
10 2613.833333333333 185.54684444444476
11 1732.0 136.29678888888927
12 231.83333333333331 6.622977777777768
13 61.0 3.669311378145359
14 187.0 4.823677777777785
15 396.5 13.498855555555515
16 3024.833333333333 265.3636555555557
17 1009.5 75.80368888888944
18 2348.833333333333 189.6831777777778
19 5.0 1.349946885622922
20 4862.777777777777 373.9000000000003
21 5625.555555555556 439.7426444444451
22 3980.666666666667 312.2777777777774
23 5836.166666666666 465.15688888889235
24 5888.277777777779 469.6061444444483
25 5195.6 411.96666666666704
26 2915.166666666667 248.80369999999962
27 794.1666666666665 61.386

In [56]:
submission = pd.read_csv(path_join(data_dir, 'sample_submission.csv'))
submission[targets] = main_pred
submission.to_csv("submission.csv", index = False)

In [57]:
submission.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.933333,0.6,0.0001,0.666667,0.888889,0.666667,0.666667,0.555556,0.4,0.003708,0.266667,0.666667,0.0001,0.041251,0.0001,0.0001,0.0001,0.0001,0.7,0.002718,0.866667,0.888889,0.6,0.933333,0.9999,0.833333,0.2,0.0001,0.733333,0.9
1,46,0.866667,0.466667,0.0001,0.733333,0.777778,0.933333,0.555556,0.466667,0.0001,0.004768,0.444444,0.333333,0.0001,0.001602,0.0001,0.0001,0.933333,0.2,0.2,0.002144,0.555556,0.933333,0.6,0.9999,0.9999,0.866667,0.9,0.0001,0.2,0.888889
2,70,0.9,0.666667,0.0001,0.7,0.933333,0.933333,0.6,0.5,0.2,0.003357,0.2,0.555556,0.0001,0.011951,0.0001,0.0001,0.2,0.0001,0.533333,0.002183,0.866667,0.888889,0.6,0.9999,0.933333,0.833333,0.0001,0.0001,0.833333,0.9
3,132,0.833333,0.3,0.0001,0.7,0.8,0.933333,0.533333,0.444444,0.266667,0.004319,0.533333,0.333333,0.0001,0.005078,0.0001,0.0001,0.833333,0.2,0.666667,0.002819,0.733333,0.933333,0.7,0.9999,0.9999,0.9,0.8,0.2,0.555556,0.9
4,200,0.933333,0.555556,0.0001,0.9,0.8,0.866667,0.6,0.6,0.3,0.003336,0.4,0.466667,0.0001,0.004108,0.0001,0.0001,0.2,0.2,0.533333,0.002433,0.8,0.9,0.666667,0.9999,0.9999,0.888889,0.4,0.2,0.5,0.888889


In [58]:
submission.describe()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
count,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0
mean,5029.186975,0.883774,0.575,0.02173,0.701049,0.816033,0.836938,0.579015,0.489099,0.234636,0.004822,0.389804,0.286338,0.013914,0.007709,0.010134,0.028359,0.557487,0.159251,0.398494,0.002836,0.785504,0.923829,0.656046,0.97722,0.986568,0.865476,0.522697,0.128963,0.505628,0.905696
std,2812.67006,0.042249,0.144524,0.077757,0.107386,0.116399,0.129285,0.049321,0.085507,0.146316,0.002701,0.182906,0.213596,0.068516,0.008685,0.057426,0.103146,0.352534,0.11644,0.271395,0.00141,0.10248,0.024865,0.048916,0.033009,0.027314,0.04766,0.292696,0.114637,0.240487,0.0211
min,39.0,0.777778,0.266667,0.0001,0.266667,0.2,0.2,0.5,0.333333,0.0001,0.000386,0.0001,0.0001,0.0001,0.000933,0.0001,0.0001,0.0001,0.0001,0.0001,0.000697,0.5,0.833333,0.533333,0.866667,0.888889,0.666667,0.0001,0.0001,0.0001,0.833333
25%,2572.0,0.866667,0.466667,0.0001,0.666667,0.777778,0.777778,0.533333,0.444444,0.2,0.003002,0.266667,0.2,0.0001,0.00309,0.0001,0.0001,0.2,0.0001,0.2,0.00188,0.7,0.9,0.6,0.933333,0.9999,0.833333,0.266667,0.0001,0.333333,0.888889
50%,5093.0,0.888889,0.555556,0.0001,0.7,0.833333,0.866667,0.555556,0.466667,0.2,0.00431,0.4,0.2,0.0001,0.004936,0.0001,0.0001,0.7,0.2,0.316667,0.002536,0.8,0.933333,0.666667,0.9999,0.9999,0.866667,0.555556,0.2,0.5,0.9
75%,7482.0,0.9,0.7,0.0001,0.777778,0.9,0.933333,0.6,0.533333,0.333333,0.005812,0.5,0.4,0.0001,0.009005,0.0001,0.0001,0.866667,0.2,0.6,0.003384,0.866667,0.933333,0.666667,0.9999,0.9999,0.9,0.777778,0.2,0.666667,0.933333
max,9640.0,0.9999,0.933333,0.733333,0.9999,0.9999,0.9999,0.7,0.8,0.666667,0.017064,0.933333,0.9999,0.733333,0.076983,0.6,0.8,0.9999,0.555556,0.9999,0.011921,0.933333,0.9999,0.833333,0.9999,0.9999,0.933333,0.9999,0.466667,0.9999,0.933333
