In [1]:
!pip install ../input/sacremoses/sacremoses-master/ > /dev/null

import os
import sys
import glob
import torch

sys.path.insert(0, "../input/transformers/transformers-master/")
import transformers
import numpy as np
import pandas as pd
import math
from tqdm import tqdm

In [2]:
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [3]:
def fetch_vectors(string_list, batch_size=64):
    # inspired by https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/
    DEVICE = torch.device("cuda")
    tokenizer = transformers.DistilBertTokenizer.from_pretrained("../input/distilbertbaseuncased/")
    model = transformers.DistilBertModel.from_pretrained("../input/distilbertbaseuncased/")
    model.to(DEVICE)
    max_len = 512
    fin_features = []
    for data in tqdm(chunks(string_list, batch_size)):
        tokenized = []
        all_lengths = []
        for x in data:
            x = " ".join(x.strip().split()[:max_len])
            tok = tokenizer.encode(x, add_special_tokens=True)
            tokenized.append(tok[:max_len])
            all_lengths.append(len(tok))
            
        padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized])
        #print (padded)
        attention_mask = np.where(padded != 0, 1, 0)
        #print (attention_mask)
        input_ids = torch.tensor(padded).to(DEVICE)
        attention_mask = torch.tensor(attention_mask).to(DEVICE)

        with torch.no_grad():
            last_hidden_states = model(input_ids, attention_mask=attention_mask)

        features1 = last_hidden_states[0][:, 0, :].cpu().numpy()
        #features2 = last_hidden_states[0].cpu().numpy().mean(axis=1)
        #features3 = np.array([last_hidden_states[0].cpu().numpy()[i,:all_lengths[i],:].mean(axis=0) for i in range(len(all_lengths))])
        #features = np.hstack([features1,features2])
        #features = last_hidden_states[0][:, 0, :].cpu().numpy()
        fin_features.append(features1)

    fin_features = np.vstack(fin_features)
    return fin_features

In [4]:
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
stop_words = stopwords.words('english')

stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

def processSingleReview(review, d=None):
    """
    Convert a raw review to a string of words
    """
    letters_only = re.sub("[^a-zA-Z]", " ", review)
    words = tokenizer.tokenize(letters_only.lower())
    words = [i for i in words if i not in exclude]
    stops = set(stopwords.words("english"))
    normalized = [lemma.lemmatize(word) for word in words if word not in stops]
    
    #meaningful_words = [st.stem(w) for w in words if w not in stops]
    meaningful_words = [w for w in normalized if pos_tag([w],tagset='universal')[0][1] in ['NOUN','VERB','ADJ']] #
    return(" ".join(meaningful_words))

In [5]:
processSingleReview("What am I losing when using extension tubes instead of a macro lens?")

'losing using extension tube macro lens'

In [6]:
import os
import re
import gc
import pickle  
import random
import keras

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import keras.backend as K

from keras.models import Model
from keras.layers import Dense, Input, Dropout, Lambda
from keras.optimizers import Adam
from keras.callbacks import Callback
from scipy.stats import spearmanr, rankdata
from os.path import join as path_join
from numpy.random import seed
from urllib.parse import urlparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.linear_model import MultiTaskElasticNet

seed(42)
tf.random.set_seed(42)
random.seed(42)

Using TensorFlow backend.


In [7]:
data_dir = '../input/google-quest-challenge/'
train = pd.read_csv(path_join(data_dir, 'train.csv'))
test = pd.read_csv(path_join(data_dir, 'test.csv'))
print(train.shape, test.shape)
train.head()

(6079, 41) (476, 11)


Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,...,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,...,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,...,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


In [8]:
'''
train["clean_answer"] = train.answer.apply(processSingleReview)
train["clean_question"] = train.question_body.apply(processSingleReview)

test["clean_answer"] = test.answer.apply(processSingleReview)
test["clean_question"] = test.question_body.apply(processSingleReview)
'''

'\ntrain["clean_answer"] = train.answer.apply(processSingleReview)\ntrain["clean_question"] = train.question_body.apply(processSingleReview)\n\ntest["clean_answer"] = test.answer.apply(processSingleReview)\ntest["clean_question"] = test.question_body.apply(processSingleReview)\n'

In [9]:
sample = pd.read_csv("../input/google-quest-challenge/sample_submission.csv")
target_cols = list(sample.drop("qa_id", axis=1).columns)

#train_question_title_dense = fetch_vectors(df_train.question_title.values)
train_question_body_dense = fetch_vectors(train.question_body.values)
train_answer_dense = fetch_vectors(train.answer.values)

#test_question_title_dense = fetch_vectors(df_test.question_title.values)
test_question_body_dense = fetch_vectors(test.question_body.values)
test_answer_dense = fetch_vectors(test.answer.values)

95it [01:28,  1.08it/s]
95it [01:27,  1.09it/s]
8it [00:06,  1.17it/s]
8it [00:07,  1.13it/s]


In [10]:
targets = [
        'question_asker_intent_understanding',
        'question_body_critical',
        'question_conversational',
        'question_expect_short_answer',
        'question_fact_seeking',
        'question_has_commonly_accepted_answer',
        'question_interestingness_others',
        'question_interestingness_self',
        'question_multi_intent',
        'question_not_really_a_question',
        'question_opinion_seeking',
        'question_type_choice',
        'question_type_compare',
        'question_type_consequence',
        'question_type_definition',
        'question_type_entity',
        'question_type_instructions',
        'question_type_procedure',
        'question_type_reason_explanation',
        'question_type_spelling',
        'question_well_written',
        'answer_helpful',
        'answer_level_of_information',
        'answer_plausible',
        'answer_relevance',
        'answer_satisfaction',
        'answer_type_instructions',
        'answer_type_procedure',
        'answer_type_reason_explanation',
        'answer_well_written'    
    ]

input_columns = ['question_title', 'question_body', 'answer']

> # Features

In [11]:
find = re.compile(r"^[^.]*")

train['netloc'] = train['url'].apply(lambda x: re.findall(find, urlparse(x).netloc)[0])
test['netloc'] = test['url'].apply(lambda x: re.findall(find, urlparse(x).netloc)[0])

features = ['netloc', 'category']
merged = pd.concat([train[features], test[features]])
ohe = OneHotEncoder()
ohe.fit(merged)

features_train = ohe.transform(train[features]).toarray()
features_test = ohe.transform(test[features]).toarray()

In [12]:
module_url = "../input/universalsentenceencoderlarge4/"
embed = hub.load(module_url)

In [13]:
embeddings_train = {}
embeddings_test = {}
for text in input_columns:
    print(text)
    train_text = train[text].str.replace('?', '.').str.replace('!', '.').tolist()
    test_text = test[text].str.replace('?', '.').str.replace('!', '.').tolist()
    
    curr_train_emb = []
    curr_test_emb = []
    batch_size = 4
    ind = 0
    while ind*batch_size < len(train_text):
        curr_train_emb.append(embed(train_text[ind*batch_size: (ind + 1)*batch_size])["outputs"].numpy())
        ind += 1
        
    ind = 0
    while ind*batch_size < len(test_text):
        curr_test_emb.append(embed(test_text[ind*batch_size: (ind + 1)*batch_size])["outputs"].numpy())
        ind += 1    
        
    embeddings_train[text + '_embedding'] = np.vstack(curr_train_emb)
    embeddings_test[text + '_embedding'] = np.vstack(curr_test_emb)
    
del embed
K.clear_session()
gc.collect()

question_title
question_body
answer


44

In [14]:
l2_dist = lambda x, y: np.power(x - y, 2).sum(axis=1)

cos_dist = lambda x, y: (x*y).sum(axis=1)

dist_features_train = np.array([
    l2_dist(embeddings_train['question_title_embedding'], embeddings_train['answer_embedding']),
    l2_dist(embeddings_train['question_body_embedding'], embeddings_train['answer_embedding']),
    l2_dist(embeddings_train['question_body_embedding'], embeddings_train['question_title_embedding']),
    cos_dist(embeddings_train['question_title_embedding'], embeddings_train['answer_embedding']),
    cos_dist(embeddings_train['question_body_embedding'], embeddings_train['answer_embedding']),
    cos_dist(embeddings_train['question_body_embedding'], embeddings_train['question_title_embedding'])
]).T

dist_features_test = np.array([
    l2_dist(embeddings_test['question_title_embedding'], embeddings_test['answer_embedding']),
    l2_dist(embeddings_test['question_body_embedding'], embeddings_test['answer_embedding']),
    l2_dist(embeddings_test['question_body_embedding'], embeddings_test['question_title_embedding']),
    cos_dist(embeddings_test['question_title_embedding'], embeddings_test['answer_embedding']),
    cos_dist(embeddings_test['question_body_embedding'], embeddings_test['answer_embedding']),
    cos_dist(embeddings_test['question_body_embedding'], embeddings_test['question_title_embedding'])
]).T

In [15]:
X_train = np.hstack([item for k, item in embeddings_train.items()] + [features_train, dist_features_train])
X_test = np.hstack([item for k, item in embeddings_test.items()] + [features_test, dist_features_test])
y_train = train[targets].values

In [16]:
'''
xgboost_oof_train = np.load('../input/qa-challenge-external-data/train_oof.npy')
xgboost_oof_test = np.load('../input/qa-challenge-external-data/test_pred1.npy')

train_tm_features = np.load('../input/qa-challenge-external-data/train_features.npy')
test_tm_features = np.load('../input/qa-challenge-external-data/test_features.npy')

from sklearn.preprocessing import MinMaxScaler
clf = MinMaxScaler()

train_tm_features = clf.fit_transform(train_tm_features)
test_tm_features = clf.transform(test_tm_features)

print (xgboost_oof_train.shape, xgboost_oof_test.shape, train_tm_features.shape, test_tm_features.shape)
'''

"\nxgboost_oof_train = np.load('../input/qa-challenge-external-data/train_oof.npy')\nxgboost_oof_test = np.load('../input/qa-challenge-external-data/test_pred1.npy')\n\ntrain_tm_features = np.load('../input/qa-challenge-external-data/train_features.npy')\ntest_tm_features = np.load('../input/qa-challenge-external-data/test_features.npy')\n\nfrom sklearn.preprocessing import MinMaxScaler\nclf = MinMaxScaler()\n\ntrain_tm_features = clf.fit_transform(train_tm_features)\ntest_tm_features = clf.transform(test_tm_features)\n\nprint (xgboost_oof_train.shape, xgboost_oof_test.shape, train_tm_features.shape, test_tm_features.shape)\n"

In [17]:
X_train = np.hstack((X_train, train_question_body_dense, train_answer_dense))
X_test = np.hstack((X_test, test_question_body_dense, test_answer_dense))

In [18]:
X_train.shape, X_test.shape

((6079, 3142), (476, 3142))

# Modeling

In [19]:
# Compatible with tensorflow backend
class SpearmanRhoCallback(Callback):
    def __init__(self, training_data, validation_data, patience, model_name):
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]
        
        self.patience = patience
        self.value = -1
        self.bad_epochs = 0
        self.model_name = model_name

    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred_val = self.model.predict(self.x_val)
        rho_val = np.mean([spearmanr(self.y_val[:, ind], y_pred_val[:, ind] + np.random.normal(0, 1e-7, y_pred_val.shape[0])).correlation for ind in range(y_pred_val.shape[1])])
        if rho_val >= self.value:
            self.value = rho_val
            self.model.save_weights(self.model_name)
        else:
            self.bad_epochs += 1
        if self.bad_epochs >= self.patience:
            print("Epoch %05d: early stopping Threshold" % epoch)
            self.model.stop_training = True
        print('\rval_spearman-rho: %s' % (str(round(rho_val, 4))), end=100*' '+'\n')
        return rho_val

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

In [20]:
def create_model():
    inps = Input(shape=(X_train.shape[1],))
    x = Dense(512, activation='elu')(inps)
    x = Dropout(0.2)(x)
    x = Dense(256, activation='elu')(x)
    x = Dropout(0.2)(x)
    x = Dense(y_train.shape[1], activation='sigmoid')(x)
    model = Model(inputs=inps, outputs=x)
    model.compile(
        optimizer=Adam(lr=.0001),
        loss=['binary_crossentropy']
    )
    #model.summary()
    return model

In [21]:
model = create_model()
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3142)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               1609216   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 30)                7710      
Total params: 1,748,254
Trainable params: 1,748,254
Non-trainable params: 0
_________________________________________________

In [22]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [23]:
n_splits = 5

all_predictions1 = np.zeros((n_splits,X_test.shape[0],y_train.shape[1]))
oof_pred1 = np.zeros((y_train.shape[0],y_train.shape[1]))

kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)
for ind, (tr, val) in enumerate(kf.split(X_train)):
    X_tr = X_train[tr]
    y_tr = y_train[tr]
    X_vl = X_train[val]
    y_vl = y_train[val]
    
    model = create_model()
    early = EarlyStopping(monitor='val_loss', patience=15, verbose=1, mode='min', baseline=None, restore_best_weights=False)
    lr = ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=5, verbose=1, mode='min', min_lr=0.000001)
    rho = SpearmanRhoCallback(training_data=(X_tr, y_tr), validation_data=(X_vl, y_vl),
                                       patience=15, model_name='weights_simple_lstm_deep_cnn_{}.hdf5'.format(ind))
    
    model.fit(
        X_tr, y_tr, epochs=100, batch_size=32, validation_data=(X_vl, y_vl), verbose=True, 
        callbacks=[lr,rho]
    )
    model.load_weights('weights_simple_lstm_deep_cnn_{}.hdf5'.format(ind))
    
    oof_pred1[val,:] = model.predict(X_vl)
    all_predictions1[ind,:,:] = model.predict(X_test)

Train on 4863 samples, validate on 1216 samples
Epoch 1/100
val_spearman-rho: 0.2825                                                                                                    
Epoch 2/100
val_spearman-rho: 0.3264                                                                                                    
Epoch 3/100
val_spearman-rho: 0.3442                                                                                                    
Epoch 4/100
val_spearman-rho: 0.3549                                                                                                    
Epoch 5/100
val_spearman-rho: 0.3635                                                                                                    
Epoch 6/100
val_spearman-rho: 0.3694                                                                                                    
Epoch 7/100
val_spearman-rho: 0.3721                                                                                              

In [24]:
all_predictions2 = np.zeros((n_splits,X_test.shape[0],y_train.shape[1]))
oof_pred2 = np.zeros((y_train.shape[0],y_train.shape[1]))

kf = KFold(n_splits=n_splits, random_state=2019, shuffle=True)
for ind, (tr, val) in enumerate(kf.split(X_train)):
    X_tr = X_train[tr]
    y_tr = y_train[tr]
    X_vl = X_train[val]
    y_vl = y_train[val]
    
    model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
    model.fit(X_tr, y_tr)
    
    oof_pred2[val,:] = model.predict(X_vl)
    all_predictions2[ind,:,:] = model.predict(X_test)

  check_random_state(self.random_state), random)
  check_random_state(self.random_state), random)
  check_random_state(self.random_state), random)
  check_random_state(self.random_state), random)
  check_random_state(self.random_state), random)


In [25]:
oof_pred1 = np.clip(oof_pred1,0.0001,.9999)
all_predictions1 = np.clip(all_predictions1,0.0001,.9999)

oof_pred2 = np.clip(oof_pred2,0.0001,.9999)
all_predictions2 = np.clip(all_predictions2,0.0001,.9999)

In [26]:
from scipy.stats import spearmanr

In [27]:
score1 = 0
score2 = 0

for i, val in enumerate(targets):
    score1 += spearmanr(y_train[:,i],oof_pred1[:,i]).correlation
    score2+= spearmanr(y_train[:,i],oof_pred2[:,i]).correlation
    print (val,spearmanr(y_train[:,i],oof_pred1[:,i]).correlation,spearmanr(y_train[:,i],oof_pred2[:,i]).correlation)

print ("Avg scores {}, {}".format(score1/30, score2/30))

question_asker_intent_understanding 0.34972623301942457 0.3482761226428539
question_body_critical 0.6376436045798501 0.6437699887886108
question_conversational 0.4097644461617697 0.3941330399442278
question_expect_short_answer 0.2379809075593805 0.23151784332480524
question_fact_seeking 0.32328066280178136 0.30825234359601855
question_has_commonly_accepted_answer 0.4072594053576393 0.40199360352868513
question_interestingness_others 0.35454104233421185 0.3405281563338947
question_interestingness_self 0.49407717654471267 0.5008552750827329
question_multi_intent 0.46332534315475044 0.4706399256903671
question_not_really_a_question 0.051592891864929914 0.04311265997144773
question_opinion_seeking 0.42482026832061365 0.42997733255833387
question_type_choice 0.6310866042582306 0.6518507337954111
question_type_compare 0.33566595535197596 0.3215648806679264
question_type_consequence 0.15700745480374584 0.14804115312813232
question_type_definition 0.34995736465199473 0.3555698915298218
questio

In [28]:
'''
model = create_model()
model.fit(X_train, y_train, epochs=33, batch_size=32, verbose=False)
all_predictions.append(model.predict(X_test))
    
model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
model.fit(X_train, y_train)
all_predictions.append(model.predict(X_test))
'''

'\nmodel = create_model()\nmodel.fit(X_train, y_train, epochs=33, batch_size=32, verbose=False)\nall_predictions.append(model.predict(X_test))\n    \nmodel = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)\nmodel.fit(X_train, y_train)\nall_predictions.append(model.predict(X_test))\n'

In [29]:
uniq_numbers = np.unique(y_train.flatten())
print (uniq_numbers)

[0.         0.2        0.26666667 0.3        0.33333333 0.33333333
 0.4        0.44444444 0.46666667 0.5        0.53333333 0.55555556
 0.6        0.66666667 0.66666667 0.7        0.73333333 0.77777778
 0.8        0.83333333 0.86666667 0.88888889 0.9        0.93333333
 1.        ]


In [30]:
def rounder(values):
    def f(x):
        idx = np.argmin(np.abs(values - x))
        return values[idx]
    return np.frompyfunc(f, 1, 1)

In [31]:
rounded_oof_pred1 = np.array([rounder(uniq_numbers)(i) for i in oof_pred1])
rounded_oof_pred1[:,9] = oof_pred1[:,9]
rounded_oof_pred2 = np.array([rounder(uniq_numbers)(i) for i in oof_pred2])
rounded_oof_pred2[:,9] = oof_pred2[:,9]

rounded_oof_pred1 = np.clip(rounded_oof_pred1,.0001,.9999)
rounded_oof_pred2 = np.clip(rounded_oof_pred2,.0001,.9999)

In [32]:
score1 = 0
score2 = 0

for i, val in enumerate(targets):
    score1 += spearmanr(y_train[:,i],rounded_oof_pred1[:,i]).correlation
    score2+= spearmanr(y_train[:,i],rounded_oof_pred2[:,i]).correlation
    print (val,spearmanr(y_train[:,i],rounded_oof_pred1[:,i]).correlation,spearmanr(y_train[:,i],rounded_oof_pred2[:,i]).correlation)

print ("Avg scores {}, {}".format(score1/30, score2/30))

question_asker_intent_understanding 0.333226791873266 0.3431997074745129
question_body_critical 0.6368451058559078 0.6433186682145369
question_conversational 0.4766927398293525 0.45552084272806087
question_expect_short_answer 0.23906069406416539 0.23069050762169727
question_fact_seeking 0.32207221662493296 0.309243986437451
question_has_commonly_accepted_answer 0.4115795211369651 0.40266776200674165
question_interestingness_others 0.34694079751419543 0.3376264665053459
question_interestingness_self 0.4944770135639902 0.5004143758950514
question_multi_intent 0.4563973882735221 0.46589928705007755
question_not_really_a_question 0.051592891864929914 0.04311265997144773
question_opinion_seeking 0.42342139205139884 0.4281399847433677
question_type_choice 0.6308801152968414 0.6511438645745767
question_type_compare 0.4358450447662802 0.4157164817383669
question_type_consequence 0.18575466380241445 0.1832892651520298
question_type_definition 0.616461775935941 0.5776756311509881
question_type_e

In [33]:
test_pred1 = all_predictions1.mean(axis=0)
test_pred2 = all_predictions2.mean(axis=0)
main_pred = np.zeros((test_pred1.shape[0],test_pred1.shape[1]))

for i in range(test_pred1.shape[1]):
    if spearmanr(y_train[:,i],rounded_oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],rounded_oof_pred2[:,i]).correlation:
        main_pred[:,i] = rounder(uniq_numbers)(test_pred1[:,i])
    else:
        main_pred[:,i] = rounder(uniq_numbers)(test_pred2[:,i])
        
for i in range(main_pred.shape[1]):
    if main_pred[:,i].sum() == 0:
        if spearmanr(y_train[:,i],oof_pred1[:,i]).correlation > spearmanr(y_train[:,i],oof_pred2[:,i]).correlation:
            main_pred[:,i] = test_pred1[:,i]
        else:
            main_pred[:,i] = test_pred2[:,i]
            
main_pred = np.clip(main_pred,0.0001,0.9999)

In [34]:
for i in range(30):
    print (i, y_train[:,i].sum(), main_pred[:,i].sum()) #main_pred[:,i].max(), main_pred[:,i].min()

0 5426.500000000001 420.9414444444445
1 3618.833333333333 275.4222222222222
2 348.3333333333333 12.2426
3 4246.333333333333 345.4555555555555
4 4696.833333333333 374.5220222222222
5 4824.833333333334 403.8206222222223
6 3571.277777777778 271.47777777777776
7 3083.722222222222 231.36666666666667
8 1451.3333333333333 112.14303333333334
9 27.166666666666664 2.872566092014313
10 2613.833333333333 190.6904888888889
11 1732.0 137.2752666666667
12 231.83333333333331 6.278233333333333
13 61.0 0.6472999999999998
14 187.0 4.034888888888888
15 396.5 13.309466666666667
16 3024.833333333333 258.08197777777775
17 1009.5 75.01320000000001
18 2348.833333333333 183.54934444444444
19 5.0 1.2398295919683018
20 4862.777777777777 373.22222222222223
21 5625.555555555556 441.31812222222226
22 3980.666666666667 312.7222222222222
23 5836.166666666666 458.10052222222225
24 5888.277777777779 467.05408888888894
25 5195.6 410.7666666666667
26 2915.166666666667 248.73703333333336
27 794.1666666666665 63.52851111111

In [35]:
submission = pd.read_csv(path_join(data_dir, 'sample_submission.csv'))
submission[targets] = main_pred
submission.to_csv("submission.csv", index = False)

In [36]:
pd.options.display.max_columns=999

In [37]:
submission.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.888889,0.6,0.0001,0.733333,0.8,0.833333,0.666667,0.6,0.4,0.006864,0.466667,0.6,0.0001,0.0001,0.0001,0.0001,0.2,0.0001,0.5,0.0001,0.866667,0.9,0.6,0.933333,0.933333,0.833333,0.0001,0.0001,0.833333,0.933333
1,46,0.866667,0.555556,0.0001,0.666667,0.833333,0.933333,0.533333,0.444444,0.0001,0.007136,0.466667,0.266667,0.0001,0.0001,0.0001,0.0001,0.9,0.2,0.2,0.000555,0.6,0.933333,0.6,0.933333,0.9999,0.833333,0.9999,0.2,0.0001,0.866667
2,70,0.866667,0.6,0.0001,0.733333,0.9,0.9999,0.555556,0.4,0.2,0.006114,0.2,0.444444,0.0001,0.0001,0.0001,0.0001,0.333333,0.0001,0.6,0.004068,0.866667,0.933333,0.6,0.9999,0.933333,0.8,0.2,0.0001,0.8,0.9
3,132,0.833333,0.333333,0.0001,0.666667,0.8,0.9,0.555556,0.444444,0.333333,0.008391,0.4,0.4,0.0001,0.0001,0.0001,0.0001,0.6,0.2,0.666667,0.001043,0.7,0.933333,0.7,0.9999,0.9999,0.9,0.7,0.2,0.666667,0.866667
4,200,0.9999,0.6,0.0001,0.933333,0.833333,0.933333,0.666667,0.6,0.333333,0.005043,0.4,0.555556,0.0001,0.0001,0.0001,0.0001,0.2,0.2,0.4,0.007875,0.833333,0.933333,0.666667,0.9999,0.9999,0.866667,0.266667,0.2,0.555556,0.933333


In [38]:
submission.describe()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
count,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0
mean,5029.186975,0.884331,0.578618,0.02572,0.725747,0.786811,0.848363,0.570331,0.486064,0.235595,0.006035,0.40061,0.288393,0.01319,0.00136,0.008477,0.027961,0.542189,0.157591,0.385608,0.002605,0.78408,0.927139,0.656979,0.962396,0.981206,0.862955,0.522557,0.133463,0.504321,0.903943
std,2812.67006,0.051376,0.13295,0.080119,0.11203,0.114598,0.131663,0.054579,0.094727,0.154696,0.003917,0.159719,0.218487,0.055703,0.015836,0.048846,0.093466,0.289485,0.102264,0.235498,0.003555,0.092532,0.03461,0.054508,0.035354,0.031486,0.049591,0.299874,0.1039,0.252459,0.033715
min,39.0,0.7,0.2,0.0001,0.266667,0.266667,0.2,0.466667,0.333333,0.0001,0.00133,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.0001,0.5,0.733333,0.5,0.833333,0.866667,0.6,0.0001,0.0001,0.0001,0.777778
25%,2572.0,0.833333,0.5,0.0001,0.666667,0.733333,0.833333,0.533333,0.4,0.2,0.003631,0.3,0.2,0.0001,0.0001,0.0001,0.0001,0.3,0.0001,0.2,0.0001,0.7,0.933333,0.6,0.933333,0.933333,0.833333,0.266667,0.0001,0.333333,0.888889
50%,5093.0,0.888889,0.555556,0.0001,0.733333,0.8,0.9,0.555556,0.466667,0.2,0.004988,0.4,0.266667,0.0001,0.0001,0.0001,0.0001,0.6,0.2,0.333333,0.001051,0.8,0.933333,0.666667,0.933333,0.9999,0.866667,0.555556,0.2,0.5,0.9
75%,7482.0,0.933333,0.666667,0.0001,0.8,0.866667,0.933333,0.6,0.533333,0.333333,0.007145,0.5,0.4,0.0001,0.0001,0.0001,0.0001,0.777778,0.2,0.533333,0.004013,0.866667,0.933333,0.7,0.9999,0.9999,0.9,0.777778,0.2,0.666667,0.933333
max,9640.0,0.9999,0.9,0.555556,0.933333,0.9999,0.9999,0.733333,0.777778,0.733333,0.041476,0.933333,0.9999,0.5,0.2,0.555556,0.7,0.9999,0.333333,0.9999,0.025882,0.933333,0.9999,0.833333,0.9999,0.9999,0.933333,0.9999,0.444444,0.9999,0.9999
