In [1]:
import numpy as np
import pandas as pd
import os
from scipy import stats

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import gc
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from category_encoders import OneHotEncoder

In [2]:
os.listdir("../input/google-quest-challenge")
root="../input/google-quest-challenge/"
train=pd.read_csv(root+"train.csv")
test=pd.read_csv(root+"test.csv")
train.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfa

In [3]:
drop_cols=['question_user_page','answer_user_page','url','question_user_name','answer_user_name']
train.drop(drop_cols,axis=1,inplace=True)
test.drop(drop_cols,axis=1,inplace=True)
train.shape

(6079, 36)

In [4]:
#one hot encorder
cat_cols=['host',"category"]
merged = pd.concat([train[cat_cols],test[cat_cols]])
ohe = OneHotEncoder(cols=cat_cols,handle_unknown='ignore')
ohe.fit(merged)
train_ohe=ohe.transform(train[cat_cols])
test_ohe=ohe.transform(test[cat_cols])
train=pd.concat([train,train_ohe],axis=1)
test=pd.concat([test,test_ohe],axis=1)
train.drop(cat_cols,axis=1,inplace=True)
test.drop(cat_cols,axis=1,inplace=True)
train.shape

(6079, 103)

In [5]:
#tfidf
tfidf=TfidfVectorizer()
train["question_title"]=tfidf.fit_transform(train["question_title"]).toarray()
test["question_title"]=tfidf.transform(test["question_title"]).toarray()
train["question_body"]=tfidf.fit_transform(train["question_body"]).toarray()
test["question_body"]=tfidf.transform(test["question_body"]).toarray()
train["answer"]=tfidf.fit_transform(train["answer"]).toarray()
test["answer"]=tfidf.transform(test["answer"]).toarray()

In [6]:
targets=['question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']
features=[c for c in test.columns]

In [7]:
%%time
n_fold=3
kf = KFold(n_splits=n_fold)
oof = np.zeros(len(train))
oof_sum=[]
y_pred = np.zeros(len(test))
predictions = []

print('Light GBM Model')
params={"objective": "regression",
        "boosting": "gbdt",
        "num_leaves": 40,#=max_leaves
        'feature_fraction': 0.85,
        'subsample': 0.85,
        "learning_rate": 0.05,
        "metric": "rmse"}
for num, target in enumerate(targets):
    print('Train model {}'.format(num + 1))
    print('Predicting target {}'.format(target))
    for fold_, (trn_idx, val_idx)  in enumerate(kf.split(train)):
        tr_x, tr_y = train[features].iloc[trn_idx], train[target].iloc[trn_idx]
        vl_x, vl_y = train[features].iloc[val_idx], train[target].iloc[val_idx]
        print('fold : ',fold_)
        lgb_train = lgb.Dataset(tr_x, tr_y)
        lgb_eval = lgb.Dataset(vl_x, vl_y, reference=lgb_train)
        model = lgb.train(params, lgb_train, num_boost_round=1000, valid_sets=[lgb_train,lgb_eval],
                          early_stopping_rounds=10,verbose_eval=10)
        oof[val_idx] = model.predict(vl_x)
        y_pred+=model.predict(test)
    oof_sum.append(np.sqrt(mean_squared_error(oof, train[target])))
    print('oof_RMSE : ' ,np.sqrt(mean_squared_error(oof, train[target])))
    predictions.append(np.clip(y_pred, a_min = 0, a_max = 1))
    y_pred = np.zeros(len(test))
print("total_oof : ",np.mean(oof_sum))

Light GBM Model
Train model 1
Predicting target question_asker_intent_understanding
fold :  0
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 0.126548	valid_1's rmse: 0.130031
[20]	training's rmse: 0.12438	valid_1's rmse: 0.12913
[30]	training's rmse: 0.123197	valid_1's rmse: 0.129204
Early stopping, best iteration is:
[21]	training's rmse: 0.124216	valid_1's rmse: 0.129112
fold :  1
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 0.128727	valid_1's rmse: 0.125834
[20]	training's rmse: 0.126262	valid_1's rmse: 0.124971
[30]	training's rmse: 0.124782	valid_1's rmse: 0.124898
[40]	training's rmse: 0.123764	valid_1's rmse: 0.124848
Early stopping, best iteration is:
[39]	training's rmse: 0.123871	valid_1's rmse: 0.124842
fold :  2
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 0.126402	valid_1's rmse: 0.131649
[20]	training's rmse: 0.124116	valid_1's rmse: 0.131092
[30]	training's rms

In [8]:
submission=pd.read_csv(root+"sample_submission.csv")
submission[targets] = np.array(predictions).T
submission.to_csv('submission.csv', index = False)

In [9]:
submission.head(10)

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,1.0,1.0,0.209621,1.0,1.0,1.0,1.0,1.0,0.77581,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.348632,1.0,1.0
1,46,1.0,1.0,0.063408,1.0,1.0,1.0,1.0,1.0,0.644821,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.484416,1.0,1.0
2,70,1.0,1.0,0.209621,1.0,1.0,1.0,1.0,1.0,0.666927,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.352662,1.0,1.0
3,132,1.0,1.0,0.075891,1.0,1.0,1.0,1.0,1.0,0.592466,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.465185,1.0,1.0
4,200,1.0,1.0,0.191656,1.0,1.0,1.0,1.0,1.0,0.934054,...,1.0,1.0,1.0,1.0,1.0,1.0,0.967143,0.409804,1.0,1.0
5,245,1.0,1.0,0.212444,1.0,1.0,1.0,1.0,1.0,0.676555,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.354813,1.0,1.0
6,257,1.0,1.0,0.102304,1.0,1.0,1.0,1.0,1.0,0.545356,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.484416,1.0,1.0
7,267,1.0,1.0,0.532171,1.0,1.0,1.0,1.0,1.0,0.687499,...,1.0,1.0,1.0,1.0,1.0,1.0,0.396604,0.33636,1.0,1.0
8,284,1.0,1.0,0.088391,1.0,1.0,1.0,1.0,1.0,0.538053,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.455907,1.0,1.0
9,292,1.0,1.0,0.107361,1.0,1.0,1.0,1.0,1.0,0.702659,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.363283,1.0,1.0
