# Import Data

In [1]:
import pandas as pd

# Stacking table read from the 99_Ensemble results directory.
stack_df = pd.read_csv('../../99_Ensemble/99_v1_02/result/stack_99_v1_02_27.csv')

# The competition file only contributes its discourse_id column (and therefore
# the row order); every other column comes from stack_df through a left join.
train_ids = pd.read_csv('../../input/feedback-prize-effectiveness/train.csv')[['discourse_id']]
train_df = pd.merge(train_ids, stack_df, how='left', on='discourse_id')

print(train_df.shape)
train_df.head()

(36765, 28)


Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,Ineffective_0,Adequate_0,Effective_0,label,loss,...,Effective_3,Ineffective_4,Adequate_4,Effective_4,Ineffective_5,Adequate_5,Effective_5,Ineffective,Adequate,Effective
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,0.002175,0.15298,0.011512,1,0.085691,...,0.003841,0.013028,0.149802,0.003837,0.013616,0.150725,0.002326,0.044812,0.919796,0.035392
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,0.009333,0.152729,0.004605,1,0.087332,...,0.006511,0.008662,0.154979,0.003025,0.026612,0.136538,0.003517,0.091948,0.881701,0.026351
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,0.023451,0.141358,0.001857,1,0.164698,...,0.005504,0.029492,0.135902,0.001273,0.0486,0.115757,0.00231,0.172267,0.812169,0.015564
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,0.055733,0.109495,0.001439,1,0.420119,...,0.002196,0.032678,0.132563,0.001426,0.05664,0.108651,0.001375,0.236727,0.752566,0.010708
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,0.04129,0.124674,0.000703,1,0.290297,...,0.001962,0.054301,0.11131,0.001056,0.070796,0.094933,0.000937,0.317408,0.675184,0.007408


# Features

In [2]:
import pickle
import gc

# NOTE: despite the .txt extension this file is read as a pickle. Unpickling can
# execute arbitrary code, so only load it from a source you trust.
with open("../../input/nlp-word-embeddings/Glove_Embeddings.txt", 'rb') as handle:
    # Deserialize straight from the file handle. Reading the whole file into a
    # bytes object first kept the serialized payload alive (it was never
    # deleted) right next to the decoded embeddings.
    processed_data = pickle.load(handle)

embeddings_index = processed_data['glove_embeddings_index']
print('Word vectors found: {}'.format(len(embeddings_index)))

# Drop the wrapper object; embeddings_index keeps the lookup table itself alive.
del processed_data
gc.collect()

Word vectors found: 2196017


0

In [3]:
import nltk
import numpy as np

def sent2vec(text):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is tokenised with ``nltk.word_tokenize`` and only purely
    alphabetic tokens are kept. Each token is looked up in the module-level
    ``embeddings_index``; tokens that are missing fall back to its ``'unk'``
    entry.

    Args:
        text: String to embed.

    Returns:
        A 1-D array: ``np.zeros(300)`` when the text has no alphabetic token,
        an all-zero vector when the summed word vectors have zero norm, and
        otherwise the summed vector divided by its Euclidean norm.

    Raises:
        KeyError: If a token is missing and ``embeddings_index`` has no
            ``'unk'`` entry.
    """
    tokens = [tok for tok in nltk.word_tokenize(text) if tok.isalpha()]
    if not tokens:
        # Nothing to embed; same result the original produced for this case.
        return np.zeros(300)

    vectors = []
    for tok in tokens:
        try:
            vectors.append(embeddings_index[tok])
        except KeyError:
            # Only a missing key triggers the fallback; any other error is a
            # real problem and must not be hidden by a bare except.
            vectors.append(embeddings_index['unk'])

    total = np.asarray(vectors).sum(axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Word vectors cancelled out exactly: dividing would yield NaNs that
        # would then leak into the feature matrix.
        return np.zeros_like(total, dtype=float)

    return total / norm

In [4]:
from tqdm import tqdm

# Embed every discourse text; tqdm only adds the progress bar.
glove_vec = list(map(sent2vec, tqdm(train_df["discourse_text"].values)))

# One column per embedding dimension, rows keyed by discourse_id.
col_list = ['discourse_glove_' + str(dim) for dim in range(300)]
glove_vec_df = pd.DataFrame(
    data=np.array(glove_vec),
    index=train_df['discourse_id'].values,
    columns=col_list,
)
print(f"glove_vec_df: {glove_vec_df.shape}")

100%|██████████| 36765/36765 [00:09<00:00, 3936.69it/s]


glove_vec_df: (36765, 300)


In [5]:
# Re-running this cell used to merge the GloVe columns a second time and leave
# suffixed duplicates (_x / _y) behind. Dropping previously merged copies first
# makes the cell idempotent.
n_rows_before = len(train_df)
train_df = (
    train_df
    .drop(columns=list(glove_vec_df.columns), errors='ignore')
    .merge(
        # Name the index so the key column matches and no helper 'index'
        # column has to be deleted afterwards.
        glove_vec_df.rename_axis('discourse_id').reset_index(),
        on='discourse_id',
        how='left',
    )
)
# A left join keeps the row count only when discourse_id is unique on the right
# side; fail loudly instead of silently duplicating training rows.
assert len(train_df) == n_rows_before, "discourse_id is not unique in glove_vec_df"
gc.collect()

46

In [6]:
# Display the column index to check that the discourse_glove_* columns were appended.
train_df.columns

Index(['discourse_id', 'essay_id', 'discourse_text', 'discourse_type',
       'discourse_effectiveness', 'Ineffective_0', 'Adequate_0', 'Effective_0',
       'label', 'loss',
       ...
       'discourse_glove_290', 'discourse_glove_291', 'discourse_glove_292',
       'discourse_glove_293', 'discourse_glove_294', 'discourse_glove_295',
       'discourse_glove_296', 'discourse_glove_297', 'discourse_glove_298',
       'discourse_glove_299'],
      dtype='object', length=328)

# Data Split

In [7]:
from os.path import join as opj

class args:
    """Static settings for the fold split, read as plain class attributes."""

    # Number of cross-validation folds iterated over during stacking.
    num_fold = 5
    # Directory holding the saved trn_ids_list / val_ids_list fold files.
    fold_path = '../../00_EDA/00_v2_07/result/'
    
import joblib

# Fold definitions are loaded from the directory configured on args.
fold_path = args.fold_path
print('load folds...')
trn_ids_list, val_ids_list = (
    joblib.load(opj(fold_path, file_name))
    for file_name in ('trn_ids_list.joblib', 'val_ids_list.joblib')
)

load folds...


# Stacking

In [8]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the discourse types first, then store the integer codes it
# assigns as a new column. `le` is kept so it can be saved later.
le = LabelEncoder().fit(train_df['discourse_type'])
train_df['discourse_type_label'] = le.transform(train_df['discourse_type'])

In [9]:
# Per-model probability columns, grouped class by class. Requiring an underscore
# keeps names such as 'Ineffective_0' and skips the bare class-name columns.
cols = [
    name
    for prefix in ('Ineffective', 'Adequate', 'Effective')
    for name in train_df.columns
    if name.startswith(prefix) and '_' in name
]

# Each stacked model is expected to contribute one column per class.
num_models = len(cols) // 3
num_models

6

In [10]:
import lightgbm as lgb

# Hyper-parameters forwarded to lgb.LGBMClassifier for every fold.
# "alt" marks the alternative value the author had left commented out.
params = {
    "num_leaves": 128,
    "min_data_in_leaf": 100,   # alt: 200
    "objective": "multiclass",
    # "metric": "l2" was left disabled
    "max_depth": 8,            # alt: -1
    "learning_rate": 0.001,    # alt: 0.05
    "boosting": "gbdt",
    "bagging_fraction": 0.85,
    "bagging_freq": 1,
    "feature_fraction": 0.4,   # alt: 0.20
    "bagging_seed": 42,
    "verbosity": -1,
    "nthread": -1,
    "random_state": 69,
}

In [11]:
from sklearn.metrics import log_loss
import pickle
import os
os.makedirs('./result', exist_ok=True)

# Feature list: discourse-type code, GloVe dimensions, then the stacked model
# probabilities. Entries already present in `cols` are filtered out first, so
# re-running this cell no longer prepends the extra features a second time
# (which duplicated columns in the training matrix).
glove_cols = [col for col in train_df.columns if col.startswith('discourse_glove_')]
extra_cols = ['discourse_type_label'] + glove_cols
extra_col_set = set(extra_cols)
cols = extra_cols + [col for col in cols if col not in extra_col_set]

score_list = []
oof_df = []  # one validation frame per fold, with its out-of-fold predictions
for fold in range(args.num_fold):
    # Folds are defined at essay level: rows are selected by essay_id.
    trn_df = train_df[train_df['essay_id'].isin(trn_ids_list[fold])].reset_index(drop=True)
    val_df = train_df[train_df['essay_id'].isin(val_ids_list[fold])].reset_index(drop=True)

    # NOTE(review): the validation fold drives early stopping and is also the
    # fold that is scored below, so the reported CV is likely optimistic.
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords are
    # deprecated in newer LightGBM releases in favour of callbacks - confirm
    # against the installed version before upgrading.
    model = lgb.LGBMClassifier(**params, n_estimators=20000)
    model.fit(trn_df[cols].values,
              trn_df['label'].values,
              eval_set=[(val_df[cols].values, val_df['label'].values)],
              verbose=0,
              early_stopping_rounds=100)

    pred = model.predict_proba(X=val_df[cols].values)
    target = val_df['label'].values
    score = log_loss(target, pred, labels=[0, 1, 2])
    print('fold{} : CV={:.4f}'.format(fold, score))
    score_list.append(score)

    # Probability columns follow the label order 0, 1, 2.
    val_df['oof_ineffective'] = pred[:, 0]
    val_df['oof_adequate'] = pred[:, 1]
    val_df['oof_effective'] = pred[:, 2]
    oof_df.append(val_df)

    # save model
    joblib.dump(model, f'./result/lgb_fold{fold}.joblib')

# Unweighted mean of the per-fold log losses.
CV = sum(score_list) / len(score_list)
print('CV={:.4f}'.format(CV))



fold0 : CV=0.5805






KeyboardInterrupt: 

In [None]:
# Persist the fitted discourse-type encoder in the same ./result directory as
# the fold models.
joblib.dump(le, './result/label_encoder.joblib')

In [None]:
# The training cell leaves `oof_df` as a list of per-fold frames. Once this cell
# has run, the same name holds a DataFrame, and pd.concat() rejects a bare
# DataFrame - wrap it so the cell can be re-run without retraining.
oof_parts = oof_df if isinstance(oof_df, list) else [oof_df]
oof_df = pd.concat(oof_parts).reset_index(drop=True)

# Left-join onto train_df's ids so the rows come back in train_df order; ids
# without an out-of-fold prediction end up with NaN in the other columns.
oof_df = train_df[['discourse_id']].merge(oof_df, on='discourse_id', how='left')
oof_df.to_csv('./result/oof_cat.csv', index=False)
print(oof_df.shape)
oof_df.head()

In [None]:
# Use a dedicated name: reassigning `cols` here replaced the model feature list,
# so re-running the training cell afterwards started from the oof_* names.
# The order is spelled out so it provably matches labels=[0, 1, 2], i.e. the
# order in which the training cell fills these columns from predict_proba.
oof_cols = ['oof_ineffective', 'oof_adequate', 'oof_effective']

# Rows without predictions (e.g. training was interrupted before every fold
# finished) would otherwise surface as an opaque NaN error inside log_loss.
n_missing = int(oof_df[oof_cols].isna().any(axis=1).sum())
if n_missing:
    raise ValueError(f'{n_missing} rows have no out-of-fold prediction; run all folds first')

oof_score = log_loss(oof_df['label'].values, oof_df[oof_cols].values, labels=[0, 1, 2])
print('oof={:.4f}'.format(oof_score))