In [1]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score


In [2]:
from sklearn.preprocessing import StandardScaler

#######################
# FEATURE ENGINEERING #
#######################
"""
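Engineer a single feature
Input: pandas Series and one feature engineering function (applied element-wise)
Output: pandas Series named after the function (z-normalized unless normalize=False)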
"""
def engineer_feature(series, func, normalize=True):
    """Apply ``func`` to every element of ``series`` and return it as a feature.

    Args:
        series: pandas Series whose elements are passed to ``func`` one by one.
        func: callable mapping one element to a numeric value; its
            ``__name__`` becomes the name of the returned Series.
        normalize: when True (default) the feature is z-normalized with
            ``z_normalize``.

    Returns:
        pandas Series named after ``func`` and indexed like ``series``.
    """
    feature = series.apply(func)

    if normalize:
        # z_normalize works on a 2-D column, so reshape on the way in and
        # flatten on the way out.
        scaled = z_normalize(feature.values.reshape(-1, 1)).reshape(-1)
        # Re-attach the original index: building the Series without it would
        # silently replace the index with 0..n-1, unlike the normalize=False
        # path which keeps the index of ``series``.
        feature = pd.Series(scaled, index=feature.index)
    feature.name = func.__name__
    return feature

"""
Engineer features
Input: pandas Series and a list of feature engineering functions
Output: pandas DataFrame
"""
def engineer_features(series, funclist, normalize=True):
    """Build one feature column per function in ``funclist``.

    Args:
        series: pandas Series handed to ``engineer_feature`` for every function.
        funclist: iterable of feature functions; each column is named after
            the Series returned for that function.
        normalize: forwarded unchanged to ``engineer_feature``.

    Returns:
        pandas DataFrame with the columns in ``funclist`` order.
    """
    table = pd.DataFrame()
    for fn in funclist:
        column = engineer_feature(series, fn, normalize)
        table[column.name] = column
    return table

"""
Normalizer
Input: NumPy array
Output: NumPy array
"""
# Single module-level scaler; z_normalize refits it from scratch on every call,
# so no statistics are carried over from one call to the next.
scaler = StandardScaler()


def z_normalize(data):
    """Standardize ``data`` with the shared ``scaler`` and return the result.

    The scaler is fitted on ``data`` itself and then used to transform that
    same ``data``.

    Args:
        data: NumPy array accepted by ``StandardScaler``; ``engineer_feature``
            passes a single column of shape (n, 1).

    Returns:
        NumPy array holding the transformed values.
    """
    return scaler.fit_transform(data)
    
"""
Feature functions
"""
def asterix_freq(x):
    """Return the fraction of characters in ``x`` that are exclamation marks.

    Despite the name, the character counted is '!' (not '*'). The name is
    kept because the function's ``__name__`` is used as the feature name.

    Args:
        x: string to inspect.

    Returns:
        ``x.count('!') / len(x)``, or 0.0 for an empty string (which would
        otherwise raise ZeroDivisionError).
    """
    if not x:
        return 0.0
    return x.count('!') / len(x)

def uppercase_freq(x):
    """Return the fraction of characters in ``x`` that are ASCII capitals A-Z.

    Args:
        x: string to inspect.

    Returns:
        Number of ``[A-Z]`` matches divided by ``len(x)``, or 0.0 for an empty
        string (which would otherwise raise ZeroDivisionError).
    """
    if not x:
        return 0.0
    return len(re.findall(r'[A-Z]', x)) / len(x)
    

In [3]:
PATH = '~/data/toxic/data/'

# Read both splits the same way; every missing cell is replaced by a single space.
train, test = (
    pd.read_csv(PATH + file_name).fillna(' ')
    for file_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

for frame in (train, test):
    print(frame.shape)

# Target columns, one binary label per column.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

(159571, 27)
(153164, 21)


In [4]:
INPUT_COLUMN = "comment_text"

# Engineer features
feature_functions = [len, asterix_freq, uppercase_freq]
features = [f.__name__ for f in feature_functions]

# Compute the raw (un-normalized) features for both splits first.
# Calling engineer_features(..., normalize=True) separately on train and test
# would refit the scaler on each split, so the same raw value would map to
# different normalized values in train and test.
F_train_raw = engineer_features(train[INPUT_COLUMN], feature_functions, normalize=False)
F_test_raw = engineer_features(test[INPUT_COLUMN], feature_functions, normalize=False)

# Learn mean/std on the training split only and apply them to both splits.
feature_scaler = StandardScaler().fit(F_train_raw[features].values)
F_train = pd.DataFrame(
    feature_scaler.transform(F_train_raw[features].values),
    columns=features,
    index=F_train_raw.index,
)
F_test = pd.DataFrame(
    feature_scaler.transform(F_test_raw[features].values),
    columns=features,
    index=F_test_raw.index,
)



In [5]:
F_train.shape, F_test.shape

((159571, 3), (153164, 3))

In [6]:
from base_layer_utils import BaseLayerResultsRepo

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
# Repository object from the project-local base_layer_utils module; the
# recorded output of this cell shows it prints "load from file" on construction.
base_layer_results_repo = BaseLayerResultsRepo()

load from file


In [8]:
# Fetch the stored base-layer results as three objects.
# NOTE(review): threshold=0.9793 presumably filters base-layer models by
# score -- confirm against BaseLayerResultsRepo.get_results.
(
    layer1_oof_train_loaded,
    layer1_oof_test_loaded,
    base_layer_est_preds_loaded,
) = base_layer_results_repo.get_results(threshold=0.9793)

In [12]:
# Number of entries stored under the 'toxic' key of the loaded train results.
len(layer1_oof_train_loaded['toxic'])

8

In [13]:
def combine_layer_oof_per_label(layer1_oof_dict, label):
    """Concatenate, column-wise, every array stored under ``label``.

    Args:
        layer1_oof_dict: mapping from label name to a sequence of arrays that
            can be joined along axis 1 (same number of rows).
        label: key to look up in ``layer1_oof_dict``.

    Returns:
        A new array with all stored arrays side by side, or None when the
        sequence for ``label`` is empty.
    """
    data_list = layer1_oof_dict[label]
    if len(data_list) == 0:
        return None
    # One concatenate call instead of growing the result pair by pair, which
    # re-copies the accumulated columns on every iteration.
    return np.concatenate(list(data_list), axis=1)

In [14]:
# Build the second-level train/test matrices for the first label only.
for label in [label_cols[0]]:
    # Guard against running this cell on a differently shaped training frame.
    assert train.shape == (159571, 27)
    x_train = combine_layer_oof_per_label(layer1_oof_train_loaded, label)
    x_test = combine_layer_oof_per_label(layer1_oof_test_loaded, label)

    # Prepend the engineered text features to the base-layer columns.
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same NumPy array and works on old and new pandas alike.
    x_train = np.hstack([F_train[features].values, x_train])
    x_test = np.hstack([F_test[features].values, x_test])

In [19]:
# Shape sanity check (.values replaces as_matrix(), which was removed in pandas 1.0).
x_train.shape, F_train[features].shape, F_train[features].values.shape

((159571, 8), (159571, 3), (159571, 3))

In [20]:
# Shape of engineered features stacked in front of x_train
# (.values replaces as_matrix(), which was removed in pandas 1.0).
# NOTE(review): if x_train already holds the stacked matrix produced by the
# loop cell, this stacks the engineered features a second time.
np.hstack([F_train[features].values, x_train]).shape

(159571, 11)

In [None]:
  

# Second-level (stacking) classifier, reused for every label.
# colsample_bytree and feature_fraction are aliases of the same LightGBM
# parameter and were both set to 0.45, so only one of them is kept.
stacker = lgb.LGBMClassifier(
    max_depth=3,
    metric="auc",
    n_estimators=125,
    num_leaves=10,
    boosting_type="gbdt",
    learning_rate=0.1,
    colsample_bytree=0.45,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_lambda=0.2,
)


def _stack_matrix(engineered, oof_dict, label):
    """Engineered text features followed by the base-layer columns for ``label``."""
    return np.hstack([
        engineered[features].values,
        combine_layer_oof_per_label(oof_dict, label),
    ])


# The original cell used LABELS, X_train, X_test and sub, none of which are
# defined anywhere in this notebook; they are replaced by label_cols and by
# matrices built per label from the loaded base-layer results.
# NOTE(review): assumes the loaded result dicts have an entry for every label
# in label_cols, and that the submission should carry the test 'id' column
# when one exists -- confirm both.
sub = test[['id']].copy() if 'id' in test.columns else pd.DataFrame(index=test.index)

# Fit and submit
scores = []
for label in label_cols:
    print(label)
    X_train = _stack_matrix(F_train, layer1_oof_train_loaded, label)
    X_test = _stack_matrix(F_test, layer1_oof_test_loaded, label)
    score = cross_val_score(stacker, X_train, train[label], cv=5, scoring='roc_auc')
    print("AUC:", score)
    scores.append(np.mean(score))
    # Refit on the full training matrix before predicting the test split.
    stacker.fit(X_train, train[label])
    sub[label] = stacker.predict_proba(X_test)[:, 1]
print("CV score:", np.mean(scores))

sub.to_csv("submission.csv", index=False)