In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.metrics import *
from sklearn.model_selection import KFold

from keras.layers import *
from keras import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras import backend as K


tqdm.pandas()
%matplotlib inline

Using TensorFlow backend.


## Loading Datasets

In [2]:
# Read the two JSON files into data frames.
TRAIN_DF = pd.read_json('./train.json')
TEST_DF = pd.read_json('./test.json')

# Target column of the training frame as a numpy array.
y_train = TRAIN_DF['is_turkey'].values

# Turn the per-row embedding lists into arrays with exactly 10 steps per row
# (pad_sequences pads or truncates each sequence to `maxlen`).
X_train = pad_sequences(TRAIN_DF['audio_embedding'].tolist(), maxlen=10)
X_test = pad_sequences(TEST_DF['audio_embedding'].tolist(), maxlen=10)

In [18]:
TRAIN_DF.head()

Unnamed: 0,audio_embedding,end_time_seconds_youtube_clip,is_turkey,start_time_seconds_youtube_clip,vid_id
0,"[[172, 34, 216, 110, 208, 46, 95, 66, 161, 125...",70,0,60,kDCk3hLIVXo
1,"[[169, 20, 165, 102, 205, 62, 110, 103, 211, 1...",40,1,30,DPcGzqHoo7Y
2,"[[148, 8, 138, 60, 237, 48, 121, 108, 145, 177...",240,1,230,7yM63MTHh5k
3,"[[151, 0, 162, 88, 171, 71, 47, 90, 179, 190, ...",520,1,510,luG3RmUAxxM
4,"[[162, 17, 187, 111, 211, 105, 92, 67, 203, 15...",10,0,0,PIm3cjxTpOk


In [19]:
TEST_DF.head()

Unnamed: 0,audio_embedding,end_time_seconds_youtube_clip,start_time_seconds_youtube_clip,vid_id
0,"[[177, 20, 226, 132, 198, 81, 111, 59, 132, 18...",10,0,pyKh38FXD3E
1,"[[169, 21, 204, 161, 195, 72, 60, 39, 152, 184...",40,30,THhP1idrWXA
2,"[[165, 13, 198, 141, 199, 81, 173, 54, 119, 11...",40,30,jsw3T6GY2Nw
3,"[[167, 18, 188, 159, 198, 63, 156, 36, 179, 22...",24,14,nFkXTMHcjMU
4,"[[178, 32, 181, 100, 198, 46, 82, 83, 136, 227...",40,30,Au8g9kAlrLQ


## Our Deep Learning Model Baseline 

In [3]:
# https://www.kaggle.com/qqgeogor/keras-lstm-attention-glove840b-lb-0-043
class Attention(Layer):
    """Attention pooling over the step axis of a 3D input.

    For an input ``x`` of shape ``(batch, step_dim, features_dim)`` the layer
    computes one score per step, ``tanh(x . W + b)``, exponentiates the scores,
    normalises them so they sum to one across the steps, and returns the
    weighted sum of the steps: a tensor of shape ``(batch, features_dim)``.

    Args:
        step_dim: number of steps in the input; expected to equal
            ``input_shape[1]`` because the scores are reshaped to
            ``(-1, step_dim)`` and the bias has ``input_shape[1]`` entries.
        W_regularizer: regularizer applied to the scoring vector ``W``.
        b_regularizer: regularizer applied to the per-step bias ``b``.
        W_constraint: constraint applied to ``W``.
        b_constraint: constraint applied to ``b``.
        bias: whether to add a learned per-step bias to the scores.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Run the base initializer first so that none of the attributes set
        # below can be reset by it (notably `supports_masking`).
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # filled in by build() from the input shape

    def build(self, input_shape):
        """Create the scoring vector ``W`` and, if enabled, the bias ``b``."""
        assert len(input_shape) == 3

        self.features_dim = input_shape[-1]

        # `shape` is passed by keyword so the call does not depend on the
        # positional order of `add_weight`'s parameters.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return no mask: the step axis is reduced away by this layer."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every step: flatten to (batch * steps, features), project onto
        # W, then restore the (batch, steps) layout.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weight of masked steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Epsilon keeps the division finite if every weight in a row is zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output is ``(batch, features_dim)``."""
        return input_shape[0],  self.features_dim

In [4]:
import tensorflow as tf
# define roc_callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
def auc_roc(y_true, y_pred):
    """AUC metric function built on ``tf.metrics.auc``, for use in ``metrics=[...]``.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        A tensor holding the AUC value; evaluating it first runs the metric's
        update op (see the control dependency below).
    """
    # any tensorflow metric
    value, update_op = tf.metrics.auc(y_true, y_pred)

    # find all variables created for this metric
    # (selected by name: second path component must contain 'auc_roc';
    # presumably that component comes from a name scope derived from this
    # function's name -- TODO confirm against the Keras version in use)
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

In [5]:
def get_model(learn_rate=0.01, momentum=0.8, dropout_rate1=0.1, dropout_rate2=0.0, neurons=128):
    """Build and compile the baseline classifier.

    The network takes inputs of shape (10, 128) and stacks batch
    normalisation, a bidirectional GRU that returns the full sequence, the
    Attention layer over the 10 steps, and a single sigmoid output unit.

    Args:
        learn_rate: learning rate passed to Adam.
        momentum: momentum of the BatchNormalization layer.
        dropout_rate1: input dropout of the GRU.
        dropout_rate2: recurrent dropout of the GRU.
        neurons: number of GRU units per direction.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, `auc_roc`
        metric).
    """
    network = Sequential([
        BatchNormalization(momentum=momentum, input_shape=(10, 128)),
        Bidirectional(
            GRU(neurons,
                dropout=dropout_rate1,
                recurrent_dropout=dropout_rate2,
                activation='relu',
                return_sequences=True)
        ),
        Attention(10),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=learn_rate),
        metrics=[auc_roc],
    )
    return network

In [8]:
def train_model(X_train, y_train, X_test, n_splits=5, verbose=2):
    """Train one model per K-fold split and average their test predictions.

    For every fold a fresh model is built with `get_model`, fitted on the
    training part and scored (ROC AUC) on the validation part. Validation rows
    whose thresholded prediction disagrees with the label are collected.

    Args:
        X_train: array of training inputs, indexable by integer index arrays.
        y_train: array of training labels aligned with `X_train`.
        X_test: array of test inputs.
        n_splits: number of cross-validation folds.
        verbose: verbosity passed to `model.fit`.

    Returns:
        A tuple `(preds, historys, missfits)`:
        - preds: test predictions averaged over the folds.
        - historys: the History object returned by `fit` for each fold.
        - missfits: rows of the global `TRAIN_DF` (as value arrays) that were
          misclassified while in a validation fold. This assumes `X_train`
          rows are in the same order as `TRAIN_DF` rows.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=424242)
    threshold = 0.5  # probability cut-off used only to collect misclassified rows

    preds = []
    historys = []
    missfits = []  # misclassified items
    aucs = 0.0

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), start=1):
        x_train_f = X_train[train_idx]
        y_train_f = y_train[train_idx]
        x_val_f = X_train[val_idx]
        y_val_f = y_train[val_idx]

        model = get_model(neurons=128, momentum=0.6, learn_rate=0.001, dropout_rate2=0.1, dropout_rate1=0.1)
        history = model.fit(x_train_f, y_train_f,
            batch_size=128,
            epochs=20,
            verbose=verbose,
            validation_data=(x_val_f, y_val_f),
            callbacks=[])

        preds_val = model.predict(x_val_f, batch_size=512)

        preds.append(model.predict(X_test))
        historys.append(history)

        score = roc_auc_score(y_val_f, preds_val)
        aucs += score

        # Compare each validation prediction with its own label, and map the
        # position inside the fold back to the original row through val_idx
        # (the validation rows are the ones that were predicted).
        predicted_labels = (preds_val[:, 0] > threshold).astype(int)
        wrong_rows = val_idx[predicted_labels != y_val_f]
        missfits.extend(TRAIN_DF.values[wrong_rows])

        print('Fold {}, AUC = {}'.format(fold, score))

    # Stack the per-fold predictions, drop the trailing unit axis, and average
    # across folds.
    preds = np.asarray(preds)[..., 0]
    preds = np.mean(preds, axis=0)
    print("Cross Validation AUC = {}".format(aucs/n_splits))

    return (preds, historys, missfits)

    

    
    
    
    

    
    

In [11]:
preds, historys, missfits = train_model(X_train, y_train, X_test, n_splits=5, verbose=0)

Fold 1, AUC = 0.9855699855699855
Fold 2, AUC = 0.9850862312694374
Fold 3, AUC = 0.9835907335907337
Fold 4, AUC = 0.9936837519965152
Fold 5, AUC = 0.9913315850815851
Cross Validation AUC = 0.9878524575016513


In [13]:
?pd.DataFrame

## Getting more data from external sources

In [15]:
# Create the download folders from Python rather than the shell so the cell
# also works where `mkdir -p` is unavailable; existing folders are left alone.
for folder in ('html/train/0', 'html/train/1', 'html/test'):
    os.makedirs(folder, exist_ok=True)

In [14]:
BASE_URL = "https://www.youtube.com/watch?v="

In [30]:
# Video ids of the training rows, split by the value of `is_turkey`, and the
# video ids of all test rows.
vid_ids_train_0 = TRAIN_DF.loc[TRAIN_DF['is_turkey'] == 0, 'vid_id'].values
vid_ids_train_1 = TRAIN_DF.loc[TRAIN_DF['is_turkey'] == 1, 'vid_id'].values
vid_ids_test = TEST_DF.loc[:, 'vid_id'].values

In [31]:
print(len(vid_ids_train_0))
print(len(vid_ids_train_1))
print(len(vid_ids_test))

704
491
1196


In [36]:
prefix = "./html/train/0/"
for vid_id in tqdm(vid_ids_train_0):
    url = BASE_URL + vid_id
    output_file = prefix + vid_id + ".html"
    !wget '{url}' -o '{output_file}'

100%|██████████| 704/704 [06:31<00:00,  1.69it/s]


In [None]:
prefix = "./html/train/1/"
for vid_id in tqdm(vid_ids_train_1):
    url = BASE_URL + vid_id
    output_file = prefix + vid_id + ".html"
    !wget '{url}' -o '{output_file}'

 64%|██████▍   | 316/491 [03:00<01:33,  1.86it/s]

In [None]:
prefix = "./html/test/"
for vid_id in tqdm(vid_ids_test):
    url = BASE_URL + vid_id
    output_file = prefix + vid_id + ".html"
    !wget '{url}' -o '{output_file}'

In [None]:
'''        
        for idx, p in enumerate(preds_val):
            unicorn = unicornify(train.loc[val_idx[idx]])
            if unicorn:
                preds_val[idx][0] = unicorn
 '''       
    
    
'''    
    voyanteries = model.predict(x_test)
    preds.append(voyanteries)
    for idx, p in enumerate(voyanteries):
        unicorn = unicornify(test.loc[idx])
        print(unicorn)
        if unicorn:
            voyanteries[idx][0] = unicorn
'''    