In [2]:
import itertools

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Fixed seed so the shuffled train/test partition is the same after every
# kernel restart; without it each run trains on a different subset.
SPLIT_SEED = 42

df = pd.read_parquet('./frame.parquet.gzip')

# Feature columns (text + numeric/boolean flags) and the single target column.
xdf = df[['title', 'tags', 'description', 'publish_hour', 'comments_disabled', 'ratings_disabled', 'video_error_or_removed', 'channel_title']]
ydf = df[['category_id']]

# Hold out 33% of the rows for testing.
Xtr, Xts, ytr, yts = train_test_split(xdf, ydf, shuffle=True, test_size=0.33, random_state=SPLIT_SEED)

Num GPUs Available:  0


In [4]:
def convert_to_sequences(x):
    """Tokenize a collection of texts and pad the id sequences to one length.

    A fresh Keras ``Tokenizer`` is fitted on ``x`` itself, so the returned ids
    are only meaningful relative to this particular collection of texts.

    Args:
        x: iterable of texts. ``None`` and float NaN entries (missing values)
            are treated as empty strings instead of crashing the tokenizer.

    Returns:
        A tuple ``(padded, maxlen, vocab_size)``:
        ``padded`` is the result of ``pad_sequences(..., padding='post')``,
        ``maxlen`` is the length of the longest sequence (0 for empty input),
        ``vocab_size`` is ``len(tokenizer.word_index) + 1``.
    """
    # Missing values would otherwise fail inside the tokenizer when it tries
    # to lower-case them; other entries are passed through untouched.
    texts = ['' if t is None or (isinstance(t, float) and t != t) else t for t in x]

    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    # default=0 keeps an empty input from raising ValueError in max().
    maxlen = max((len(seq) for seq in sequences), default=0)
    vocab_size = len(tokenizer.word_index) + 1
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen, padding='post')

    return padded, maxlen, vocab_size

def create_text_channel(maxlen, vocab_size, embedding_dim=1028, filters=32, kernel_size=4,
                        activation='softmax', dropout_rate=0.05, pool_size=2):
    """Build one text branch: Input -> Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten.

    The keyword parameters default to the values that used to be hard-coded,
    so existing two-argument calls build exactly the same layers.

    Args:
        maxlen: length of the padded token sequences fed to this branch.
        vocab_size: first argument of the ``Embedding`` layer.
        embedding_dim: second argument of the ``Embedding`` layer.
        filters: number of ``Conv1D`` filters.
        kernel_size: ``Conv1D`` kernel size.
        activation: ``Conv1D`` activation.
        dropout_rate: rate passed to ``Dropout``.
        pool_size: ``MaxPooling1D`` pool size.

    Returns:
        ``(input_, flat)``: the branch's ``Input`` tensor and its flattened output.
    """
    input_ = tf.keras.layers.Input(shape=(maxlen,))
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)(input_)
    conv = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation=activation)(embedding)
    drop = tf.keras.layers.Dropout(dropout_rate)(conv)
    pool = tf.keras.layers.MaxPooling1D(pool_size=pool_size)(drop)
    flat = tf.keras.layers.Flatten()(pool)

    return input_, flat

def create_numerical_channel(shape=(3,), activation='softmax'):
    """Build one numeric branch: Input -> Dense(8) -> Dense(6) -> Dropout -> Flatten.

    Args:
        shape: per-sample shape of the numeric input.
        activation: activation used by both dense layers.

    Returns:
        ``(input_, flat)``: the branch's ``Input`` tensor and its flattened
        output, mirroring what ``create_text_channel`` returns.
    """
    input_ = tf.keras.layers.Input(shape=shape)
    # The two dense layers are stacked. Feeding the second one from ``input_``
    # would leave the 8-unit layer disconnected from the output.
    hidden = tf.keras.layers.Dense(8, activation=activation)(input_)
    hidden = tf.keras.layers.Dense(6, activation=activation)(hidden)
    drop = tf.keras.layers.Dropout(0.05)(hidden)
    # Return the flattened tensor so the branch output is rank-2 for any
    # ``shape``, which is what the downstream Concatenate expects.
    flat = tf.keras.layers.Flatten()(drop)

    return input_, flat


def create_model(maxlen_vocabsz, tchannels=3, nchannels=3, activation='softmax', loss='binary_crossentropy', optimizer='sgd', metric='categorical_accuracy', output_units=1):
    """Assemble and compile the multi-input model.

    For every ``(maxlen, vocab_size)`` pair, ``tchannels`` text branches are
    created, followed by ``nchannels`` numeric branches. The branch outputs are
    concatenated and passed through ``Dense(10)`` and a final output layer.
    Inputs are ordered text branches first (grouped by pair), then numeric
    branches; the arrays given to ``fit`` must follow the same order.

    Args:
        maxlen_vocabsz: iterable of ``(maxlen, vocab_size)`` pairs, one per text feature.
        tchannels: number of text branches built per pair.
        nchannels: number of numeric branches (each with input shape ``(3,)``).
        activation: activation of the hidden dense layer and, unless the
            single-unit case below applies, of the output layer.
        loss: loss passed to ``compile``.
        optimizer: optimizer passed to ``compile``.
        metric: single metric passed to ``compile``.
        output_units: width of the output layer. Defaults to 1 as before; pass
            the number of classes (with a matching loss) for multi-class use.

    Returns:
        The compiled ``tf.keras.Model``.

    Raises:
        ValueError: if the arguments produce no branch at all.
    """
    inputs = []
    branches = []

    for maxlen, vocab_size in maxlen_vocabsz:
        for _ in range(tchannels):
            input_, flat = create_text_channel(maxlen, vocab_size)
            inputs.append(input_)
            branches.append(flat)

    for _ in range(nchannels):
        input_, flat = create_numerical_channel(shape=(3,))
        inputs.append(input_)
        branches.append(flat)

    if not branches:
        raise ValueError('create_model needs at least one text or numerical channel')

    # A lone branch is used as-is; Concatenate is meant for lists of several
    # tensors (some Keras releases reject a single-element list outright).
    if len(branches) == 1:
        merged = branches[0]
    else:
        merged = tf.keras.layers.Concatenate()(branches)
    dense = tf.keras.layers.Dense(10, activation=activation)(merged)

    # Softmax normalises across units, so over a single unit it is always 1.0
    # and the model could never learn. Use sigmoid for the one-unit output.
    if output_units == 1 and activation == 'softmax':
        output_activation = 'sigmoid'
    else:
        output_activation = activation
    outputs = tf.keras.layers.Dense(output_units, activation=output_activation)(dense)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    return model

In [5]:
# Assemble the model inputs: text arrays first (one per text feature and
# channel), then the numeric array repeated once per numeric channel.
maxlen_vocabsz = []
Xtrain = []

tchannels = 1
for feat in ['title', 'tags', 'description', 'channel_title']:
    _xtr, _maxlen, _vocab_size = convert_to_sequences(Xtr[feat].to_list())
    maxlen_vocabsz.append((_maxlen, _vocab_size))
    # asarray avoids copying when the padded sequences are already an ndarray.
    _xtr = np.asarray(_xtr)
    for _ in range(tchannels):
        Xtrain.append(_xtr)

nfeats = ['publish_hour', 'comments_disabled', 'ratings_disabled']
nchannels = 3
# Convert once and reuse the same array for every numeric channel.
# NOTE(review): assumes these columns are integer/boolean - confirm against the
# parquet schema. A plain to_numpy() on mixed int/bool columns yields an
# object-dtype array, so an explicit float32 cast is requested here.
_xnum = Xtr[nfeats].to_numpy(dtype=np.float32)
for _ in range(nchannels):
    Xtrain.append(_xnum)

In [None]:
# Build the network with the channel counts chosen when the inputs were
# assembled, then train it for three epochs in mini-batches of 16.
model = create_model(
    maxlen_vocabsz,
    tchannels=tchannels,
    nchannels=nchannels,
)
model.fit(x=Xtrain, y=ytr.to_numpy(), batch_size=16, epochs=3)

In [None]:
# Write the trained model to 'model2.h5' (path relative to the working directory).
model.save(filepath='model2.h5')

In [None]:
# Train and save one model per loss / metric / optimizer combination
# (4 * 3 * 4 = 48 runs), iterating in the same order as three nested loops.
sweep_losses = ['mse', 'binary_crossentropy', 'categorical_crossentropy', 'sparse_categorical_crossentropy']
sweep_metrics = ['categorical_accuracy', 'accuracy', 'binary_accuracy']
sweep_optimizers = ['adam', 'sgd', 'adadelta', 'adagrad']

# The label array does not change between runs, so convert it once.
sweep_labels = ytr.to_numpy()

for loss, metric, optimizer in itertools.product(sweep_losses, sweep_metrics, sweep_optimizers):
    # Release the global Keras state left by the previous run so memory does
    # not keep growing as model after model is built in this loop.
    tf.keras.backend.clear_session()
    m = create_model(maxlen_vocabsz, tchannels=tchannels, nchannels=nchannels, loss=loss, metric=metric, optimizer=optimizer)
    m.fit(Xtrain, sweep_labels, epochs=3, batch_size=16)
    m.save('model-{}-{}-{}.h5'.format(loss, metric, optimizer))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3