In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, Flatten
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Make sure the NLTK stop-word corpus is present locally before reading it.
nltk.download('stopwords')

# English stop words, kept as a set for fast membership checks.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Report how many GPUs TensorFlow can see in this session.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


[nltk_data] Downloading package stopwords to /home/jc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
from sklearn.model_selection import train_test_split

# Load the prepared dataset from the parquet file next to the notebook.
df = pd.read_parquet('./frame.parquet.gzip')

# Feature columns and the target column (category_id) as separate frames.
xdf = df[['title', 'tags', 'publish_time', 'anti_participation', 'video_error_or_removed']]
ydf = df[['category_id']]

# Fix the shuffle seed so the train/test partition (and every number derived
# from it) is reproducible across "Restart Kernel -> Run All".
Xtr, Xts, ytr, yts = train_test_split(
    xdf, ydf, shuffle=True, test_size=0.33, random_state=42
)

In [34]:
def convert_to_sequences(x, tokenizer=None, maxlen=None, fit=True):
    """Turn a list of texts into a post-padded matrix of integer token ids.

    Called with only ``x`` this behaves as before: a new ``Tokenizer`` is
    fit on ``x`` and every row is padded to the longest sequence in ``x``.
    The optional arguments allow a second split (e.g. held-out data) to be
    encoded with the vocabulary and length learned on the training split.

    Args:
        x: list of strings to encode.
        tokenizer: an existing ``Tokenizer`` to use. When ``None`` a new one
            is created and always fit on ``x``.
        maxlen: width to pad to. When ``None`` the longest sequence in ``x``
            is used (0 if ``x`` is empty). ``pad_sequences`` truncates
            sequences longer than ``maxlen``.
        fit: whether to fit ``tokenizer`` on ``x`` before encoding. Pass
            ``False`` together with an already-fitted tokenizer to encode
            data without altering its vocabulary.

    Returns:
        Tuple ``(padded, maxlen, vocab_size)`` where ``padded`` is the padded
        id matrix, ``maxlen`` the padding width that was used and
        ``vocab_size`` is ``len(tokenizer.word_index) + 1`` (index 0 is the
        padding value).
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
        fit = True  # a brand-new tokenizer has no vocabulary yet
    if fit:
        tokenizer.fit_on_texts(x)

    _sequences = tokenizer.texts_to_sequences(x)
    if maxlen is None:
        # default=0 keeps an empty input from raising ValueError in max().
        maxlen = max((len(_seq) for _seq in _sequences), default=0)
    vocab_size = len(tokenizer.word_index) + 1
    _xtr = pad_sequences(_sequences, maxlen, padding='post')

    return _xtr, maxlen, vocab_size

def create_input_channel(maxlen, vocab_size, kernel_size=4, filters=32,
                         embedding_dim=100, activation='relu'):
    """Build one Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten branch.

    Args:
        maxlen: length of the padded integer sequences fed to this branch.
        vocab_size: number of rows of the embedding table.
        kernel_size: width of the 1-D convolution window. Giving parallel
            channels different values lets each look at a different n-gram
            size.
        filters: number of convolution filters.
        embedding_dim: size of each embedding vector.
        activation: activation of the convolution. Defaults to ``'relu'``;
            the previous hard-coded ``'softmax'`` normalised each position
            across the filter axis, which squashes the feature maps and is
            not a suitable hidden-layer non-linearity.

    Returns:
        Tuple ``(input_, flat)``: the branch's ``Input`` tensor and its
        flattened output tensor, ready to be concatenated with other branches.
    """
    input_ = Input(shape=(maxlen,))
    embedding = Embedding(vocab_size, embedding_dim)(input_)
    conv = Conv1D(filters=filters, kernel_size=kernel_size, activation=activation)(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    flat = Flatten()(pool)

    return input_, flat

# Encode the two text columns of the training split as padded id sequences.
Xtr1, maxlen1, vocab_size1 = convert_to_sequences(Xtr['title'].to_list())
Xtr2, maxlen2, vocab_size2 = convert_to_sequences(Xtr['tags'].to_list())

# Three parallel CNN branches over the titles ...
(input11, flat11), (input12, flat12), (input13, flat13) = (
    create_input_channel(maxlen1, vocab_size1) for _ in range(3)
)

# ... and three over the tags.
(input21, flat21), (input22, flat22), (input23, flat23) = (
    create_input_channel(maxlen2, vocab_size2) for _ in range(3)
)

merged = Concatenate()([
    flat11, flat12, flat13,
    flat21, flat22, flat23,
])

# One output unit per possible class id, so the raw category_id values can be
# used directly as sparse labels.
# NOTE(review): assumes category_id holds non-negative integers -- confirm.
num_classes = int(ydf['category_id'].max()) + 1

# Hidden layer uses relu; softmax belongs only on the final class distribution.
dense = Dense(10, activation='relu')(merged)
# A single-unit softmax always outputs exactly 1.0, so the head must have
# num_classes units, paired with a categorical (not binary) cross-entropy.
outputs = Dense(num_classes, activation='softmax')(dense)
model = Model(inputs=[input11, input12, input13, input21, input22, input23], outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [35]:
# The model has six inputs: the encoded titles feed the first three branches
# and the encoded tags feed the last three.
train_inputs = [Xtr1] * 3 + [Xtr2] * 3
train_labels = ytr.to_numpy()
model.fit(train_inputs, train_labels, epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f08c2da3250>

In [36]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='model1.h5')