In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn as sk
import pandas as pd
import numpy as np
import nltk.corpus
import nltk
%matplotlib inline

feature = 'tags'
language = 'US', 'english'
#language = 'KR', 'korean'
#language = 'MX', 'spanish'
#language = 'CA', 'english'
#language = 'DE', 'german'
#lanugage = 'FR', 'french'

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

df = pd.read_csv('./archive/{}videos.csv'.format(language[0]))
categories = json.load(open('./archive/{}_category_id.json'.format(language[0]), 'rb'))

# Drop unused columns
df = df.drop(['thumbnail_link', 'video_id'], axis=1)
df = df.dropna()

# Enrich data
df['publish_hour'] = pd.to_datetime(df['publish_time']).dt.hour
df['comments_disabled'] = df['comments_disabled'].apply(lambda row: 0 if row == False else 1)
df['ratings_disabled'] = df['ratings_disabled'].apply(lambda row: 0 if row == False else 1)
df['video_error_or_removed'] = df['video_error_or_removed'].apply(lambda row: 0 if row == False else 1)
df['like_dislike'] = df[['likes', 'dislikes']].apply(lambda row: row['likes'] / np.sum([row['likes'], row['dislikes']]), axis=1)
df['tags'] = df['tags'].apply(lambda row: ' '.join( i.strip('""') for i in row.split('|') ))
df['anti_participation'] = df[['comments_disabled', 'ratings_disabled']].apply(lambda row: row.sum(), axis=1)

#category_name_mapping = {}
#for item in categories['items']:
#    category_name_mapping[int(item['id'])] = item['snippet']['title']
#df['category_name'] = df['category_id'].apply(lambda row: category_name_mapping[int(row)])

category_le = sk.preprocessing.LabelEncoder()
category_le.fit(df['category_id'])
df['category_id'] = category_le.transform(df['category_id'])

# Take out stop words
try:
    stop_words = set(nltk.corpus.stopwords.words(language[1]))
except Exception:
    nltk.download('stopwords')
    stop_words = set(nltk.corpus.stopwords.words(language[1]))
df['tags'] = df['tags'].apply(lambda row: ' '.join(word for word in row.split() if word not in stop_words))
df['description'] = df['description'].apply(lambda row: ' '.join(word for word in row.split() if word not in stop_words))


df.head()

Num GPUs Available:  1


  df['like_dislike'] = df[['likes', 'dislikes']].apply(lambda row: row['likes'] / np.sum([row['likes'], row['dislikes']]), axis=1)


Unnamed: 0,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,description,publish_hour,like_dislike,anti_participation
0,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,7,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,0,0,0,SHANTELL'S CHANNEL - https://www.youtube.com/s...,17,0.95097,0
1,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,9,2017-11-13T07:30:00.000Z,last week tonight trump presidency last week t...,2418783,97185,6146,12703,0,0,0,"One year presidential election, John Oliver di...",7,0.940521,0
2,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,8,2017-11-12T19:05:24.000Z,racist superman rudy mancuso king bach racist ...,3191434,146033,5339,8181,0,0,0,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,19,0.964729,0
3,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,9,2017-11-13T11:00:04.000Z,rhett link gmm good mythical morning rhett lin...,343168,10172,666,2146,0,0,0,Today find Link Nickelback amateur secret Nick...,11,0.93855,0
4,17.14.11,I Dare You: GOING BALD!?,nigahiga,9,2017-11-12T18:01:41.000Z,ryan higa higatv nigahiga dare idy rhpc dares ...,2095731,132235,1989,17518,0,0,0,"I know since show, we're back might best episo...",18,0.985181,0


In [2]:
def convert_to_sequences(x):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False)
    tokenizer.fit_on_texts(x)
    _sequences = tokenizer.texts_to_sequences(x)
    maxlen = max(len(_seq) for _seq in _sequences)
    vocab_size = len(tokenizer.word_index) + 1
    _xtr = tf.keras.preprocessing.sequence.pad_sequences(_sequences, maxlen, padding='post')
    
    return np.array(_xtr), maxlen, vocab_size

if feature == 'tags':
    X, maxlen, vocab_size = convert_to_sequences(df['tags'].to_numpy())
elif feature == 'description':
    X, maxlen, vocab_size = convert_to_sequences(df['description'].to_numpy())
elif feature == 'title': 
    X, maxlen, vocab_size = convert_to_sequences(df['title'].to_numpy())

Y = df['category_id'].to_numpy().reshape((-1,1))

X.shape, Y.shape

((40379, 91), (40379, 1))

In [3]:
activation = 'softmax'

model = tf.keras.Sequential()

model.add(tf.keras.layers.Input(shape=(maxlen,)))
model.add(tf.keras.layers.Embedding(vocab_size, 75))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=8, activation=activation))
model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=6, activation=activation))
model.add(tf.keras.layers.Conv1D(filters=16, kernel_size=4, activation=activation))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(8))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(len(category_le.classes_), activation='softmax'))

model.compile(
    loss='categorical_crossentropy', 
    optimizer='adam', 
    metrics=['categorical_accuracy']
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 91, 75)            2303025   
_________________________________________________________________
dropout (Dropout)            (None, 91, 75)            0         
_________________________________________________________________
conv1d (Conv1D)              (None, 84, 64)            38464     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 79, 32)            12320     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 76, 16)            2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 76, 16)            0         
_________________________________________________________________
dense (Dense)                (None, 76, 8)             1

In [None]:
loss_history, accuracy_history = [[],[]], [[],[]]
final_accuracy = -1.

# Kfold 
skf = sk.model_selection.StratifiedKFold(n_splits=10, shuffle=True)
for train_index, val_index in skf.split(X, Y):
    xtrain, xval = X[train_index], X[val_index]
    ytrain, yval = tf.keras.utils.to_categorical(Y[train_index]), tf.keras.utils.to_categorical(Y[val_index])
    
    history = model.fit(xtrain, ytrain, validation_data=(xval, yval), epochs=1, batch_size=64)
    
    loss_history[0].extend(history.history['val_loss'])
    loss_history[1].extend(history.history['loss'])
    accuracy_history[0].extend(history.history['val_categorical_accuracy'])
    accuracy_history[1].extend(history.history['categorical_accuracy'])
    
    final_accuracy = history.history['val_categorical_accuracy'][0]
    



In [None]:
history.history

In [None]:
# Visualize history
# Plot history: Loss
fig, ax = plt.subplots()
ax.plot(loss_history[0], label='validation')
ax.plot(loss_history[1], label='train')
ax.set(title='Validation loss history', xlabel='No. split', ylabel='Loss value')
ax.legend()


# Plot history: Accuracy
fig, ax = plt.subplots()
ax.plot(accuracy_history[0], label='validation')
ax.plot(accuracy_history[1], label='train')
ax.set(title='Validation accuracy history', xlabel='No. split', ylabel='Accuracy value')
ax.legend()

plt.show()

In [None]:
model.save('model-{lang}-{feature}-{accuracy}.h5'.format(lang=language[1], feature=feature, accuracy=final_accuracy))