In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn as sk
import pandas as pd
import numpy as np
import nltk.corpus
import nltk
%matplotlib inline

lower = False
#feature = 'title'
feature = 'tags'
#feature = 'description'
#region, language = 'US', 'english'   # 
#region, language = 'KR', 'korean'    # doesn't work
#region, language = 'MX', 'spanish'   # doesn't work
#region, language = 'CA', 'english'   #
#region, language = 'DE', 'german'    #
region, language = 'FR', 'french'    #

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

def load_region_data(region: str) -> pd.DataFrame:
    df = pd.read_csv('./archive/{}videos.csv'.format(region))

    # Drop unused columns
    df = df.drop(['thumbnail_link', 'video_id'], axis=1)
    df = df.dropna()

    # Enrich data
    df['publish_hour'] = pd.to_datetime(df['publish_time']).dt.hour
    df['comments_disabled'] = df['comments_disabled'].apply(lambda row: 0 if row == False else 1)
    df['ratings_disabled'] = df['ratings_disabled'].apply(lambda row: 0 if row == False else 1)
    df['video_error_or_removed'] = df['video_error_or_removed'].apply(lambda row: 0 if row == False else 1)
    df['like_dislike'] = df[['likes', 'dislikes']].apply(lambda row: row['likes'] / np.sum([row['likes'], row['dislikes']]), axis=1)
    df['tags'] = df['tags'].apply(lambda row: ' '.join( i.strip('""') for i in row.split('|') ))
    df['anti_participation'] = df[['comments_disabled', 'ratings_disabled']].apply(lambda row: row.sum(), axis=1)

    # Transform category_id to label
    category_le = sk.preprocessing.LabelEncoder()
    category_le.fit(df['category_id'])
    df['category_id'] = category_le.transform(df['category_id'])

    # Load stopwords
    try:
        stop_words = set(nltk.corpus.stopwords.words(language))
    except Exception:
        nltk.download('stopwords')
        stop_words = set(nltk.corpus.stopwords.words(language))
    
    # Drop stopwords
    df['tags'] = df['tags'].apply(lambda row: ' '.join(word for word in row.split() if word not in stop_words))
    df['description'] = df['description'].apply(lambda row: ' '.join(word for word in row.split() if word not in stop_words))
    
    return df, len(category_le.classes_)

df, num_classes = load_region_data(region)
df.head()

Num GPUs Available:  1


  df['like_dislike'] = df[['likes', 'dislikes']].apply(lambda row: row['likes'] / np.sum([row['likes'], row['dislikes']]), axis=1)


Unnamed: 0,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,description,publish_hour,like_dislike,anti_participation
0,17.14.11,Malika LePen : Femme de Gauche - Trailer,Le Raptor Dissident,9,2017-11-13T17:32:55.000Z,Raptor Dissident Expliquez cette merde,212702,29282,1108,3817,0,0,0,Dimanche.\n18h30.\nSoyez présents vidéo plus r...,17,0.963541,0
1,17.14.11,"LA PIRE PARTIE ft Le Rire Jaune, Pierre Croce,...",Le Labo,9,2017-11-12T15:00:02.000Z,[none],432721,14053,576,1161,0,0,0,Le jeu société: https://goo.gl/hhG1Ta\n\nGagne...,15,0.960626,0
2,17.14.11,DESSINS ANIMÉS FRANÇAIS VS RUSSES 2 - Daniil...,Daniil le Russe,8,2017-11-13T17:00:38.000Z,cartoon pokémon école ours мультфильм,482153,76203,477,9580,0,0,0,Une nouvelle dose dessins animés français russ...,17,0.993779,0
3,17.14.11,PAPY GRENIER - METAL GEAR SOLID,Joueur Du Grenier,6,2017-11-12T17:00:02.000Z,Papy grenier Metal Gear Solid PS1 Tirage d'ore...,925222,85016,550,4303,0,0,0,"Nouvel ,épisode Papy Grenier ! Ce mois-ci part...",17,0.993572,0
4,17.14.11,QUI SAUTERA LE PLUS HAUT ? (VÉLO SKATE ROLLER ...,Aurelien Fontenoy,4,2017-11-13T16:30:03.000Z,vélo vtt bmx freestyle bike mtb dirt trottinet...,141695,8091,72,481,0,0,0,Sauts plus 4 mètres haut trampoline park / air...,16,0.99118,0


In [2]:
def convert_to_sequences(x):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=lower)
    tokenizer.fit_on_texts(x)
    _sequences = tokenizer.texts_to_sequences(x)
    maxlen = max(len(_seq) for _seq in _sequences)
    vocab_size = len(tokenizer.word_index) + 1
    _xtr = tf.keras.preprocessing.sequence.pad_sequences(_sequences, maxlen, padding='post')
    
    return np.array(_xtr), maxlen, vocab_size

if feature == 'tags':
    X, maxlen, vocab_size = convert_to_sequences(df['tags'].to_numpy())
elif feature == 'description':
    X, maxlen, vocab_size = convert_to_sequences(df['description'].to_numpy())
elif feature == 'title': 
    X, maxlen, vocab_size = convert_to_sequences(df['title'].to_numpy())

Y = df['category_id'].to_numpy().reshape((-1,1))

X.shape, Y.shape

((37812, 100), (37812, 1))

In [3]:
activation = 'softmax'

model = tf.keras.Sequential()

model.add(tf.keras.layers.Input(shape=(maxlen,)))
model.add(tf.keras.layers.Embedding(vocab_size, 75))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=8, activation=activation))
model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=6, activation=activation))
model.add(tf.keras.layers.Conv1D(filters=16, kernel_size=4, activation=activation))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(8))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

model.compile(
    loss='categorical_crossentropy', 
    optimizer='adam', 
    metrics=['categorical_accuracy']
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 75)           5900100   
_________________________________________________________________
dropout (Dropout)            (None, 100, 75)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 93, 64)            38464     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 88, 32)            12320     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 85, 16)            2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 85, 16)            0         
_________________________________________________________________
dense (Dense)                (None, 85, 8)             1

In [None]:
# Initialize history variables
loss_history, accuracy_history = [[],[]], [[],[]]
final_accuracy = -1.

to_categorical = tf.keras.utils.to_categorical

# Kfold cross validation
skf = sk.model_selection.StratifiedKFold(n_splits=10, shuffle=True)
for train_index, val_index in skf.split(X, Y):
    # Get train and test fold
    xtrain, xval = X[train_index], X[val_index]
    
    # We need to convert the Y to a categorical type in order for the 
    # multi-classifier to train and validate properly
    ytrain = to_categorical(Y[train_index], num_classes=num_classes)
    yval = to_categorical(Y[val_index], num_classes=num_classes)
    
    # Train and validate
    history = model.fit(xtrain, ytrain, validation_data=(xval, yval), epochs=1, batch_size=64)
    
    # Add values to training history
    loss_history[0].extend(history.history['val_loss'])
    loss_history[1].extend(history.history['loss'])
    accuracy_history[0].extend(history.history['val_categorical_accuracy'])
    accuracy_history[1].extend(history.history['categorical_accuracy'])
    final_accuracy = history.history['val_categorical_accuracy'][0]





In [None]:
# Visualize history
# Plot history: Loss
fig, ax = plt.subplots()
ax.plot(loss_history[0], label='validation')
ax.plot(loss_history[1], label='train')
ax.set(title='Validation loss history', xlabel='No. split', ylabel='Loss value')
ax.legend()


# Plot history: Accuracy
fig, ax = plt.subplots()
ax.plot(accuracy_history[0], label='validation')
ax.plot(accuracy_history[1], label='train')
ax.set(title='Validation accuracy history', xlabel='No. split', ylabel='Accuracy value')
ax.legend()

plt.show()

In [None]:
import os
os.makedirs('.models/', exist_ok=True)

# Save model
model.save('.models/model-{region}-{feature}-{accuracy:.2f}.h5'.format(region=region, feature=feature, accuracy=final_accuracy))