Imports and connection to Google Drive

In [None]:
import re
import os
import string
import math
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine

# Tensor flow
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

# Langdetect 
!pip install langdetect
import langdetect

# Connect drive
# Mounts Google Drive at /content/drive; the CSV files read further down in
# this notebook are addressed through that mount point.
from google.colab import drive
drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Initialize the data

In [None]:
# Load the raw CSV exports. Malformed lines in the video/comment files are
# skipped rather than aborting the load.
categories = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/categories.csv")
videos = (
    pd.read_csv("/content/drive/MyDrive/Colab Notebooks/videos.csv", on_bad_lines='skip')[["id", "category_id"]]
    .drop_duplicates()
    .set_index('id')
)
comments = (
    pd.read_csv("/content/drive/MyDrive/Colab Notebooks/comments.csv", on_bad_lines='skip')[["video_id", "content"]]
    .drop_duplicates()
    .set_index('video_id')
)

# Video ids that occur in the comments but have no row in `videos`.
rem = set(comments.index) - set(videos.index)
# Remove those orphaned comments. The previous loop called `comments.drop(r)`
# without assigning the result, so nothing was ever removed (DataFrame.drop
# returns a new frame). A boolean mask also copes with the non-unique index.
comments = comments[comments.index.isin(videos.index)]

# Attach the category of each comment's video (both frames are indexed by video id).
df = comments.join(videos).drop_duplicates()
df = df[["category_id", "content"]]
# Shuffle once so the positional train/validation/test split is random;
# the fixed seed makes a Restart & Run All reproduce the same split.
df = df.sample(frac=1, random_state=42)

# Preprocess
# Character class of everything treated as an "emoji". Compiled once instead of
# on every call. NOTE: the two widest ranges (U+24C2-U+1F251 and
# U+10000-U+10FFFF) also cover CJK, Hangul and every supplementary-plane
# character, so far more than emojis is stripped; kept as-is to preserve the
# existing preprocessing.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters and everything up to U+1F251
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"  # all supplementary planes
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"  # zero-width joiner
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # variation selector-16
    u"\u3030"
    "]+", re.UNICODE)


def remove_emojis(data):
    """Strip emoji (and the other characters in ``_EMOJI_PATTERN``) from a string.

    Args:
        data: The value to clean. Expected to be a ``str``; anything else
            (e.g. ``NaN`` / ``None`` cells) is passed through untouched.

    Returns:
        The cleaned string, or ``data`` itself when it is not a ``str``.
    """
    # Explicit type guard instead of a bare ``except``: non-string input is
    # still returned unchanged, but real errors and KeyboardInterrupt are no
    # longer silently swallowed.
    if not isinstance(data, str):
        return data
    return _EMOJI_PATTERN.sub('', data)

def llower(data):
  """Lower-case ``data`` if it supports ``.lower()``.

  Args:
      data: Usually a ``str``; values without a ``lower`` method (e.g. ``NaN``
          or ``None`` cells) are returned unchanged.

  Returns:
      The lower-cased value, or ``data`` itself when it cannot be lower-cased.
  """
  try:
    return data.lower()
  except AttributeError:
    # Only the "not a string" case is tolerated; the former bare ``except``
    # also hid unrelated failures and KeyboardInterrupt.
    return data

# Translation table deleting every ASCII punctuation character; built once
# instead of on each call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def ttranslate(data):
  """Remove ASCII punctuation (``string.punctuation``) from ``data``.

  Args:
      data: Usually a ``str``; values that cannot be translated (e.g. ``NaN``
          or ``None`` cells) are returned unchanged.

  Returns:
      The string without punctuation, or ``data`` itself when it is not
      translatable.
  """
  try:
    return data.translate(_PUNCTUATION_TABLE)
  except (AttributeError, TypeError):
    # AttributeError: no ``translate`` method; TypeError: a ``translate`` that
    # does not accept a str table (e.g. bytes). The former bare ``except``
    # also hid unrelated failures and KeyboardInterrupt.
    return data


# Normalise the comment text in three passes: strip emojis, lower-case,
# then drop punctuation. Each helper leaves non-string cells untouched.
df["content"] = (
    df["content"]
    .apply(remove_emojis)
    .apply(llower)
    .apply(ttranslate)
)


# The frame is indexed by video id, so the unique index values count the videos.
print(
    "Full data set contains {} comments belonging to {} videos.".format(
        len(df),
        len(df.index.unique()),
    )
)
# Category ids kept for classification, listed in the order of the class index
# they are mapped to (first id -> class 0, second -> class 1, ...).
# Ids 2, 15, 18, 19, 20, 21 and 29 are currently excluded.
_KEPT_CATEGORY_IDS = (1, 10, 17, 22, 23, 24, 25, 26, 27, 28)

# Maps a category id to its contiguous class index.
category = {
    category_id: class_index
    for class_index, category_id in enumerate(_KEPT_CATEGORY_IDS)
}

def is_en(content):
    """Tell whether a comment is usable English text.

    A comment qualifies when it is a string whose length lies strictly between
    50 and 600 characters and ``langdetect`` classifies its first 100
    characters as English.

    Args:
        content: The comment text; non-string values (e.g. ``NaN``) are
            rejected.

    Returns:
        ``True`` for an accepted English comment, ``False`` otherwise.
    """
    # Explicit guards replace the former bare ``except``, which also swallowed
    # KeyboardInterrupt while filtering the full comment set.
    if not isinstance(content, str):
        return False
    if not (50 < len(content) < 600):
        return False
    try:
        # Only a prefix is inspected to keep detection cheap.
        return langdetect.detect(content[:100]) == "en"
    except langdetect.LangDetectException:
        # Raised when the text has no usable features (e.g. digits only).
        return False

def label(category_id):
    """Build the one-hot label for a category id.

    The vector has one slot per entry of the module-level ``category`` mapping;
    the slot at ``category[category_id]`` is 1 and all others are 0. Raises
    ``KeyError`` for an id that is not in ``category``.
    """
    one_hot = [0 for _ in category]
    hot_position = category[category_id]
    one_hot[hot_position] = 1
    return one_hot

# Keep only comments whose video belongs to one of the selected categories.
df = df[df['category_id'].isin(category.keys())]
# Keep only English comments of acceptable length. The explicit copy makes the
# filtered frame own its data, so adding the "label" column below is an
# unambiguous write (assigning to a boolean-indexed slice is the pattern that
# triggers pandas' SettingWithCopyWarning).
df = df[df['content'].apply(is_en)].copy()
# One-hot label per comment, derived from the category id column alone.
df["label"] = df["category_id"].apply(label)
print(df.groupby("category_id").count())
print("Filtered data set contains {} comments belonging to {} videos.".format(len(df), len(df.index.unique())))

  exec(code_obj, self.user_global_ns, self.user_ns)


Full data set contains 441936 comments belonging to 2265 videos.
             content  label
category_id                
1               9159   9159
10             21488  21488
17              6862   6862
22             21282  21282
23             19972  19972
24             37945  37945
25             13565  13565
26             23854  23854
27              8845   8845
28             13399  13399
Filtered data set contains 176371 comments belonging to 2122 videos.


Divide the data set into train, validation and test sets

In [None]:
# Positional split of the (already shuffled) frame:
#   train    = first 80 %
#   test     = the last 10 000 rows, or the last 10 % if that is fewer rows
#   validate = whatever lies between the two
# Plain iloc slices replace np.array_split, which goes through a deprecated
# pandas code path when handed a DataFrame.
train_end = int(0.8*len(df))
validate_end = max(len(df)-10000, int(0.9*len(df)))
train = df.iloc[:train_end]
validate = df.iloc[train_end:validate_end]
test = df.iloc[validate_end:]


def _to_dataset(frame):
    """Wrap a frame's (content, one-hot label) pairs in a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((frame['content'].to_numpy(), list(frame['label'].values)))


train_data = _to_dataset(train)
print("Train data set contains {} comments".format(len(train_data)))
test_data = _to_dataset(test)
print("Test data set contains {} comments".format(len(test_data)))
validation_data = _to_dataset(validate)
print("Validation data set contains {} comments".format(len(validation_data)))

Train data set contains 141096 comments
Test data set contains 10000 comments
Validation data set contains 25275 comments


Initialize the model

In [None]:
# Sentence encoder: Universal Sentence Encoder v4 from TF Hub, taking raw
# strings as input. Its weights are frozen (trainable=False).
encoder = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    trainable=False,
    dtype=tf.string,
    input_shape=[],
)

# Classification head: three ReLU layers of shrinking width, each followed by
# 50 % dropout, and a softmax over the selected categories.
_head_layers = []
for _units in (4096, 2048, 1024):
    _head_layers.append(tf.keras.layers.Dense(_units, activation='relu'))
    _head_layers.append(tf.keras.layers.Dropout(0.5))
_head_layers.append(tf.keras.layers.Dense(len(category.keys()), activation="softmax"))

classifier = tf.keras.Sequential([encoder] + _head_layers, name="YTBCommentClassifier")

# Show the architecture, then compile for one-hot targets.
classifier.summary()
classifier.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

Model: "YTBCommentClassifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 4096)              2101248   
                                                                 
 dropout (Dropout)           (None, 4096)              0         
                                                                 
 dense_1 (Dense)             (None, 2048)              8390656   
                                                                 
 dropout_1 (Dropout)         (None, 2048)              0         
                                                                 
 dense_2 (Dense)             (None, 1024)              2098176   
                                                                 
 dropout_2 (Dropout)         (None, 1024)     

Train the model

In [None]:
# Training hyper-parameters.
batch_size = 1024
epochs = 10

# Fit on the batched training set and report validation metrics every epoch.
history = classifier.fit(
    train_data.batch(batch_size),
    validation_data=validation_data.batch(batch_size),
    epochs=epochs,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: ignored

Calculate the mean comment embedding of every video

In [None]:
# Calculate embeddings of train & validate data
# Goal: one row per video holding its category and the mean embedding of its
# comments. The test split is left out of this pool.
data = pd.concat([train, validate])
# Encode every comment on its own; encoder([text]) returns a batch of one, so
# [0] picks the single embedding vector.
data["embedding"] = data["content"].apply(lambda content: encoder([content])[0])
# Copy the index into a regular column so it survives the groupby below.
data["video_id"] = data.index
# Group on the index (level 0): keep the first video id / category and collect
# all comment embeddings of the group into a list.
data = data.groupby(level=0, as_index=False).agg({'video_id': 'first', 'category_id': 'first', 'embedding': lambda x: list(x)})
# Stack each group's list of vectors into one tensor.
data["embedding"] = data["embedding"].apply(lambda row: tf.convert_to_tensor(row))
data = data.set_index("video_id")
# Average over axis 0 (the stacked comments) to get one vector per video.
data["embedding"] = data["embedding"].apply(lambda row: tf.math.reduce_mean(row, axis=0, keepdims=False, name=None))

Evaluate model on the test set

In [None]:
# Check accuracy of classifier
# For every test comment: classify it, then rank the per-video mean embeddings
# in `data` by similarity to the comment and check whether the comment's own
# video shows up in the top-k recommendations.
ks = [2, 4, 6, 8, 10, 20]                     # cut-offs at which the hit rate is measured
cm = [[0 for i in range(len(category.keys()))] for j in range(len(category.keys()))]  # confusion matrix [true][predicted]
hr_1 = [0]*len(ks)                            # hits, ranking restricted to the predicted category
hr_2 = [0]*len(ks)                            # hits, ranking over all videos
hr_3 = [0]*len(ks)                            # hits, interleaved combination of both rankings
acc = 0                                       # correctly classified test comments
count = 0                                     # test comments processed so far


def _angular_similarity(u, v):
  """Angular similarity in [0, 1]: 1 - arccos(cosine similarity) / pi."""
  cosine_similarity = np.dot(u, v) / (np.linalg.norm(u)*np.linalg.norm(v))
  # Rounding can push the cosine marginally outside [-1, 1], which would make
  # arccos return NaN; clip it back into the valid domain.
  return 1-np.arccos(np.clip(cosine_similarity, -1.0, 1.0))/math.pi


for vid, test_row in test.iterrows():

  # Determine evaluation metrics
  prediction = classifier.predict(np.array([test_row["content"]]), verbose = 0)[0]
  pc = np.argmax(prediction)          # predicted class index
  tc = np.argmax(test_row["label"])   # true class index (labels are one-hot)
  cm[tc][pc] += 1
  acc += 1 if pc == tc else 0
  a = encoder([test_row["content"]])[0]

  # Compute similarities. The pass over `data` is done once; the
  # category-restricted ranking reuses it and zeroes every video whose
  # category differs from the predicted one.
  sim_2 = data.apply(lambda row: _angular_similarity(a, row["embedding"]), axis=1)
  same_category = data["category_id"].map(lambda category_id: category[category_id] == pc)
  sim_1 = sim_2.where(same_category, 0)

  # Assemble recommendations: interleave both rankings (unrestricted first),
  # skipping videos that are already listed. zip() stops at the shorter list,
  # so fewer than max(ks) candidate videos cannot cause an IndexError.
  recommendations_1 = list(sim_1.nlargest(max(ks)).index)
  recommendations_2 = list(sim_2.nlargest(max(ks)).index)
  recommendations = []
  for candidate_2, candidate_1 in zip(recommendations_2, recommendations_1):
    if candidate_2 not in recommendations:
      recommendations.append(candidate_2)
    if candidate_1 not in recommendations:
      recommendations.append(candidate_1)

  # Update hit counters for every cut-off
  for j, k in enumerate(ks):
    hr_1[j] += 1 if vid in recommendations_1[:k] else 0
    hr_2[j] += 1 if vid in recommendations_2[:k] else 0
    hr_3[j] += 1 if vid in recommendations[:k] else 0

  # Alive check. Reported after the counters are updated, so the running
  # figures are no longer divided by a count that includes an unscored row.
  count += 1
  if count % 100 == 0:
    print("Classifier accuracy on test set was:", acc/count)
    print("Hitrate of our first recommendation was:", [round(i/count, 4) for i in hr_1])
    print("Hitrate of our second recommendation was:", [round(i/count, 4) for i in hr_2])
    print("Hitrate of our combined recommendation was:", [round(i/count, 4) for i in hr_3])
    print(round(count/len(test), 3), "----------------------------------------------------")

print(cm)
print("Classifier accuracy on test set was:", acc/len(test))
# hr_3 is a plain list, so it has to be divided element-wise; `hr_3/len(test)`
# raised a TypeError.
print("Hitrate of our recommender was:", [round(i/len(test), 4) for i in hr_3])

Classifier accuracy on test set was: 0.58
Hitrate of our first recommendation was: [0.34, 0.4, 0.42, 0.44, 0.44, 0.46]
Hitrate of our second recommendation was: [0.41, 0.5, 0.53, 0.56, 0.56, 0.61]
Hitrate of our combined recommendation was: [0.39, 0.49, 0.52, 0.55, 0.57, 0.61]
0.01 ----------------------------------------------------
Classifier accuracy on test set was: 0.55
Hitrate of our first recommendation was: [0.33, 0.385, 0.4, 0.42, 0.42, 0.46]
Hitrate of our second recommendation was: [0.38, 0.465, 0.485, 0.52, 0.525, 0.57]
Hitrate of our combined recommendation was: [0.375, 0.455, 0.49, 0.515, 0.525, 0.58]
0.02 ----------------------------------------------------
Classifier accuracy on test set was: 0.5333333333333333
Hitrate of our first recommendation was: [0.3, 0.3467, 0.3667, 0.3833, 0.3833, 0.42]
Hitrate of our second recommendation was: [0.3433, 0.4267, 0.4467, 0.4767, 0.4833, 0.5333]
Hitrate of our combined recommendation was: [0.3333, 0.41, 0.4467, 0.47, 0.49, 0.5433]


TypeError: ignored

For playing around

In [None]:
query = """
Dear students,

January 20th the theory exam of the Information Retrieval course will take place. As mentioned in the course description, the exam is closed-book but you are allowed to use a hand-written formularium:

    Closed book exam (50% of grade): paper based exam regarding the thory of the course. Students are allowed to use a hand-written formularium.

Concretely you are allowed to bring with you: a maximum of 10 a4-sized pages of handwritten notes. There is no restriction on what you can include in these pages; algorithms, formulas, a sort summary of the course, ...

Best regards,
Toon Calders
"""

# Number of videos to recommend
k = 5

# Classify the query; `temp` maps each original category id to its predicted
# probability, for manual inspection.
prediction = classifier.predict(np.array([query]), verbose = 0)[0]
temp = {v: prediction[i] for v,i in category.items()}
# A free-text query has no true label, so nothing is added to the evaluation
# counter `acc` here (a leftover line used to compare against the last test row).

# Embed the query
a = encoder([query])[0]


def _angular_similarity(u, v):
  """Angular similarity in [0, 1]: 1 - arccos(cosine similarity) / pi."""
  cosine_similarity = np.dot(u, v) / (np.linalg.norm(u)*np.linalg.norm(v))
  # Clip to the valid arccos domain so rounding noise cannot produce NaN.
  return 1-np.arccos(np.clip(cosine_similarity, -1.0, 1.0))/math.pi


# Compute similarities
def similarity(row):
  """Similarity of the query to a video, or 0 if the video is not in the predicted category."""
  if category[row["category_id"]] == np.argmax(prediction):
    return _angular_similarity(a, row["embedding"])
  else:
    return 0
sim_1 = data.apply(similarity, axis=1)
sim_2 = data.apply(lambda row: _angular_similarity(a, row["embedding"]), axis=1)

# Assemble recommendations: interleave the unrestricted ranking with the
# category-restricted one, skipping duplicates, then keep the best k.
recommendations_1 = list(sim_1.nlargest(k).index)
recommendations_2 = list(sim_2.nlargest(k).index)
recommendations = []
for candidate_2, candidate_1 in zip(recommendations_2, recommendations_1):
  if candidate_2 not in recommendations:
    recommendations.append(candidate_2)
  if candidate_1 not in recommendations:
    recommendations.append(candidate_1)
recommendations = recommendations[:k]

# Print the top-k recommendations as watch links
for vid in recommendations:
  print("https://www.youtube.com/watch?v={}".format(vid))



https://www.youtube.com/watch?v=B9SptdjpJBQ
https://www.youtube.com/watch?v=5e0LMJRJFaY
https://www.youtube.com/watch?v=9yUZTTLpDtk
https://www.youtube.com/watch?v=mceaM2_zQd8
https://www.youtube.com/watch?v=fLLe5sODKtg
