# CendekiaOne - Recommender System:

---

**Post scoring Method:**

Post:<br>
*   Per view: +1
*   Per like: +2
*   Per comment: +4



User:
*   Post discipline (tag) matches the user's preferred discipline: multiply the score by 1.2
*   Post subdiscipline matches the user's preferred subdiscipline: multiply the score by 1.5

Subdiscipline:

'humanities', 'history', 'law', 'philosophy',
'religious studies', 'divinity', 'theology', 'social science',
'anthropology', 'archaeology', 'economics', 'geography',
'linguistics', 'psychology', 'sociology', 'natural science',
'biology', 'chemistry', 'astronomy', 'physics',
'computer science', 'mathematics', 'applied science',
'agriculture', 'architecture', 'business', 'education',
'engineering', 'environmental studies', 'forestry', 'recreation',
'journalism', 'media studies', 'communication', 'health',
'medicine', 'military', 'public administration', 'public policy',
'social work', 'transportation', 'technology', 'climate change',
'coding', 'investing', 'books', 'inspiration', 'digital marketing',
'lgbtq', 'women', 'art', 'data', 'personal development', 'news',
'finance', 'christmas', 'deep learning', 'work', 'family',
'spirituality', 'money', 'short story', 'ux', 'parenting',
'coronavirus', 'creativity', 'fiction', 'travel', 'social media',
'leadership', 'data visualization', 'humor', 'web development',
'productivity', 'music', 'ethereum', 'entrepreneurship',
'marketing', 'self', 'design', 'culture', 'artificial intelligence',
'relationships', 'javascript', 'mental health', 'business',
'python', 'love', 'software development', 'covid 19',
'self improvement', 'life lessons', 'startup', 'politics',
'writing', 'bitcoin', 'life', 'machine learning', 'cryptocurrency',
'poetry', 'programming', 'data science', 'blockchain'

# Data Preprocessing & Cleaning

---

The data consists of Medium articles, taken from Kaggle (https://www.kaggle.com/datasets/fabiochiusano/medium-articles/data).

In [None]:
## Import Libraries
import tensorflow as tf
import numpy as np
import pandas as pd
import tensorflow_hub as hub

In [None]:
## Load Data from CSV
data_path = 'drive/MyDrive/Dataset/medium_articles.csv'
# The file already starts with its own header line (title,text,url,...).
# header=0 makes pandas consume that line instead of loading it as the first
# data row, while `names` still fixes the column labels used by later cells.
data = pd.read_csv(
    data_path,
    header=0,
    names=["title", "text", "url", "authors", "timestamp", "tags"],
)
data.head()

Unnamed: 0,title,text,url,authors,timestamp,tags
0,title,text,url,authors,timestamp,tags
1,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
2,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
3,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
4,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."


In [None]:
## Drop unused columns
# Only the title, body text and tags are used further on; the URL, author
# list and timestamp are discarded here.
non_required_columns = ['url', 'authors', 'timestamp']
filtered_data = data.drop(non_required_columns, axis="columns")
filtered_data.head()

Unnamed: 0,title,text,tags
0,title,text,tags
1,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,"['Mental Health', 'Health', 'Psychology', 'Sci..."
2,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,"['Mental Health', 'Coronavirus', 'Science', 'P..."
3,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
4,The 4 Purposes of Dreams,Passionate about the synergy between science a...,"['Health', 'Neuroscience', 'Mental Health', 'P..."


In [None]:
## Define function to make tags into comma-separated strings for easier processing
def process_tags(tags_text: str) -> str:
    """Turn the textual form of a tag list into a ', '-separated string.

    Example: "['Mental Health', 'Health']" -> "Mental Health, Health".

    Args:
        tags_text: Raw cell value of the ``tags`` column, expected to look
            like a Python list literal of strings.

    Returns:
        The tags joined with ', '. An empty list, or a value that is not a
        string at all (e.g. a missing cell), yields an empty string.
    """
    import ast  # local import so this cell does not depend on the import cell

    # Missing cells arrive as non-string values (e.g. float NaN); slicing them
    # would raise, so treat them as "no tags".
    if not isinstance(tags_text, str):
        return ''

    # Parse the literal properly so tags that contain ", " or an apostrophe
    # (which repr() wraps in double quotes) are not mangled.
    try:
        parsed = ast.literal_eval(tags_text)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return ', '.join(str(tag) for tag in parsed)

    # Fallback for text that is not a list literal: keep the original
    # best-effort behaviour (strip the outer brackets, then the quotes).
    pieces = tags_text[1:-1].split(', ')
    return ', '.join(piece[1:-1] for piece in pieces)

In [None]:
# Model input text: the article title and body joined by a single space.
filtered_data['merged_text'] = filtered_data['title'] + " " + filtered_data['text']
# Flatten the list-like tags column into a ', '-separated string.
filtered_data['tags_text'] = filtered_data['tags'].apply(process_tags)
# Keep only the two derived columns (merged_text, tags_text).
final_data = filtered_data.drop(['title', 'text', 'tags'], axis="columns")

final_data.head()

Unnamed: 0,merged_text,tags_text
0,title text,
1,Mental Note Vol. 24 Photo by Josh Riemer on Un...,"Mental Health, Health, Psychology, Science, Ne..."
2,Your Brain On Coronavirus Your Brain On Corona...,"Mental Health, Coronavirus, Science, Psycholog..."
3,Mind Your Nose Mind Your Nose\n\nHow smell tra...,"Biotechnology, Neuroscience, Brain, Wellness, ..."
4,The 4 Purposes of Dreams Passionate about the ...,"Health, Neuroscience, Mental Health, Psycholog..."


In [None]:
## Define valid subdisciplines
# Articles are kept only if at least one of their (lower-cased) tags appears
# in this list, and the position of each entry is the label column index used
# by the multi-label encoding, so the ORDER MUST NOT CHANGE.
# NOTE(review): 'business' appears twice (positions 25 and 85). The model's
# output width is set to 103, which matches this list including the
# duplicate, so it is left in place here.
subdisciplines = [
    'humanities', 'history', 'law',
    'philosophy', 'religious studies', 'divinity',
    'theology', 'social science', 'anthropology',
    'archaeology', 'economics', 'geography',
    'linguistics', 'psychology', 'sociology',
    'natural science', 'biology', 'chemistry',
    'astronomy', 'physics', 'computer science',
    'mathematics', 'applied science', 'agriculture',
    'architecture', 'business', 'education',
    'engineering', 'environmental studies', 'forestry',
    'recreation', 'journalism', 'media studies',
    'communication', 'health', 'medicine',
    'military', 'public administration', 'public policy',
    'social work', 'transportation', 'technology',
    'climate change', 'coding', 'investing',
    'books', 'inspiration', 'digital marketing',
    'lgbtq', 'women', 'art',
    'data', 'personal development', 'news',
    'finance', 'christmas', 'deep learning',
    'work', 'family', 'spirituality',
    'money', 'short story', 'ux',
    'parenting', 'coronavirus', 'creativity',
    'fiction', 'travel', 'social media',
    'leadership', 'data visualization', 'humor',
    'web development', 'productivity', 'music',
    'ethereum', 'entrepreneurship', 'marketing',
    'self', 'design', 'culture',
    'artificial intelligence', 'relationships', 'javascript',
    'mental health', 'business', 'python',
    'love', 'software development', 'covid 19',
    'self improvement', 'life lessons', 'startup',
    'politics', 'writing', 'bitcoin',
    'life', 'machine learning', 'cryptocurrency',
    'poetry', 'programming', 'data science',
    'blockchain',
]



In [None]:
## Data Cleaning
# final_data is already a DataFrame, so it can be converted directly.
# Column 0 holds merged_text, column 1 holds tags_text.
np_data = final_data.to_numpy()

# Make all labels lowercase so they can be compared with `subdisciplines`
# (str() first, because the column may contain non-string values).
np_data[:,1] = np.char.lower(np.vectorize(str)(np_data[:,1]))

# Remove all data with tags not matching the specified subdiscipline.
# The lookup set is built once: membership is tested for every tag of every
# row, and scanning the list each time is needlessly slow.
valid_subdisciplines = set(subdisciplines)
mask = np.array(
    [any(tag in valid_subdisciplines for tag in row[1].split(', ')) for row in np_data],
    dtype=bool,  # keeps the mask boolean even when there are no rows
)
clean_data = np_data[mask]

# Remove all data with non-string text (e.g. rows whose title or text was missing)
is_text = np.array(
    [isinstance(element, str) for element in clean_data[:,0]],
    dtype=bool,
)
clean_data = clean_data[is_text]

print(len(clean_data))

119552


# Data Conversion


---



*   Split the data into training, validation, and test sets
*   Convert the labels to a multi-hot encoding (one column per subdiscipline)
*   Convert the features to embeddings



In [None]:
## Multi Label Encoding for Y
# Start from an all-zero matrix: one row per cleaned article, one column per
# entry of `subdisciplines`.
one_hot_matrix = np.zeros((clean_data.shape[0], len(subdisciplines)), dtype=int)

# For each article, mark every subdiscipline column whose name occurs among
# the article's tags (a row may therefore contain several 1s).
for row_idx, tag_string in enumerate(clean_data[:, 1]):
    article_tags = tag_string.split(', ')
    one_hot_matrix[row_idx] = np.isin(subdisciplines, article_tags)

In [None]:
## Use pre-trained embeddings from Google
# TF-Hub text embedding module wrapped as a Keras layer. It takes raw strings
# (dtype=tf.string, scalar per example) and, per the model summary, emits a
# 128-wide vector. trainable=True lets its weights be updated during fit().
embedding = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

In [None]:
## Convert numpy array to tf.data.Dataset
# Each dataset element is a (text, label-row) pair.
clean_data_tf = tf.data.Dataset.from_tensor_slices(
    (
        clean_data[:, 0],  # merged article text
        one_hot_matrix,    # multi-hot subdiscipline labels
    )
)

In [None]:
# Define training, test, validation data sizes
test_size = 0.1
val_size = 0.1
train_size = 1 - test_size - val_size

total_examples = len(clean_data_tf)
num_train = int(train_size * total_examples)
num_test = int(test_size * total_examples)  # informational: test_ds takes whatever remains
num_val = int(val_size * total_examples)

# Shuffle ONCE with a fixed order. With the default
# reshuffle_each_iteration=True the dataset is re-permuted every time it is
# iterated, so take()/skip() below would each see a different ordering and
# the train/val/test splits would overlap (data leakage).
clean_data_tf = clean_data_tf.shuffle(total_examples, reshuffle_each_iteration=False)

# Split data to train, val, test
train_ds = clean_data_tf.take(num_train)
val_ds = clean_data_tf.skip(num_train).take(num_val)
test_ds = clean_data_tf.skip(num_train).skip(num_val)

In [None]:
## Optimizations
AUTOTUNE = tf.data.AUTOTUNE

# Apply the same cache + prefetch pipeline to each of the three splits.
train_ds, val_ds, test_ds = [
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
]

In [None]:
## Print example of final clean data
# NOTE(review): the variables are named train_* but the batch is drawn from
# val_ds - confirm which split was intended.
train_examples_batch, train_labels_batch = next(
    iter(val_ds.batch(10))
)
# Show the embeddings of the first three texts, then their label rows.
print(
    hub_layer(train_examples_batch[:3]),
    train_labels_batch[:3],
    sep="\n",
)

tf.Tensor(
[[ 4.89016801e-01 -1.81025192e-01  1.74852565e-01  1.75405905e-01
   1.27956614e-01  9.37550142e-02  6.48331642e-02 -4.98378165e-02
  -1.54278561e-01 -3.48237939e-02  3.84046026e-02 -3.54826786e-02
  -2.09723003e-02 -1.03715390e-01 -1.44909054e-01  6.34276718e-02
  -3.85391191e-02  8.51166621e-03 -1.76624596e-01  3.50924999e-01
   1.19511373e-01  1.13382610e-02  7.68053904e-02 -2.04962477e-01
   2.70268712e-02 -3.88185456e-02  1.86781257e-01  1.11163840e-01
  -9.32024643e-02 -3.29929478e-02  8.68987441e-02 -2.19244093e-01
   4.87079658e-02  1.43391609e-01  1.66065581e-02  1.06931232e-01
   3.71863134e-02 -9.69239473e-02  4.63520698e-02  2.65584499e-01
  -1.72808743e-03 -1.60391659e-01 -9.85310227e-02  1.64893046e-01
   1.73364148e-01  9.94656011e-02  1.35640777e-03  1.15773976e-01
   2.54129678e-01 -2.26609837e-02  2.42888570e-01 -1.95497885e-01
   1.17354237e-01  2.51991928e-01 -2.55847991e-01 -1.17174812e-01
   6.02018461e-02 -9.96607468e-02  1.73823237e-02  2.59179264e-01

In [None]:
## Define the Model
embedding_dim = 64  # not referenced in this cell; kept in case other cells read it
# One sigmoid output per label column. Derived from the label vocabulary
# instead of a hard-coded 103 so the model cannot drift out of sync with the
# width of the label matrix.
num_subcategories = len(subdisciplines)

model = tf.keras.models.Sequential([
    # Raw strings -> 128-wide embedding vector per example.
    hub_layer,
    # Add a time axis of length 1 so the LSTM receives 3-D input.
    tf.keras.layers.Reshape((1, 128)),
    tf.keras.layers.LSTM(64, return_sequences=True),
    # Collapse the (length-1) time axis again.
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    # Independent sigmoid per label: an article may have several labels.
    tf.keras.layers.Dense(num_subcategories, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 128)               124642688 
                                                                 
 reshape (Reshape)           (None, 1, 128)            0         
                                                                 
 lstm (LSTM)                 (None, 1, 64)             49408     
                                                                 
 global_max_pooling1d (Glob  (None, 64)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 103)               13287     
                                                        

In [None]:
def f1_score(y_true, y_pred):
    """Return the mean per-label F1 score for the tensors passed in.

    Predictions are rounded to 0/1 first; true positives, predicted positives
    and actual positives are then summed along axis 0 (one value per label
    column), and the resulting per-label F1 values are averaged.
    Assumes y_pred holds probabilities in [0, 1] - TODO confirm for callers
    other than the sigmoid output layer.
    """
    eps = tf.keras.backend.epsilon()  # guards every division against 0/0

    truth = tf.cast(y_true, tf.float32)
    hard_pred = tf.cast(tf.round(y_pred), tf.float32)

    hits = tf.reduce_sum(truth * hard_pred, axis=0)
    precision = hits / (tf.reduce_sum(hard_pred, axis=0) + eps)
    recall = hits / (tf.reduce_sum(truth, axis=0) + eps)

    per_label_f1 = 2 * precision * recall / (precision + recall + eps)

    # Average over all label columns
    return tf.reduce_mean(per_label_f1)

In [None]:
## Compile the Model
# Binary cross-entropy treats every output unit as its own yes/no decision,
# matching the sigmoid output layer; f1_score is the custom metric defined in
# this notebook.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy', f1_score],
)

In [None]:
## Define Callback (To debug)
class FirstLayerOutputCallback(tf.keras.callbacks.Callback):
    """Debug callback: after each epoch, print what the model's first layer
    produces for the first item of ``dataset`` together with that item's label.

    Args:
        model: Model whose ``layers[0]`` is inspected.
        dataset: Iterable yielding ``(input, label)`` pairs of tensors.
    """

    def __init__(self, model, dataset):
        super().__init__()
        # Register the model through the Callback API instead of assigning
        # self.model directly: in newer Keras versions `model` is a read-only
        # property and plain assignment raises AttributeError.
        self.set_model(model)
        self.dataset = dataset
        self.iterator = iter(dataset)

    def on_epoch_begin(self, epoch, logs=None):
        # Reset the iterator at the beginning of each epoch
        self.iterator = iter(self.dataset)

    def on_epoch_end(self, epoch, logs=None):
        # Extract a single data item (input and label) from the dataset;
        # an empty dataset yields None instead of raising StopIteration.
        data_item = next(self.iterator, None)
        if data_item is None:
            print("Epoch {}: dataset is empty, nothing to inspect".format(epoch))
            return

        # Unpack the data item into input and label
        input_data, label = data_item[0], data_item[1]

        # An unbatched dataset yields one scalar per example; add a batch
        # axis so the layer receives a batch of size 1.
        # NOTE(review): assumes the first layer expects batched input - confirm.
        if input_data.shape.rank == 0:
            input_data = tf.expand_dims(input_data, axis=0)

        first_layer_output = self.model.layers[0](input_data)

        # Print or use the first layer output and label
        print("Epoch {}: First Layer Output: {}, Label: {}".format(epoch, first_layer_output.numpy(), label.numpy()))

callback = FirstLayerOutputCallback(model, train_ds)

In [None]:
## Train Model
batch_size = 512

# The training split is shuffled over its full length and then batched; the
# validation split is only batched.
# NOTE(review): the debug callback created in this notebook is not passed to
# fit() here - confirm that is intentional.
history = model.fit(
    x=train_ds.shuffle(len(train_ds)).batch(batch_size),
    validation_data=val_ds.batch(batch_size),
    epochs=100,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100