## TITLE

In this approach, I will concatenate the correlations data with the content data to train a multilabel classification model. With that model, I will predict the labels for all of the topics in the topics dataset, as I will have first formatted the topics dataset to match the format of the content dataset.

In [2]:
import pandas as pd
import numpy as np

In [3]:
#load data

topics = pd.read_csv("topics.csv")
topics.head()
content = pd.read_csv('content.csv')
content.head()
correlations = pd.read_csv('correlations.csv')
correlations.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4


In [4]:
content.head()

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA


In [5]:
NUM_UNIQUE_CONTENT = len(content.id.unique())

In [6]:
print(topics.language.unique())
print(content.language.unique())

['bg' 'en' 'pt' 'gu' 'my' 'zh' 'ar' 'te' 'es' 'fr' 'sw' 'mr' 'hi' 'bn'
 'fil' 'ru' 'it' 'or' 'pnb' 'km' 'as' 'kn' 'ur' 'pl' 'ta' 'swa' 'tr' 'mul']
['es' 'it' 'pt' 'en' 'mr' 'bg' 'gu' 'sw' 'hi' 'ar' 'bn' 'as' 'zh' 'fr'
 'km' 'pl' 'ta' 'fil' 'or' 'ru' 'kn' 'swa' 'my' 'pnb' 'tr' 'te' 'ur']


In [7]:
def combine(correlations, topics, content):
    #Explode correlations rows
    correlations["content_ids"] = correlations["content_ids"].str.split()
    correlations = correlations.explode("content_ids")

    #Format content/topics to match (id/features)
    content["text"] = content["text"].fillna('')
    content = content.dropna()
    content_combined = content["language"] + " " + content["title"] + " " + content["description"] + " " + content["text"]
    content_combined = pd.DataFrame({"features":content_combined, "content_id":content["id"]})

    topics["description"] = topics["description"].fillna('')
    topics = topics.dropna()
    topics_combined = topics["language"] + " " + topics["channel"] + ' ' + topics["title"] + " " + topics["description"]
    topics_combined = pd.DataFrame({"id":topics["id"], "features":topics_combined})

    #Combine to create new dataset
    corr_topics = correlations.merge(topics_combined, how="inner", left_on="topic_id", right_on="id")
    #corr_topic_ids = corr_topics[['topic_id']]
    corr_topics = pd.DataFrame({'features':corr_topics['features'], 'content_id':corr_topics["content_ids"]})
    out = pd.concat([corr_topics, content_combined])
    
    return out


In [8]:
combined_ds = combine(correlations, topics, content)
np.shape(combined_ds)

(321513, 2)

In [9]:
combined_ds.head()

Unnamed: 0,features,content_id
0,bg 000cf7 Откриването на резисторите Изследван...,c_1108dd0c7a5d
1,bg 000cf7 Откриването на резисторите Изследван...,c_376c5a8eb028
2,bg 000cf7 Откриването на резисторите Изследван...,c_5bc0e1e2cba0
3,bg 000cf7 Откриването на резисторите Изследван...,c_76231f9d0b5e
4,pt 8e286a Entradas e saídas de uma função Ente...,c_639ea2ef9c95


In [10]:
combined_ds["content_id"] = combined_ds["content_id"].astype('category')
combined_ds["id"] = combined_ds["content_id"].cat.codes
combined_ds.head()

Unnamed: 0,features,content_id,id
0,bg 000cf7 Откриването на резисторите Изследван...,c_1108dd0c7a5d,10215
1,bg 000cf7 Откриването на резисторите Изследван...,c_376c5a8eb028,33409
2,bg 000cf7 Откриването на резисторите Изследван...,c_5bc0e1e2cba0,55276
3,bg 000cf7 Откриването на резисторите Изследван...,c_76231f9d0b5e,70947
4,pt 8e286a Entradas e saídas de uma função Ente...,c_639ea2ef9c95,59974


In [11]:
id_to_category = pd.Series(combined_ds.content_id.values, index=combined_ds.id).to_dict()

In [12]:
combined_ds = combined_ds.drop("content_id", axis=1)
combined_ds.head()

Unnamed: 0,features,id
0,bg 000cf7 Откриването на резисторите Изследван...,10215
1,bg 000cf7 Откриването на резисторите Изследван...,33409
2,bg 000cf7 Откриването на резисторите Изследван...,55276
3,bg 000cf7 Откриването на резисторите Изследван...,70947
4,pt 8e286a Entradas e saídas de uma função Ente...,59974


In [13]:
from sklearn.model_selection import train_test_split

train_ds, test_ds = train_test_split(
    combined_ds,
    train_size=0.8,
    test_size=0.2,
    random_state=10,
    shuffle=True
)

In [14]:
np.shape(train_ds)

(257210, 2)

In [15]:
train_ds.head()

Unnamed: 0,features,id
91633,es 36a98b Seleccionar procedimientos para calc...,54244
51586,es Lección 1 Determinando las consecuencias en...,51581
100915,"fr c152d6 Dérivée d'un produit, d'un quotient ...",95527
129902,en ebc86c 2C: Understanding Protein Conformation,6342
176813,es 36a98b Ecuación estándar de un círculo Apre...,117491


In [16]:
np.shape(test_ds)

(64303, 2)

In [17]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model
from nltk.corpus import stopwords
import string
from unidecode import unidecode

In [18]:
#Create custom standardization function to deal with punctuations, lowering the text, and removing stopwords
#dictionary of languages found in our data
lang_dict = {
    "en":"english",
    "es":"spanish",
    "it":"italian",
    'pt':"portuguese",
    'mr':'marathi',
    'bg':'bulgarian',
    'gu':'gujarati',
    'sw':'swahili',
    'hi':'hindi',
    'ar':'arabic',
    'bn':'bengali',
    'as':'assamese',
    'zh':'chinese',
    'fr':'french',
    'km':'khmer',
    'pl':'polish',
    'ta':'tamil',
    'or':'oriya',
    'ru':'russian',
    'kn':'kannada',
    'swa':'swahili',
    'my':'burmese',
    'pnb':'punjabi',
    'fil':'filipino',
    'tr':'turkish',
    'te':'telugu',
    'ur':'urdu',
    'fi':'finnish',
    'pn':'unknown'}

#list of languages supported by the natural language tool kit (NLTK) module.
supported_languages = stopwords.fileids()

def remove_stopwords(text):
    lang_code = text[0:2]
    if lang_dict[lang_code] in supported_languages:
        for word in stopwords.words(lang_dict[lang_code]):
            text = text.replace(' ' + word + ' ', ' ')
    return text



In [19]:
train_ds['features'] = train_ds.features.apply(remove_stopwords)

In [20]:
train_ds.head()

Unnamed: 0,features,id
91633,es 36a98b Seleccionar procedimientos calcular ...,54244
51586,es Lección 1 Determinando consecuencias cascad...,51581
100915,"fr c152d6 Dérivée d'un produit, d'un quotient ...",95527
129902,en ebc86c 2C: Understanding Protein Conformation,6342
176813,es 36a98b Ecuación estándar círculo Aprende ac...,117491


In [21]:
test_ds['features'] = test_ds.features.apply(remove_stopwords)

In [22]:
test_ds.head()

Unnamed: 0,features,id
76452,zh f83dcf 长方形、正方形面积的计算 计算长方形和正方形的面积,136066
253012,en 0ec697 Defining convergent divergent infini...,112866
98506,es f65044 1 Tomamos decisiones familia cuidar ...,117667
205777,en abd7dc 9: Ontologies Natural Languages,27546
182637,en 0ec697 Appropriate units Learn work units a...,72297


In [23]:
def my_standardize(text):    

    text = tf.strings.lower(text, encoding='utf-8') #lowercase
    text = tf.strings.regex_replace(text, f"([{string.punctuation}])", r"") #remove punctuation
    text = tf.strings.regex_replace(text, '\n', "") #remove newlines
    text = tf.strings.regex_replace(text, ' +', " ") #remove 2+ whitespaces
    text = tf.strings.strip(text) #remove leading and tailing whitespaces

    return text

In [24]:
VOCAB_SIZE = 60000 #INCREASE WHEN TRAINING FR!
MAX_LEN = 50

In [25]:
vectorize_layer = TextVectorization(
    standardize = my_standardize,
    split = "whitespace",
    max_tokens = VOCAB_SIZE + 2,
    output_mode = 'int',
    output_sequence_length = MAX_LEN
)

In [26]:
#Adapt text vectorization layer
vectorize_layer.adapt(train_ds.features)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices

In [27]:
#build datapipeline
train_features_raw = tf.data.Dataset.from_tensor_slices(
    tf.cast(train_ds.features, tf.string)
)

train_labels = tf.data.Dataset.from_tensor_slices(
    tf.cast(train_ds.id, tf.int32)
)

test_features_raw = tf.data.Dataset.from_tensor_slices(
    tf.cast(test_ds.features, tf.string)
)

test_labels = tf.data.Dataset.from_tensor_slices(
    tf.cast(test_ds.id, tf.int32)
)

In [28]:
def convert_text_input(sample):
    text = sample
    text = tf.expand_dims(text, -1)  
    return tf.squeeze(vectorize_layer(text))

In [29]:
#Convert text input to vectorized format
train_features = train_features_raw.map(convert_text_input, num_parallel_calls=tf.data.experimental.AUTOTUNE)

test_features = test_features_raw.map(convert_text_input, num_parallel_calls=tf.data.experimental.AUTOTUNE)


In [30]:
for element in train_features.take(3):
    print(element)

tf.Tensor(
[   77   372 12145 12218  1664  8165  4556  5018     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0], shape=(50,), dtype=int64)
tf.Tensor(
[   77   283    13 28317  7306 30159  1078     1     1 10505 26292  3694
   101   116    54    40    44    14    17     1  4802  2422  7800  8178
  8624  3694   926     1    19     7    36    65   334    48    58    47
    59  1308  1323   438     7  1082    21     2  1301   231   307     5
   985  1272], shape=(50,), dtype=int64)
tf.Tensor(
[  256   473  9380  1859  7770  1859  2607  1637  2171 19213     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0   

In [31]:
train_tf_ds = tf.data.Dataset.zip((
    train_features,
    train_labels
))

test_tf_ds = tf.data.Dataset.zip((
    test_features,
    test_labels
))

In [32]:
for X,y in train_tf_ds.take(1):
  print("input (features) X.shape: ", X.shape)
  print("output (label) y.shape: ", y.shape)
  print("input (features) X: ", X)
  print("output (label) y: ", y)

input (features) X.shape:  (50,)
output (label) y.shape:  ()
input (features) X:  tf.Tensor(
[   77   372 12145 12218  1664  8165  4556  5018     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0], shape=(50,), dtype=int64)
output (label) y:  tf.Tensor(54244, shape=(), dtype=int32)


In [33]:
BATCH_SIZE = 64
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_tf_ds = train_tf_ds.batch(batch_size = BATCH_SIZE, drop_remainder=True)\
    .cache()\
        .prefetch(AUTOTUNE)

test_tf_ds = test_tf_ds.batch(batch_size = BATCH_SIZE, drop_remainder=True)\
    .cache()\
        .prefetch(AUTOTUNE)

In [34]:
train_tf_ds.element_spec

(TensorSpec(shape=<unknown>, dtype=tf.int64, name=None),
 TensorSpec(shape=(64,), dtype=tf.int32, name=None))

In [41]:
def create_model():
    input_tokens = Input(shape=(MAX_LEN, ), dtype=tf.int32)
    embedding_layer = Embedding(VOCAB_SIZE, 256)
    GAP_layer = GlobalAveragePooling1D()
    flatten_layer = Flatten()
    
    x = embedding_layer(input_tokens)
    x = GAP_layer(x)
    x = flatten_layer(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x) #Show 10 best contents for each topics (then we will filter further)

    model = Model(inputs = input_tokens, outputs = output)

    LOSS_FUN = 'binary_crossentropy'
    OPTIMIZER = 'adam'
    METRIC = tf.keras.metrics.SparseCategoricalAccuracy()

    model.compile(optimizer = OPTIMIZER, loss = LOSS_FUN, metrics = METRIC)

    return model

In [42]:
print(NUM_UNIQUE_CONTENT)
print(len(train_ds.features))

154047
257210


In [43]:
my_model = create_model()

In [44]:
my_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 50, 256)           15360000  
                                                                 
 global_average_pooling1d_1   (None, 256)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 flatten_1 (Flatten)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 128)               32896     
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                           

In [45]:
my_model.fit(train_tf_ds, verbose=1, epochs=3)

Epoch 1/3
 398/4018 [=>............................] - ETA: 4:48 - loss: -3984096512.0000 - sparse_categorical_accuracy: 0.0000e+00

KeyboardInterrupt: 