# Learning Equality - Curriculum Recommendations Project

#### Taro Iyadomi (UCLA Data Theory '24)

#### 12/23/2022 - Present

In [36]:
#Import necessary packages
import pandas as pd
import numpy as np
import re

## I. Viewing Data

In [37]:
#Sample Submission
sample_submission = pd.read_csv("sample_submission.csv")
print(np.shape(sample_submission))
sample_submission.head()

(5, 2)


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231


In [38]:
#Correlations
correlations = pd.read_csv("correlations.csv")
print(np.shape(correlations))
correlations.head()

(61517, 2)


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4


In [39]:
#Topics
topics = pd.read_csv("topics.csv")
print(np.shape(topics))
topics.head()

(76972, 9)


Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True


In [40]:
#Content
content = pd.read_csv("content.csv")
print(np.shape(content))
content.head()

(154047, 8)


Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA


## II. Preparing Data

In [41]:
#Split train/test data
from sklearn.model_selection import train_test_split

corr_train, corr_test = train_test_split(
    correlations,
    train_size = 0.01,
    test_size = 0.99,
    random_state = 10,
    shuffle = True
)

print(
    len(corr_train),
    len(corr_test)
)

#We train a very small subset of data because Siamese Neural Networks compare each entry to another, leading to n^2 comparisons.
#So, 615 training observations will lead to 615^2 (378,225) comparisons being made.


615 60902


In [42]:
def combine(correlations, topics, content):
    #Drop/combine columns
    content["text"] = content["text"].fillna('')
    content = content.dropna()
    content_combined = content["language"] + " " + content["title"] + " " + content["description"] + " " + content["text"]
    content_combined = pd.DataFrame({"id":content["id"], "features":content_combined})

    topics["description"] = topics["description"].fillna('')
    topics = topics.dropna()
    topics_combined = topics["language"] + " " + topics["title"] + " " + topics["description"] + " " + topics["channel"]
    topics_combined = pd.DataFrame({"id":topics["id"], "features":topics_combined})

    #Explode correlations rows
    correlations["content_ids"] = correlations["content_ids"].str.split()
    correlations = correlations.explode("content_ids")

    #Merge
    merged = correlations.merge(topics_combined, how="inner", left_on="topic_id", right_on="id")
    merged = merged.reset_index().merge(content_combined, how="inner", left_on="content_ids", right_on="id", sort=False, suffixes=("_topics", "_content")).sort_values(axis=0, by="index")
    merged = merged.drop(["content_ids", "topic_id"], axis=1)

    #Split
    corr_topics = merged[['index', 'features_topics']]
    corr_topics.columns = ['index', 'features']
    corr_content = merged[['index', 'features_content']]
    corr_content.columns = ['index', 'features']

    corr_topic_ids = merged[['id_topics']]
    corr_content_ids = merged[['id_content']]

    return corr_topics, corr_content, corr_topic_ids, corr_content_ids



In [43]:
#Test case
from copy import deepcopy
topics_copy = deepcopy(topics)
content_copy = deepcopy(content)
topics_copy2 = deepcopy(topics)
content_copy2 = deepcopy(content)
topics_copy3 = deepcopy(topics)
content_copy3 = deepcopy(content)
corr_copy = deepcopy(correlations)

train_topics, train_content, train_topic_ids, train_content_ids = combine(corr_train, topics_copy, content_copy)
train_topics.head()
train_content.head()

test_topics, test_content, test_topic_ids, test_content_ids= combine(corr_test, topics_copy2, content_copy2)
test_topics.head()
print(test_content.head())

all_topics, all_content, all_topics_ids, all_content_ids = combine(corr_copy, topics_copy3, content_copy3)
print(all_topics.head())

   index                                           features
0     13  es Ondas En este recurso nos preguntamos: ¿Por...
2     14  es Las radiaciones en la vida cotidiana En est...
5     15  pt Curiosidades Matemáticas: O porquê da regra...
7     17  pt Para Saber Mais!: Reconhecimento de Números...
9     21  es Cómo escribir expresiones con variables y p...
    index                                           features
0       5  pt Entradas e saídas de uma função Entenda um ...
3       6  pt Entradas e saídas de uma função Entenda um ...
6       8                             en Transcripts  6e3ba4
7       9  bg Графики на експоненциални функции (Алгебра ...
10     10  bg Графики на експоненциални функции (Алгебра ...


In [44]:
print(all_topics_ids.head())

         id_topics
0   t_00068291e9a4
3   t_00068291e9a4
6   t_00069b63a70a
7   t_0006d41a73a8
10  t_0006d41a73a8


In [42]:
#Create TF Data Pipeline
import tensorflow as tf

#Train
train_topics_raw = tf.data.Dataset.from_tensor_slices(
    tf.cast(train_topics.features, tf.string)
)

train_content_raw = tf.data.Dataset.from_tensor_slices(
    tf.cast(train_content.features, tf.string)
)

#Test
test_topics_raw = tf.data.Dataset.from_tensor_slices(
    tf.cast(test_topics.features, tf.string)
)

test_content_raw = tf.data.Dataset.from_tensor_slices(
    tf.cast(test_content.features, tf.string)
)


In [43]:
print("tf version:",tf.__version__)

print("keras version:", tf.keras.__version__)

tf version: 2.9.1
keras version: 2.9.0


In [44]:
#Hyperparameters
VOCAB_SIZE = 100000
MAX_LEN = 50

In [46]:
#Custom standardization function
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#lang_code_dict is a dictionary of all languages present in the dataset
lang_dict = {
    "en":"english",
    "es":"spanish",
    "it":"italian",
    'pt':"portuguese",
    'mr':'marathi',
    'bg':'bulgarian',
    'gu':'gujarati',
    'sw':'swahili',
    'hi':'hindi',
    'ar':'arabic',
    'bn':'bengali',
    'as':'assamese',
    'zh':'chinese',
    'fr':'french',
    'km':'khmer',
    'pl':'polish',
    'ta':'tamil',
    'or':'oriya',
    'ru':'russian',
    'kn':'kannada',
    'swa':'swahili',
    'my':'burmese',
    'pnb':'punjabi',
    'fil':'filipino',
    'tr':'turkish',
    'te':'telugu',
    'ur':'urdu'
}

#supported_languages is a list of languages supported by the natural language tool kit (NLTK) module. 
supported_languages = stopwords.fileids()

def custom_standardize(input_string):
    #basic cleaning
    lower = tf.strings.lower(input_string, encoding='utf-8')
    no_stars = tf.strings.regex_replace(lower, "\*", " ")
    no_newline = tf.strings.regex_replace(no_stars, "\n", "")
    no_digits = tf.strings.regex_replace(no_newline, "\w*\d\w*","")

    #testing
    lang_code = input_string[0:2]
    if lang_code == 'en':
        for word in stopwords.words('english'):
            no_stopwords = tf.strings.regex_replace(no_digits, ' ' + word + ' ', " ")

    
    
    no_punctuations = tf.strings.regex_replace(no_digits, f"([{string.punctuation}])", r" ")

    #testing
    
    #remove stopwords
    # lang_code = input_string[0:2]
    # no_stopwords = ' ' + no_punctuations + ' '
    # if lang_code_dict[lang_code] in supported_languages:
    #     for word in stopwords.words(lang_code_dict[lang_code]):
    #         no_stopwords = tf.strings.regex_replace(no_punctuations, ' ' + word[0] + ' ', r" ")
    #     no_extra_space = tf.strings.regex_replace(no_stopwords, " +"," ")

    
    out = tf.strings.strip(no_stopwords)
    #out = tf.strings.strip(no_punctuations)
    
    return out

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Polar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
def test_standardize(input_string):
    input_string = tf.strings.lower(input_string, encoding='utf-8') #lowercase
    input_string = tf.strings.regex_replace(input_string, f"([{string.punctuation}])", r"") #remove punctuation
    input_string = tf.strings.regex_replace(input_string, '\n', "") #remove newlines
    input_string = tf.strings.regex_replace(input_string, '\w*\d\w*', "") #remove digits
    input_string = tf.strings.regex_replace(input_string, ' +', " ") #remove 2+ whitespaces
    input_string = tf.strings.strip(input_string) #remove leading and tailing whitespaces

    return input_string

In [48]:
#test case
input_string = tf.strings.lower("en HELLO,, my' ^nAme is Taro1234 and 1 1ike to build models!.,  ", encoding='utf-8')
print("input string: ", input_string)

output_string = test_standardize(input_string)
print("output string: ", output_string.numpy().decode('utf-8'))
#print("output string: ", output_string)

lang_dict['en'] in supported_languages

input string:  tf.Tensor(b"en hello,, my' ^name is taro1234 and 1 1ike to build models!.,  ", shape=(), dtype=string)
output string:  en hello name build models


True

In [108]:
#stopwords.words('english')

In [49]:
#Text vectorization layer
from tensorflow.keras.layers import TextVectorization

vectorize_layer = TextVectorization(
    standardize = "lower_and_strip_punctuation",
    #standardize = test_standardize,
    split = "whitespace",
    max_tokens = VOCAB_SIZE + 2,
    output_mode = 'int',
    output_sequence_length = MAX_LEN
)

In [51]:
#Adapt text vectorization layer
vectorize_layer.adapt(train_content.features)
#vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices

In [52]:
#Convert text to tf.string 
def convert_text_input(sample):
    text = sample
    text = tf.expand_dims(text, -1)  
    return tf.squeeze(vectorize_layer(text))

In [53]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_content_mapped = train_content_raw.map(convert_text_input, num_parallel_calls=AUTOTUNE)

train_topics_mapped = train_topics_raw.map(convert_text_input, num_parallel_calls=AUTOTUNE)

test_topics_mapped = test_topics_raw.map(convert_text_input, num_parallel_calls=AUTOTUNE)

test_content_mapped = test_content_raw.map(convert_text_input, num_parallel_calls=AUTOTUNE)


In [65]:
for each in train_topics_mapped.take(3):
  print(each)

tf.Tensor(
[ 3254  9418     1 51922     1     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0], shape=(50,), dtype=int64)
tf.Tensor(
[ 3254  9418     1 51922     1     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0], shape=(50,), dtype=int64)
tf.Tensor(
[ 3254  9418     1 51922     1     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0   

In [59]:
train_ds = tf.data.Dataset.zip(
    (train_topics_mapped, train_content_raw)
)

test_ds = tf.data.Dataset.zip(
    (test_topics_mapped, test_content_raw)
)

In [None]:
for x,y in train_ds.take(1):
  print("input (topic) x.shape: ", x.shape)
  print("output (content) y.shape: ", y.shape)
  print("input (topic) x: ", x)
  print("output (content) y: ",y)
#   input = " ".join([vocab[_] for _ in np.squeeze(x)])
#   output = id_to_category[y.numpy()]
#   print("x: input (review) in text: " , input)
#   print("y: output (category) in text: " , output)

In [60]:
#Finalize data pipeline
BATCH_SIZE = 64
BUFFER_SIZE = train_ds.cardinality().numpy()

train_ds = train_ds.shuffle(buffer_size=BUFFER_SIZE)\
    .batch(batch_size = BATCH_SIZE, drop_remainder=True)\
        .cache()\
            .prefetch(AUTOTUNE)

test_ds = test_ds.shuffle(buffer_size=BUFFER_SIZE)\
    .batch(batch_size = BATCH_SIZE, drop_remainder=True)\
        .cache()\
            .prefetch(AUTOTUNE)


In [61]:
train_ds.element_spec

(TensorSpec(shape=<unknown>, dtype=tf.int64, name=None),
 TensorSpec(shape=(64,), dtype=tf.string, name=None))

## III. Building the Model

In [69]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model

inp_content = Input((1, ), name='inp_content') 
inp_topics = Input((1, ), name='inp_topics')

snn = Sequential([
  Reshape((1, )),
  vectorize_layer,
  Embedding(VOCAB_SIZE, 256),
  GlobalAveragePooling1D(),
  Flatten(),
  Dense(64, activation='relu'),
])

feature_vector_content = snn(inp_content)
feature_vector_topics = snn(inp_topics)

concat = Concatenate()([feature_vector_topics, feature_vector_content])

dense = Dense(64, activation='relu')(concat)

output = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[inp_topics, inp_content], outputs=output)

model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inp_topics (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 inp_content (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 sequential_2 (Sequential)      (None, 64)           26419264    ['inp_content[0][0]',            
                                                                  'inp_topics[0][0]']             
                                                                                                  
 concatenate_1 (Concatenate)    (None, 128)          0           ['sequential_2[1][0]',     

## IV. Making Pairs

In [1]:
import itertools

def pair_data(topics, content):
    text_pairs, id_pairs = [], []
    tuples = [(x1, y1) for x1, y1 in zip(topics, content)]

    for t in itertools.product(tuples, tuples):
        item_A, item_B = t
        index_A, text_A = t[0]
        index_B, text_B = t[1]
    
    is_match = int(index_A == index_B)

    


In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy'])