# HW3 Example

In this example we'll use a combination of numeric features (review counts, lecture counts, course duration), a categorical feature (instructor, encoded here with the same sentence-embedding model), and text embeddings of the course name and description to find the most similar courses.

In [None]:
!pip install transformers
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# This is a massive model: the weights download is ~1.4GB, it works with
# 1024-dimensional vectors, and it stacks many layers on top of each other.
model = SentenceTransformer('stsb-roberta-large')

Downloading:   0%|          | 0.00/748 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Walk over every learnable tensor of the large model and print its dimensions,
# one per line, to get a feel for how big the network is.
for shape in (weights.shape for weights in model.parameters()):
  print(shape)

torch.Size([50265, 1024])
torch.Size([514, 1024])
torch.Size([1, 1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([4096, 1024])
torch.Size([4096])
torch.Size([1024, 4096])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([4096, 1024])
torch.Size([4096])
torch.Size([1024, 4096])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([102

In [None]:
# this model is much smaller: the weights download is only ~265MB
smaller_model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
import pandas as pd
# download from github
df = pd.read_csv('https://raw.githubusercontent.com/ychennay/dso-560-nlp-text-analytics/main/datasets/top5000_udemy.csv',
                 usecols=['course_name', 'course description', 'reviews_count', 'lectures_count', 'course_duration', 'instructor'])

# extract some numeric fields
# A bare `(\d+)` only captures the digit run directly in front of the keyword, so
# a value written as "1,234 reviews" would be read as "234" and "7.5 total hours"
# as "5". The patterns below accept thousands separators / a decimal part, and
# the separators are stripped so pd.to_numeric can parse the result later.
# NOTE(review): assumes the CSV formats numbers this way - confirm against the data.
# expand=False makes str.extract return a Series, so .str.replace can be chained.
df["reviews"] = (df.reviews_count.str.extract(r'(\d[\d,]*) reviews', expand=False)
                 .str.replace(',', '', regex=False))
df["lectures"] = (df.lectures_count.str.extract(r'(\d[\d,]*) lectures', expand=False)
                  .str.replace(',', '', regex=False))
df["duration"] = df.course_duration.str.extract(r'(\d+(?:\.\d+)?) total hours', expand=False)
df = df.drop(columns = ["course_duration", "lectures_count", "reviews_count"])

# Rows where any field is missing (including values the patterns could not parse)
# are dropped before sampling. The fixed seed makes "Restart & Run All" pick the
# same 1000 courses every time, so the results below are reproducible.
sampled_df = df.dropna().sample(1000, random_state=42)
text_df = sampled_df[["course_name", "course description", "instructor"]]

In [None]:
# The parsed fields are still text (they came out of a regex), so cast each of
# the three extracted columns to a numeric dtype before using them as features.
for parsed_column in ("reviews", "lectures", "duration"):
    sampled_df[parsed_column] = pd.to_numeric(sampled_df[parsed_column])

In [None]:
# Our "normal" (non-embedding) features. Only numeric columns are selected here,
# so get_dummies passes them through unchanged; it would one-hot encode any
# string/categorical column added to this list.
numeric_columns = ["reviews", "lectures", "duration"]
other_features = pd.get_dummies(sampled_df[numeric_columns])
other_features

Unnamed: 0,reviews,lectures,duration
4263,167,90,5
4863,131,49,6
2354,453,70,36
2252,483,155,5
3626,221,118,5
...,...,...,...
1721,716,73,4
3806,202,34,4
2835,327,131,5
4405,156,83,7


## Generate Embeddings for Course Name / Description
Encode the course name, description, and instructor separately, then concatenate the three embeddings side by side.

In [None]:
# use the much smaller model to perform embedding lookup
# encode() accepts a whole list of sentences and batches them internally, which
# is much faster than calling it once per row. Plain Python lists are passed
# (not pandas Series) because encode() indexes its input by position.
# convert_to_numpy=True returns a 2-D float array (one row per sentence) that
# pandas can wrap directly, with no tensor -> float conversion needed.
# NOTE(review): batching may change values in the last decimal places compared
# with encoding one sentence at a time.
titles = smaller_model.encode(text_df['course_name'].tolist(), convert_to_numpy=True)
descriptions = smaller_model.encode(text_df['course description'].tolist(), convert_to_numpy=True)
instructors = smaller_model.encode(text_df['instructor'].tolist(), convert_to_numpy=True)

# combine the title, description and instructor embeddings
# axis = 1 to combine the embeddings horizontally, instead of stacking them on top of each other vertically
# (each frame keeps its own 0..dim-1 column labels, so the labels repeat three times)
embeddings = pd.concat([pd.DataFrame(titles, dtype = 'float32'),
                        pd.DataFrame(descriptions, dtype = 'float32'),
                        pd.DataFrame(instructors, dtype = 'float32')],
                        axis = 1)

In [None]:
embeddings # 1000 x 2304 (768 is the embedding size of the model; we use it three times - for the name, the description and the instructor - so 3 * 768 = 2304 columns)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.005047,0.013253,-0.109376,-0.200288,-0.179692,-0.124724,0.207669,-0.074435,-0.407029,0.258507,...,-0.117422,0.259581,-0.471601,-0.222492,0.421345,-0.357175,0.152319,-0.046407,-0.328001,-0.081029
1,-0.307051,-0.348938,-0.080781,-0.192749,-0.208828,-0.002182,0.024623,0.261838,-0.699893,-0.116687,...,0.082102,0.031171,0.213525,0.109580,0.188346,0.103721,0.341853,0.190267,0.014255,0.426000
2,-0.110549,-0.171605,-0.030344,0.155250,-0.022914,0.081396,0.006876,-0.050959,-0.054874,0.227270,...,-0.038580,0.310873,-0.092411,-0.252908,0.122696,0.256809,-0.284269,0.375727,-0.069933,-0.325357
3,0.187452,0.207181,0.076397,-0.434158,-0.049060,-0.277880,-0.249488,-0.122024,-0.358084,0.184457,...,0.078469,-0.157011,-0.065753,-0.756604,-0.175848,0.212152,0.233283,0.354416,-0.004465,0.358377
4,-0.128318,0.270376,0.296010,-0.131425,-0.181357,-0.175178,0.112655,0.014697,-0.106608,0.007593,...,0.029464,0.097343,0.106686,-0.213647,0.499516,0.242977,0.149255,0.081656,0.218749,0.049777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.187249,-0.160995,0.125908,-0.094298,0.025104,-0.209405,0.015369,0.116139,0.097772,-0.181167,...,0.274648,0.118046,-0.237457,-0.106877,0.518363,0.080973,0.149183,-0.197172,0.250388,-0.373518
996,0.012472,0.323950,0.063590,-0.284731,0.235582,-0.302728,0.274991,0.308129,-0.054768,-0.278447,...,-0.031014,0.317882,0.233156,-0.519181,0.160875,-0.140532,0.378707,0.518979,-0.258481,0.400863
997,0.008576,-0.054823,0.391985,-0.300619,-0.123331,-0.444911,0.172341,-0.353035,-0.211463,-0.168003,...,-0.169077,-0.021608,-0.550688,-0.127944,0.257627,-0.255048,-0.299338,0.258165,-0.291710,-0.025748
998,-0.176212,-0.079481,0.009960,-0.173600,0.166980,-0.013456,-0.146478,0.126690,0.334601,0.087091,...,0.149605,-0.584060,0.317148,-0.387050,0.448120,-0.009111,0.131656,0.352669,-0.431776,0.069459


In [None]:
# Both indexes are reset so the frames are glued together by row position:
# row i of the numeric features is the same course as row i of the embeddings.
combined_df = pd.concat([other_features.reset_index(drop=True), embeddings.reset_index(drop=True)], axis=1)
combined_df # this is now a 1000 x 2307 dataframe (3 numeric columns + 2304 embedding columns), a mixture of embeddings + normal features!

Unnamed: 0,reviews,lectures,duration,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
0,167,90,5,0.005047,0.013253,-0.109376,-0.200288,-0.179692,-0.124724,0.207669,...,-0.117422,0.259581,-0.471601,-0.222492,0.421345,-0.357175,0.152319,-0.046407,-0.328001,-0.081029
1,131,49,6,-0.307051,-0.348938,-0.080781,-0.192749,-0.208828,-0.002182,0.024623,...,0.082102,0.031171,0.213525,0.109580,0.188346,0.103721,0.341853,0.190267,0.014255,0.426000
2,453,70,36,-0.110549,-0.171605,-0.030344,0.155250,-0.022914,0.081396,0.006876,...,-0.038580,0.310873,-0.092411,-0.252908,0.122696,0.256809,-0.284269,0.375727,-0.069933,-0.325357
3,483,155,5,0.187452,0.207181,0.076397,-0.434158,-0.049060,-0.277880,-0.249488,...,0.078469,-0.157011,-0.065753,-0.756604,-0.175848,0.212152,0.233283,0.354416,-0.004465,0.358377
4,221,118,5,-0.128318,0.270376,0.296010,-0.131425,-0.181357,-0.175178,0.112655,...,0.029464,0.097343,0.106686,-0.213647,0.499516,0.242977,0.149255,0.081656,0.218749,0.049777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,716,73,4,0.187249,-0.160995,0.125908,-0.094298,0.025104,-0.209405,0.015369,...,0.274648,0.118046,-0.237457,-0.106877,0.518363,0.080973,0.149183,-0.197172,0.250388,-0.373518
996,202,34,4,0.012472,0.323950,0.063590,-0.284731,0.235582,-0.302728,0.274991,...,-0.031014,0.317882,0.233156,-0.519181,0.160875,-0.140532,0.378707,0.518979,-0.258481,0.400863
997,327,131,5,0.008576,-0.054823,0.391985,-0.300619,-0.123331,-0.444911,0.172341,...,-0.169077,-0.021608,-0.550688,-0.127944,0.257627,-0.255048,-0.299338,0.258165,-0.291710,-0.025748
998,156,83,7,-0.176212,-0.079481,0.009960,-0.173600,0.166980,-0.013456,-0.146478,...,0.149605,-0.584060,0.317148,-0.387050,0.448120,-0.009111,0.131656,0.352669,-0.431776,0.069459


In [None]:
# get all the similarities at once
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): the numeric columns are unscaled (values in the tens/hundreds)
# while the embedding values are small fractions, so the numeric columns likely
# dominate the cosine similarity - consider standardizing them first.
similarity_matrix = cosine_similarity(combined_df.values)
similarity_matrix # 1000 x 1000 (one row and one column per sampled course)

array([[1.        , 0.98590517, 0.93902745, ..., 0.99128515, 0.99703731,
        0.91586688],
       [0.98590517, 1.        , 0.97241822, ..., 0.99383816, 0.98632648,
        0.95633753],
       [0.93902745, 0.97241822, 1.        , ..., 0.97155383, 0.94054955,
        0.99464435],
       ...,
       [0.99128515, 0.99383816, 0.97155383, ..., 1.        , 0.99063692,
        0.95663368],
       [0.99703731, 0.98632648, 0.94054955, ..., 0.99063692, 1.        ,
        0.91662971],
       [0.91586688, 0.95633753, 0.99464435, ..., 0.95663368, 0.91662971,
        1.        ]])

In [None]:
# each similarity_df[i][j] is the similarity between course i and course j
# (i and j are 0-based row positions within the sample, not the labels of the original dataset)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1.000000,0.985905,0.939027,0.980522,0.998006,0.901793,0.925322,0.896552,0.938266,0.891214,...,0.988170,0.989578,0.878806,0.990123,0.965513,0.920932,0.945159,0.991285,0.997037,0.915867
1,0.985905,1.000000,0.972418,0.991959,0.986661,0.945986,0.962555,0.942106,0.971574,0.939123,...,0.994807,0.994467,0.929221,0.994542,0.987309,0.959791,0.976932,0.993838,0.986326,0.956338
2,0.939027,0.972418,1.000000,0.984896,0.940954,0.991818,0.996182,0.990618,0.997271,0.989397,...,0.975220,0.974533,0.985405,0.969399,0.993520,0.995633,0.995808,0.971554,0.940550,0.994644
3,0.980522,0.991959,0.984896,1.000000,0.982370,0.967353,0.980307,0.964326,0.985861,0.960566,...,0.996102,0.994373,0.953291,0.992529,0.996054,0.977912,0.987229,0.996816,0.980104,0.974983
4,0.998006,0.986661,0.940954,0.982370,1.000000,0.905021,0.928082,0.899940,0.940014,0.894330,...,0.988695,0.989970,0.882345,0.989837,0.966872,0.923768,0.946573,0.992605,0.997249,0.918565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.920932,0.959791,0.995633,0.977912,0.923768,0.998756,0.999776,0.998108,0.998330,0.997275,...,0.964858,0.959346,0.995098,0.956442,0.989229,1.000000,0.994950,0.960502,0.921746,0.999673
996,0.945159,0.976932,0.995808,0.987229,0.946573,0.990128,0.995558,0.988459,0.997681,0.987721,...,0.980151,0.975632,0.982822,0.974320,0.995294,0.994950,1.000000,0.975093,0.946381,0.994089
997,0.991285,0.993838,0.971554,0.996816,0.992605,0.946645,0.963669,0.942783,0.971797,0.938249,...,0.997319,0.996590,0.929110,0.995725,0.988602,0.960502,0.975093,1.000000,0.990637,0.956634
998,0.997037,0.986326,0.940550,0.980104,0.997249,0.902680,0.926024,0.897530,0.938542,0.892499,...,0.987791,0.989712,0.879949,0.989828,0.965651,0.921746,0.946381,0.990637,1.000000,0.916630


In [None]:
# Reshape the square matrix into "long" format, one row per (course, course) pair:
#   1. stack() folds the columns into a second index level,
#   2. reset_index() turns both index levels into columns and names the values "similarity",
#   3. rename() gives the two former index levels readable names.
stacked_scores = similarity_df.stack()
pair_table = stacked_scores.reset_index(name="similarity")
similarities = pair_table.rename(columns={"level_0": "course1", "level_1": "course2"})

similarities

Unnamed: 0,course1,course2,similarity
0,0,0,1.000000
1,0,1,0.985905
2,0,2,0.939027
3,0,3,0.980522
4,0,4,0.998006
...,...,...,...
999995,999,995,0.999673
999996,999,996,0.994089
999997,999,997,0.956634
999998,999,998,0.916630


In [None]:
# Remove the self-comparisons (the diagonal of the similarity matrix) by comparing
# the two course positions rather than thresholding the score: a
# `similarity < 0.99999` cut-off would also hide genuinely (near-)identical
# course pairs, which are exactly the most similar ones. Then rank the remaining
# pairs from most to least similar.
most_similar_courses = similarities[similarities["course1"] != similarities["course2"]].sort_values(by="similarity", ascending=False)

In [None]:
# Relabel the sampled rows 0..n-1 so they can be looked up with the positions
# used in the similarity table. reset_index() keeps the old labels in an "index"
# column; the guard makes this cell safe to re-run (a second bare reset_index()
# would add a spurious "level_0" column, and a third would raise).
if "index" not in sampled_df.columns:
    sampled_df = sampled_df.reset_index()

## Most Similar Courses Based on Combined Numeric Features + Embeddings

In [None]:
# Print the name and description of both courses for the 10 most similar pairs.
# Every pair appears twice in the table, as (i, j) and (j, i); keep only the
# i < j copy. Comparing positions is exact, whereas de-duplicating on the float
# `similarity` value would also collapse different pairs that share a score.
unique_pairs = most_similar_courses[most_similar_courses["course1"] < most_similar_courses["course2"]]
for row in unique_pairs.head(10).itertuples():
  # fetch each course row once instead of once per printed field
  first_course = sampled_df.loc[row.course1]
  second_course = sampled_df.loc[row.course2]
  print(first_course["course_name"])
  print(first_course["course description"])
  print("---" * 50)
  print(second_course["course_name"])
  print(second_course["course description"])
  print("\n" * 2)

Learn Database Design with MySQL
A Complete Course to Master Database Design using MySQL
------------------------------------------------------------------------------------------------------------------------------------------------------
Rest API Testing (Automation) from Scratch-Rest Assured Java
"Top Ranked #1 Rest API Test Automation & postman tutorial with Java from Basics to Framework with Real time examples



Django 2.2 & Python | The Ultimate Web Development Bootcamp
Build three complete websites, learn back and front-end web development, and publish your site online with DigitalOcean
------------------------------------------------------------------------------------------------------------------------------------------------------
Angular Testing Masterclass (Angular 14)
A complete guide to Angular 14 Unit Testing and End to End (E2E) Testing, including Testing best practices and CI



JavaScript - The Complete Guide 2023 (Beginner + Advanced)
Modern JavaScript from the beg

## Notice If We Just Use Embeddings We Get "More Similar" Types of Courses

In [None]:
# Repeat the ranking using only the text embeddings (no numeric columns).
similarity_matrix = cosine_similarity(embeddings.values)
similarity_df = pd.DataFrame(similarity_matrix)
# long format: one row per (course1, course2) pair with its similarity
similarities = similarity_df.stack().reset_index(name="similarity").rename(columns={"level_0": "course1", "level_1": "course2"})

# Keep each unordered pair exactly once and drop the self-comparisons by
# requiring course1 < course2. This is exact, unlike a `similarity < 0.99999`
# threshold (which would also hide genuinely identical courses) or
# de-duplicating on the float score (which would collapse distinct pairs that tie).
most_similar_courses = similarities[similarities["course1"] < similarities["course2"]].sort_values(by="similarity", ascending=False)

# Print the name and description of both courses for the 10 most similar pairs.
for row in most_similar_courses.head(10).itertuples():
  # fetch each course row once instead of once per printed field
  first_course = sampled_df.loc[row.course1]
  second_course = sampled_df.loc[row.course2]
  print(first_course["course_name"])
  print(first_course["course description"])
  print("---" * 50)
  print(second_course["course_name"])
  print(second_course["course description"])
  print("\n" * 2)

iOS 10 and Xcode 8 - Complete Swift 3 & Objective-C Course
A Complete iOS 10 and Xcode 8 Course with Swift 3 & Objective-C
------------------------------------------------------------------------------------------------------------------------------------------------------
iOS 12 & Xcode 10 - Complete Swift 4.2 & Objective-C Course
A Complete iOS 12 and Xcode 10 Course with Swift 4.2 & Objective-C



iOS 12 & Xcode 10 - Complete Swift 4.2 & Objective-C Course
A Complete iOS 12 and Xcode 10 Course with Swift 4.2 & Objective-C
------------------------------------------------------------------------------------------------------------------------------------------------------
iOS 11 and Xcode 9 - Complete Swift 4 & Objective-C Course
A Complete iOS 11 and Xcode 9 Course with Swift 4 & Objective-C



iOS 11 and Xcode 9 - Complete Swift 4 & Objective-C Course
A Complete iOS 11 and Xcode 9 Course with Swift 4 & Objective-C
---------------------------------------------------------------------