In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder






In [2]:
# Load the dataset
df = pd.read_csv("course_info.csv")

df.columns = df.columns.str.strip()

print(df.columns)


Index(['id', 'title', 'is_paid', 'price', 'headline', 'num_subscribers',
       'avg_rating', 'num_reviews', 'num_comments', 'num_lectures',
       'content_length_min', 'published_time', 'last_update_date', 'category',
       'subcategory', 'topic', 'language', 'course_url', 'instructor_name',
       'instructor_url'],
      dtype='object')


In [3]:
#RUN ONLY WHEN REQUIRED
df = df[df['language'] == 'English']
columns_to_drop = ['id', 'course_url', 'instructor_url', 'published_time', 'last_update_date','instructor_name','language']
df = df.drop(columns=columns_to_drop)
df = df.dropna()



In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 123351 entries, 0 to 209733
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   title               123351 non-null  object 
 1   is_paid             123351 non-null  bool   
 2   price               123351 non-null  float64
 3   headline            123351 non-null  object 
 4   num_subscribers     123351 non-null  float64
 5   avg_rating          123351 non-null  float64
 6   num_reviews         123351 non-null  float64
 7   num_comments        123351 non-null  float64
 8   num_lectures        123351 non-null  float64
 9   content_length_min  123351 non-null  float64
 10  category            123351 non-null  object 
 11  subcategory         123351 non-null  object 
 12  topic               123351 non-null  object 
dtypes: bool(1), float64(7), object(5)
memory usage: 12.4+ MB


In [5]:
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

df = df.dropna()

print(df.head())
print(f"Remaining rows: {len(df)}")


                                               title  is_paid   price   
0             Online Vegan Vegetarian Cooking School     True   24.99  \
1         The Lean Startup Talk at Stanford E-Corner    False    0.00   
2  How To Become a Vegan, Vegetarian, or Flexitarian     True   19.99   
3                               How to Train a Puppy     True  199.99   
4                      Web Design from the Ground Up     True  159.99   

                                            headline  num_subscribers   
0  Learn to cook delicious vegan recipes. Filmed ...           2231.0  \
1  Debunking Myths of Entrepreneurship A startup ...          26474.0   
2  Get the tools you need for a lifestyle change ...           1713.0   
3  Train your puppy the right way with Dr. Ian Du...           4988.0   
4  Learn web design online: Everything you need t...           1266.0   

   avg_rating  num_reviews  num_comments  num_lectures  content_length_min   
0        3.75        134.0          42.0    

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 123351 entries, 0 to 209733
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   title               123351 non-null  object 
 1   is_paid             123351 non-null  bool   
 2   price               123351 non-null  float64
 3   headline            123351 non-null  object 
 4   num_subscribers     123351 non-null  float64
 5   avg_rating          123351 non-null  float64
 6   num_reviews         123351 non-null  float64
 7   num_comments        123351 non-null  float64
 8   num_lectures        123351 non-null  float64
 9   content_length_min  123351 non-null  float64
 10  category            123351 non-null  object 
 11  subcategory         123351 non-null  object 
 12  topic               123351 non-null  object 
dtypes: bool(1), float64(7), object(5)
memory usage: 12.4+ MB


## <b>Model preprocessing and training</b>


In [7]:
# weight factors for 'category' and 'subcategory'
category_weight = 5
subcategory_weight = 4

# numerical data normalization
numerical_columns = ['price', 'num_subscribers', 'avg_rating', 'num_reviews', 'num_lectures', 'content_length_min']
scaler = StandardScaler()
numerical_data = df[numerical_columns].copy()  
normalized_numerical_features = scaler.fit_transform(numerical_data)

# categorical data encoding
categorical_columns = ['category', 'subcategory', 'is_paid']
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_categorical_data = encoder.fit_transform(df[categorical_columns])

# extract specific parts of the encoded categorical data for weighted scaling
encoded_category = encoded_categorical_data[:, :len(df['category'].unique())]  # 'category' columns
encoded_subcategory = encoded_categorical_data[:, len(df['category'].unique()):(len(df['category'].unique()) + len(df['subcategory'].unique()))]  # 'subcategory' columns
encoded_is_paid = encoded_categorical_data[:, -1]  # 'is_paid' column

# weights to 'category' and 'subcategory' columns
weighted_category = encoded_category * category_weight
weighted_subcategory = encoded_subcategory * subcategory_weight

# combine the weighted categorical data with the rest
encoded_categorical_df = np.hstack([weighted_category, weighted_subcategory, encoded_is_paid.reshape(-1, 1)])

# text tokenization for Topic Column
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['topic'])
topic_sequences = tokenizer.texts_to_sequences(df['topic'])

# padding sequences to ensure consistent length 
topic_padded = pad_sequences(topic_sequences, padding='post')

combined_features = np.hstack([
    topic_padded,  
    normalized_numerical_features,  
    encoded_categorical_df  
])





## Using knn along with neural net

In [16]:
# model = models.Sequential([
#     layers.Input(shape=(combined_features.shape[1],)), 
#     layers.Dense(128, activation='relu'), 
#     layers.Dense(64, activation='relu'),  
#     layers.Dense(1, activation='sigmoid')  
# ])

# model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

In [17]:
# target = combined_features

# model.fit(combined_features, target, epochs=1, batch_size=32)

In [18]:
# from sklearn.neighbors import NearestNeighbors

# def recommend_courses_knn(course_index, features, data, top_n=5):
#     """
#     Recommends courses similar to the given course using nearest neighbors.
#     :param course_index: The index of the input course to compare with.
#     :param features: The feature set for all courses.
#     :param data: The original dataset containing course information.
#     :param top_n: The number of top recommendations to return.
#     :return: DataFrame containing the top N recommended courses.
#     """
    
#     knn = NearestNeighbors(n_neighbors=top_n + 1, metric='euclidean')  # +1 to exclude the input course itself
#     knn.fit(features)
    
#     # Find the nearest neighbors of the input course (including the input course itself)
#     distances, indices = knn.kneighbors(features[course_index].reshape(1, -1))
    
#     top_indices = indices.flatten()[1:] # Exclude the first element (the input course itself)

#     input_course = data.iloc[course_index]
#     print(f"Input Course (Course at index {course_index}):")
#     print(f"Course Title: {input_course['title']}")
#     print(f"Category: {input_course['category']}, Subcategory: {input_course['subcategory']}")
#     print(f"Price: {input_course['price']}, Rating: {input_course['avg_rating']}")
#     print("-" * 50)

#     print("Recommended Courses:\n")
#     for idx in top_indices:
#         course_details = data.iloc[idx]
#         print(f"Course Title: {course_details['title']}")
#         print(f"Category: {course_details['category']}, Subcategory: {course_details['subcategory']}")
#         print(f"Price: {course_details['price']}, Rating: {course_details['avg_rating']}")
#         print("-" * 50)

#     recommended_courses = data.iloc[top_indices]
#     return recommended_courses

# recommended_courses_knn = recommend_courses_knn(999, combined_features, df)


## New Method

In [19]:
from tensorflow.keras import layers, models
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Define the embedding model
embedding_dim = 64  # Choose a suitable embedding size

embedding_model = models.Sequential([
    layers.Input(shape=(combined_features.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(embedding_dim, activation='linear')  # Embedding layer
])

# Generate embeddings
embeddings = embedding_model.predict(combined_features)

# Normalize embeddings for cosine similarity
normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)




In [21]:
def recommend_similar_courses(course_index, embeddings, data, top_n=5):
    """
    Recommend courses similar to a given course based on learned embeddings.
    :param course_index: Index of the input course.
    :param embeddings: Pre-computed embeddings for all courses.
    :param data: Original dataset containing course details.
    :param top_n: Number of recommendations to return.
    :return: DataFrame with recommended courses.
    """
    input_embedding = embeddings[course_index]
    similarities = cosine_similarity(input_embedding.reshape(1, -1), embeddings).flatten()
    
    # Get top N most similar courses (excluding the input course itself)
    similar_indices = similarities.argsort()[-(top_n + 1):-1][::-1]
    
    input_course = data.iloc[course_index]
    print(f"Input Course: {input_course['title']}")
    print(f"Recommended Courses:\n")
    
    for idx in similar_indices:
        course_details = data.iloc[idx]
        print(f"Course Title: {course_details['title']}, Similarity: {similarities[idx]:.2f}")
    
    return data.iloc[similar_indices]

# Example usage
recommend_similar_courses(732, normalized_embeddings, df)


Input Course: Creative Watercolours Beginners
Recommended Courses:

Course Title: How to Paint a Realistic Robin Bird in Watercolour, Similarity: 1.00
Course Title: How to Paint Watercolour - An Entirely New Approach, Similarity: 1.00
Course Title: Watercolor realistic orchid painting: step by step workshop, Similarity: 1.00
Course Title: How to Simplify in Watercolor - Part I, Similarity: 1.00
Course Title: Watercolor Course Paint this Window Seascape, Similarity: 1.00


Unnamed: 0,title,is_paid,price,headline,num_subscribers,avg_rating,num_reviews,num_comments,num_lectures,content_length_min,category,subcategory,topic
79682,How to Paint a Realistic Robin Bird in Waterco...,False,0.0,Learn to paint wildlife in watercolor!,6356.0,4.95,98.0,30.0,5.0,180.0,Lifestyle,Arts & Crafts,Watercolor Painting
67645,How to Paint Watercolour - An Entirely New App...,False,0.0,You've never seen watercolour done this way be...,8419.0,4.45,276.0,53.0,8.0,107.0,Lifestyle,Arts & Crafts,Watercolor Painting
14950,Watercolor realistic orchid painting: step by ...,False,0.0,Watch me creating realistic botanical illustra...,11254.0,4.35,235.0,62.0,11.0,85.0,Lifestyle,Arts & Crafts,Watercolor Painting
132419,How to Simplify in Watercolor - Part I,False,0.0,The Secret for Creating a BEAUTIFUL Impression,1701.0,5.0,82.0,16.0,8.0,120.0,Lifestyle,Arts & Crafts,Watercolor Painting
15430,Watercolor Course Paint this Window Seascape,False,0.0,Project - Create a lovely Watercolor Painting ...,6845.0,4.4,228.0,74.0,16.0,62.0,Lifestyle,Arts & Crafts,Watercolor Painting
