In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('Datasets/online_courses_updated.csv')
df = df.drop(columns=['Unnamed: 0'])

In [3]:
df.shape

(100000, 16)

In [4]:
df.head()

Unnamed: 0,user_id,course_id,course_name,instructor,course_duration_hours,certification_offered,difficulty_level,rating,enrollment_numbers,course_price,feedback_score,study_material_available,time_spent_hours,previous_courses_taken,course_images,instructor_images
0,15796,9366,Python for Beginners,Emma Harris,39.1,Yes,Beginner,5.0,21600,317.5,0.797,Yes,17.6,4,https://images.unsplash.com/photo-152637909509...,https://images.pexels.com/photos/712521/pexels...
1,861,1928,Cybersecurity for Professionals,Alexander Young,36.3,Yes,Beginner,4.3,15379,40.99,0.77,Yes,28.97,9,https://images.pexels.com/photos/577585/pexels...,https://images.unsplash.com/photo-150064876779...
2,38159,9541,DevOps and Continuous Deployment,Dr. Mia Walker,13.4,Yes,Beginner,3.9,6431,380.81,0.772,Yes,52.44,4,https://images.pexels.com/photos/270404/pexels...,https://images.pexels.com/photos/733872/pexels...
3,44733,3708,Project Management Fundamentals,Benjamin Lewis,58.3,Yes,Beginner,3.1,48245,342.8,0.969,No,22.29,6,https://images.unsplash.com/photo-157316471371...,https://images.unsplash.com/photo-151908536075...
4,11285,3361,Ethical Hacking Masterclass,Daniel White,30.8,Yes,Beginner,2.8,34556,381.01,0.555,Yes,22.01,5,https://images.unsplash.com/photo-156398676860...,https://images.pexels.com/photos/2379004/pexel...


In [5]:
max_enrollments, min_enrollments = df['enrollment_numbers'].max(), df['enrollment_numbers'].min()
threshold_score = max_enrollments * 0.80
threshold_score

39999.200000000004

In [6]:
df['course_name'].unique(), df['course_name'].nunique(), df['instructor'].unique(), df['instructor'].nunique()

(array(['Python for Beginners', 'Cybersecurity for Professionals',
        'DevOps and Continuous Deployment',
        'Project Management Fundamentals', 'Ethical Hacking Masterclass',
        'Networking and System Administration',
        'Personal Finance and Wealth Building',
        'Blockchain and Decentralized Applications',
        'Graphic Design with Canva', 'Fitness and Nutrition Coaching',
        'Public Speaking Mastery', 'Photography and Video Editing',
        'Advanced Machine Learning', 'Game Development with Unity',
        'Cloud Computing Essentials', 'Mobile App Development with Swift',
        'Data Visualization with Tableau',
        'Stock Market and Trading Strategies',
        'Fundamentals of Digital Marketing', 'AI for Business Leaders'],
       dtype=object),
 20,
 array(['Emma Harris', 'Alexander Young', 'Dr. Mia Walker',
        'Benjamin Lewis', 'Daniel White', 'Dr. John Smith',
        'Dr. Robert Davis', 'Liam Adams', 'Prof. Emily Johnson',
        '

In [7]:
threshold_df = df[df['enrollment_numbers']>threshold_score]
threshold_df.shape

(20041, 16)

In [8]:
threshold_df['course_name'].unique(), threshold_df['course_name'].nunique(), threshold_df['instructor'].unique(), threshold_df['instructor'].nunique()

(array(['Project Management Fundamentals',
        'Networking and System Administration',
        'Photography and Video Editing', 'Python for Beginners',
        'Fitness and Nutrition Coaching', 'Graphic Design with Canva',
        'Data Visualization with Tableau', 'Advanced Machine Learning',
        'Stock Market and Trading Strategies',
        'Cybersecurity for Professionals',
        'DevOps and Continuous Deployment',
        'Mobile App Development with Swift',
        'Personal Finance and Wealth Building',
        'Game Development with Unity', 'AI for Business Leaders',
        'Ethical Hacking Masterclass', 'Cloud Computing Essentials',
        'Public Speaking Mastery',
        'Blockchain and Decentralized Applications',
        'Fundamentals of Digital Marketing'], dtype=object),
 20,
 array(['Benjamin Lewis', 'Dr. Robert Davis', 'Daniel White',
        'Charlotte King', 'Prof. Emily Johnson', 'James Clark',
        'Olivia Taylor', 'Michael Brown', 'Jessica Martinez

In [9]:
threshold_df.isnull().sum()

user_id                     0
course_id                   0
course_name                 0
instructor                  0
course_duration_hours       0
certification_offered       0
difficulty_level            0
rating                      0
enrollment_numbers          0
course_price                0
feedback_score              0
study_material_available    0
time_spent_hours            0
previous_courses_taken      0
course_images               0
instructor_images           0
dtype: int64

In [10]:
threshold_df['text_features'] = threshold_df['course_name'] + ' with ' + threshold_df['instructor']

In [11]:
threshold_df.head()

Unnamed: 0,user_id,course_id,course_name,instructor,course_duration_hours,certification_offered,difficulty_level,rating,enrollment_numbers,course_price,feedback_score,study_material_available,time_spent_hours,previous_courses_taken,course_images,instructor_images,text_features
3,44733,3708,Project Management Fundamentals,Benjamin Lewis,58.3,Yes,Beginner,3.1,48245,342.8,0.969,No,22.29,6,https://images.unsplash.com/photo-157316471371...,https://images.unsplash.com/photo-151908536075...,Project Management Fundamentals with Benjamin ...
6,16851,7887,Networking and System Administration,Dr. Robert Davis,44.9,Yes,Beginner,4.9,41050,389.32,0.893,Yes,15.66,3,https://images.unsplash.com/photo-157316471398...,https://images.unsplash.com/photo-1545167622-3...,Networking and System Administration with Dr. ...
14,770,534,Photography and Video Editing,Daniel White,74.0,Yes,Advanced,4.1,40437,388.7,0.62,Yes,14.13,3,https://images.unsplash.com/photo-151603506937...,https://images.pexels.com/photos/2379004/pexel...,Photography and Video Editing with Daniel White
16,5312,3455,Python for Beginners,Charlotte King,11.1,Yes,Beginner,4.6,43655,426.0,0.966,Yes,22.8,5,https://images.unsplash.com/photo-152637909509...,https://images.pexels.com/photos/774909/pexels...,Python for Beginners with Charlotte King
22,6397,1759,Fitness and Nutrition Coaching,Prof. Emily Johnson,88.5,No,Beginner,3.6,44312,178.6,0.598,No,14.45,1,https://images.unsplash.com/photo-157101961345...,https://images.pexels.com/photos/38554/girl-pe...,Fitness and Nutrition Coaching with Prof. Emil...


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
text_matrix = tfidf.fit_transform(threshold_df['text_features'])

In [30]:
text_matrix.shape

(20041, 100)

In [29]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
num_features = ['rating', 'course_price', 'feedback_score', 'time_spent_hours', 'previous_courses_taken']
cat_features = ['difficulty_level', 'certification_offered', 'study_material_available']

In [14]:
preprocessor = ColumnTransformer([
    ('num', MinMaxScaler(), num_features),
    ('cat', OneHotEncoder(), cat_features)
])

In [15]:
other_features = preprocessor.fit_transform(threshold_df)

In [16]:
other_features

array([[0.525     , 0.67249318, 0.96251511, ..., 1.        , 1.        ,
        0.        ],
       [0.975     , 0.76941186, 0.87061669, ..., 1.        , 0.        ,
        1.        ],
       [0.775     , 0.76812017, 0.54050786, ..., 1.        , 0.        ,
        1.        ],
       ...,
       [1.        , 0.22315048, 0.5719468 , ..., 0.        , 0.        ,
        1.        ],
       [0.925     , 0.29267276, 0.39903265, ..., 1.        , 0.        ,
        1.        ],
       [0.725     , 0.9694577 , 0.80411125, ..., 1.        , 0.        ,
        1.        ]])

In [31]:
other_features.shape

(20041, 12)

In [17]:
from scipy.sparse import hstack
# Combine text and other features
final_features = hstack([text_matrix, other_features])

In [18]:
final_features

<20041x112 sparse matrix of type '<class 'numpy.float64'>'
	with 262344 stored elements in COOrdinate format>

In [28]:
final_features.shape

(20041, 112)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(final_features, final_features)

In [20]:
cosine_sim

array([[1.        , 0.6516094 , 0.44682085, ..., 0.42864686, 0.5686535 ,
        0.65156802],
       [0.6516094 , 1.        , 0.65268593, ..., 0.62624835, 0.78809872,
        0.83196338],
       [0.44682085, 0.65268593, 1.        , ..., 0.42423199, 0.59708931,
        0.64964126],
       ...,
       [0.42864686, 0.62624835, 0.42423199, ..., 1.        , 0.60221873,
        0.59543514],
       [0.5686535 , 0.78809872, 0.59708931, ..., 0.60221873, 1.        ,
        0.76439016],
       [0.65156802, 0.83196338, 0.64964126, ..., 0.59543514, 0.76439016,
        1.        ]])

In [32]:
cosine_sim.shape

(20041, 20041)

In [21]:
def recommend_courses(course_name, instructor, top_n=5):
    input_str = f"{course_name.strip()} with {instructor.strip()}".lower()
    threshold_df['normalized_text'] = threshold_df['text_features'].str.lower().str.strip()
    idx_list = threshold_df[threshold_df['normalized_text'] == input_str].index

    if len(idx_list) == 0:
        return "Course not found."

    idx = idx_list[0]
    sim_scores = cosine_sim[idx]
    top_indices = sim_scores.argsort()[-top_n-1:-1][::-1]
    return df.iloc[top_indices][['course_name','instructor','course_duration_hours','certification_offered','difficulty_level',	'rating','enrollment_numbers','course_price','feedback_score','study_material_available','time_spent_hours','previous_courses_taken','course_images','instructor_images']]

In [22]:
threshold_df.sample(2)

Unnamed: 0,user_id,course_id,course_name,instructor,course_duration_hours,certification_offered,difficulty_level,rating,enrollment_numbers,course_price,feedback_score,study_material_available,time_spent_hours,previous_courses_taken,course_images,instructor_images,text_features
3047,30315,946,Advanced Machine Learning,Dr. Robert Davis,69.1,No,Advanced,2.4,45391,84.32,0.774,Yes,23.55,4,https://images.unsplash.com/photo-162071294354...,https://images.unsplash.com/photo-1545167622-3...,Advanced Machine Learning with Dr. Robert Davis
82875,35444,6821,AI for Business Leaders,Alexander Young,58.7,No,Intermediate,5.0,42362,287.39,0.747,No,43.29,5,https://images.pexels.com/photos/8438974/pexel...,https://images.unsplash.com/photo-150064876779...,AI for Business Leaders with Alexander Young


In [23]:
recommend_courses('Networking and System Administration', 'Dr. Mia Walker', top_n=5)

Unnamed: 0,course_name,instructor,course_duration_hours,certification_offered,difficulty_level,rating,enrollment_numbers,course_price,feedback_score,study_material_available,time_spent_hours,previous_courses_taken,course_images,instructor_images
19641,Stock Market and Trading Strategies,Olivia Taylor,65.7,Yes,Beginner,2.8,2313,235.75,0.855,Yes,25.9,5,https://images.unsplash.com/photo-161197478985...,https://images.pexels.com/photos/1326946/pexel...
8580,Mobile App Development with Swift,Ethan Hall,25.0,Yes,Beginner,5.0,32754,418.33,0.395,No,1.0,2,https://images.unsplash.com/photo-163335612254...,https://images.pexels.com/photos/1043471/pexel...
13475,Data Visualization with Tableau,Sophia Anderson,49.9,No,Advanced,4.2,19781,33.45,0.747,No,19.63,7,https://images.pexels.com/photos/265087/pexels...,https://images.unsplash.com/photo-143876168103...
2582,Python for Beginners,William Thomas,62.0,No,Advanced,3.9,17005,365.2,0.567,Yes,1.0,3,https://images.unsplash.com/photo-152637909509...,https://images.unsplash.com/photo-1557862921-3...
18084,AI for Business Leaders,David Wilson,32.4,Yes,Beginner,4.0,15112,30.74,0.672,Yes,25.85,4,https://images.pexels.com/photos/8438974/pexel...,https://images.unsplash.com/photo-150700321116...


In [25]:
recommend_courses('AI for Business Leaders', 'David Wilson', top_n=5)

Unnamed: 0,course_name,instructor,course_duration_hours,certification_offered,difficulty_level,rating,enrollment_numbers,course_price,feedback_score,study_material_available,time_spent_hours,previous_courses_taken,course_images,instructor_images
75,Fundamentals of Digital Marketing,Sophia Anderson,18.0,Yes,Advanced,4.8,597,138.91,0.71,Yes,48.81,4,https://images.unsplash.com/photo-1551288049-b...,https://images.unsplash.com/photo-143876168103...
5643,Cloud Computing Essentials,Prof. Emily Johnson,72.2,Yes,Intermediate,3.3,5600,274.96,0.284,Yes,24.37,4,https://images.pexels.com/photos/19867468/pexe...,https://images.pexels.com/photos/38554/girl-pe...
18135,Personal Finance and Wealth Building,James Clark,30.1,No,Beginner,4.3,49039,236.21,0.848,Yes,11.56,6,https://images.unsplash.com/photo-1554224155-6...,https://images.pexels.com/photos/1222271/pexel...
1451,DevOps and Continuous Deployment,Sarah Lee,43.0,Yes,Beginner,4.7,15141,171.02,0.716,Yes,47.66,4,https://images.pexels.com/photos/270404/pexels...,https://images.unsplash.com/photo-152450438894...
710,Game Development with Unity,Charlotte King,11.3,Yes,Beginner,4.3,12067,333.41,0.733,Yes,1.5,5,https://images.unsplash.com/photo-1542751371-a...,https://images.pexels.com/photos/774909/pexels...
