In [1]:
#import libraries
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
#load datasets
df = pd.read_csv('coursera_courses.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5411 entries, 0 to 5410
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             5411 non-null   object
 1   name            5411 non-null   object
 2   category        5411 non-null   object
 3   what_you_learn  3005 non-null   object
 4   skills          5411 non-null   object
 5   language        5411 non-null   object
 6   instructors     5411 non-null   object
 7   content         5411 non-null   object
dtypes: object(8)
memory usage: 338.3+ KB


In [3]:
df.head(3)

Unnamed: 0,url,name,category,what_you_learn,skills,language,instructors,content
0,https://www.coursera.org/learn/-network-security,Network Security,Information Technology,,"Computer Networking, Network Planning And Desi...",English,['~31081695'],Welcome to course 4 of 5 of this Specializatio...
1,https://www.coursera.org/learn/-security-princ...,Security Principles,Information Technology,,"Cyber Security Policies, Data Integrity, Cyber...",English,['~31081695'],Welcome to course 1 of 5 of this Specializatio...
2,https://www.coursera.org/learn/21st-century-en...,21st Century Energy Transition: how do we make...,Physical Science And Engineering,Understand the complexity of systems supplying...,"Electric Power Systems, Environmental Policy, ...",English,['brad-hayes'],NOTE: “21 st Century Energy Transition – How d...


In [6]:
# Define columns to fill
cols = ['name', 'category', 'what_you_learn', 'skills', 'language', 'instructors', 'content']

# Fill missing values for selected columns
df[cols] = df[cols].fillna('')

# Drop rows where name is missing
df = df.dropna(subset=['name'])

df.head()


Unnamed: 0,url,name,category,what_you_learn,skills,language,instructors,content
0,https://www.coursera.org/learn/-network-security,Network Security,Information Technology,,"Computer Networking, Network Planning And Desi...",English,['~31081695'],Welcome to course 4 of 5 of this Specializatio...
1,https://www.coursera.org/learn/-security-princ...,Security Principles,Information Technology,,"Cyber Security Policies, Data Integrity, Cyber...",English,['~31081695'],Welcome to course 1 of 5 of this Specializatio...
2,https://www.coursera.org/learn/21st-century-en...,21st Century Energy Transition: how do we make...,Physical Science And Engineering,Understand the complexity of systems supplying...,"Electric Power Systems, Environmental Policy, ...",English,['brad-hayes'],NOTE: “21 st Century Energy Transition – How d...
3,https://www.coursera.org/learn/360-vr-video-pr...,VR and 360 Video Production,Arts And Humanities,,"Virtual Reality, Videography, Media Production...",English,['googlearvr'],Welcome to the Google AR & VR Virtual Reality ...
4,https://www.coursera.org/learn/3d-anatomy-phys...,Foundations of Human Anatomy and Physiology,Health,Learners will understand how body structure su...,"Vital Signs, Basic Patient Care, Anatomy, Heal...",English,"['~167016541', '~166856472', '~166856442', '~1...",This course provides a foundational understand...


In [20]:
df['text'] = (
    df['name'] + ' ' +
    df['category'] + ' ' +
    df['skills'] + ' ' +
    df['what_you_learn'] + ' ' +
    df['content'] + ' ' +
    df['language'] + ' ' +
    df['instructors']
)

df[['name', 'text']].head()


Unnamed: 0,name,text
0,Network Security,Network Security Information Technology Comput...
1,Security Principles,Security Principles Information Technology Cyb...
2,21st Century Energy Transition: how do we make...,21st Century Energy Transition: how do we make...
3,VR and 360 Video Production,VR and 360 Video Production Arts And Humanitie...
4,Foundations of Human Anatomy and Physiology,Foundations of Human Anatomy and Physiology He...


In [21]:
df.head()

Unnamed: 0,name,category,what_you_learn,skills,language,instructors,content,text
0,Network Security,Information Technology,,"Computer Networking, Network Planning And Desi...",English,['~31081695'],Welcome to course 4 of 5 of this Specializatio...,Network Security Information Technology Comput...
1,Security Principles,Information Technology,,"Cyber Security Policies, Data Integrity, Cyber...",English,['~31081695'],Welcome to course 1 of 5 of this Specializatio...,Security Principles Information Technology Cyb...
2,21st Century Energy Transition: how do we make...,Physical Science And Engineering,Understand the complexity of systems supplying...,"Electric Power Systems, Environmental Policy, ...",English,['brad-hayes'],NOTE: “21 st Century Energy Transition – How d...,21st Century Energy Transition: how do we make...
3,VR and 360 Video Production,Arts And Humanities,,"Virtual Reality, Videography, Media Production...",English,['googlearvr'],Welcome to the Google AR & VR Virtual Reality ...,VR and 360 Video Production Arts And Humanitie...
4,Foundations of Human Anatomy and Physiology,Health,Learners will understand how body structure su...,"Vital Signs, Basic Patient Care, Anatomy, Heal...",English,"['~167016541', '~166856472', '~166856442', '~1...",This course provides a foundational understand...,Foundations of Human Anatomy and Physiology He...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=20000)
tfidf_matrix = tfidf.fit_transform(df['text'])



(5411, 20000)

In [23]:
#cosine similarity
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [24]:
#index mapping
df['name_lower'] = df['name'].str.lower()
indices = pd.Series(df.index, index=df['name_lower']).drop_duplicates()


In [53]:
# function for course similarity
def recommend_similar_courses(course_name, n=10):
    course_name = course_name.lower()
    
    if course_name not in indices:
        print("Course not found. Please check the name.")
        return pd.DataFrame()
    
    idx = indices[course_name]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # skip the first one (same course)
    sim_scores = sim_scores[1:n+1]
    
    course_indices = [i[0] for i in sim_scores]
    
    return df[['name', 'category', 'skills', 'language',]].iloc[course_indices]


In [54]:
# testing
recommend_similar_courses("networksecurity", n=5)


Course not found. Please check the name.
