In [2]:
!pip install thefuzz

Collecting thefuzz
  Using cached thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.14.3-cp313-cp313-win_amd64.whl.metadata (12 kB)
Using cached thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.14.3-cp313-cp313-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 907.7 kB/s eta 0:00:02
   ------------- -------------------------- 0.5/1.5 MB 907.7 kB/s eta 0:00:02
   -------------------- ------------------- 0.8/1.5 MB 642.6 kB/s eta 0:00:02
   -------------------- ------------------- 0.8/1.5 MB 642.6 kB/s eta 0:00:02
   -------------------- ------------------- 0.8/1.5 MB 642.6 kB/s eta 0:00:02
   -----

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from thefuzz import process  


In [4]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('punkt')      # tokenizer models — presumably for nltk.word_tokenize; confirm
nltk.download('wordnet')    # WordNet data — presumably backing WordNetLemmatizer; confirm
nltk.download("punkt_tab")  # NOTE(review): newer NLTK releases appear to need punkt_tab rather than punkt — confirm
nltk.download("stopwords")  # corpus read by stopwords.words("english") in a later cell


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
# Built once at notebook level; clean_text reads both of these globals.
stop_words = set(stopwords.words("english"))  # set -> fast `in` checks per token
lemmatizer = WordNetLemmatizer()

In [6]:
def clean_text(text):
    """Normalize a string into space-separated lemmas.

    Steps, in order: lowercase the text, replace every character that is not
    an ASCII letter or a space with a space, tokenize with
    ``nltk.word_tokenize``, drop tokens present in the notebook-level
    ``stop_words`` set, and lemmatize what is left with the notebook-level
    ``lemmatizer``.

    Args:
        text: The string to clean (``.lower()`` is called on it directly).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    letters_only = re.sub(r"[^a-zA-Z ]", " ", text.lower())
    kept = (tok for tok in nltk.word_tokenize(letters_only) if tok not in stop_words)
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept)

In [9]:
# Load the course catalogue.
df = pd.read_csv("../dataset/coursera_course_dataset_v3.csv")
# pandas labels an unnamed index column "Unnamed: 0". The 'unnamed:_0'
# spelling only matches if headers were normalized beforehand, which this
# notebook does not do, so drop either variant. errors='ignore' keeps this a
# no-op when neither column is present.
df = df.drop(columns=['Unnamed: 0', 'unnamed:_0'], errors='ignore')

Text Columns

In [10]:
# Text columns that are cleaned here and later concatenated into combined_text.
text_cols = ['Title', 'Organization', 'Skills', 'course_description', 'Difficulty', 'Type', 'Duration']

for col in text_cols:
    # Missing values become "", the stray "Ã‚" artefact and outer whitespace
    # are removed, then clean_text normalizes what remains.
    df[col] = (
        df[col]
        .fillna('')
        .astype(str)
        .str.replace("Ã‚", "", regex=False)
        .str.strip()
        .apply(clean_text)
    )

Numeric Columns


In [11]:
# For each numeric column: parse to a number (unparseable values become NaN),
# then replace NaN with the column median. The two count columns have commas
# removed before parsing.
df['course_students_enrolled'] = pd.to_numeric(
    df['course_students_enrolled'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
df['course_students_enrolled'] = df['course_students_enrolled'].fillna(
    df['course_students_enrolled'].median()
)

df['Ratings'] = df['Ratings'].pipe(pd.to_numeric, errors='coerce')
df['Ratings'] = df['Ratings'].fillna(df['Ratings'].median())

df['Review Count'] = pd.to_numeric(
    df['Review Count'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
df['Review Count'] = df['Review Count'].fillna(
    df['Review Count'].median()
)

Combining the text columns into one document per course (bag-of-words input for TF-IDF)

In [12]:
# One space-separated document per course: Title first, then the remaining
# text columns in a fixed order.
df['combined_text'] = df['Title'].str.cat(
    [
        df[name]
        for name in (
            'Organization',
            'Skills',
            'course_description',
            'Difficulty',
            'Type',
            'Duration',
        )
    ],
    sep=" ",
)

TF-IDF vectorization for content-based filtering (CBF)

In [13]:
# Vectorize each course's combined text with TF-IDF; rows follow df's row order.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_text'])
# Pairwise cosine similarity between the rows of tfidf_matrix. cbf_recommend
# later indexes this matrix by row position (sim_matrix[idx]).
cosine_sim = cosine_similarity(tfidf_matrix)

In [14]:
def find_course_index(course_title, df, threshold=50):
    """Fuzzy-match a course title and return the matching row's position.

    Args:
        course_title: Free-text query to match against ``df['Title']``.
        df: DataFrame with a ``'Title'`` column.
        threshold: Minimum fuzzy-match score; matches scoring lower are
            rejected.

    Returns:
        The 0-based row position of the first row whose title equals the best
        match, or ``None`` when ``df`` has no titles or the best score is
        below ``threshold``. A position (not an index label) is returned
        because the caller uses it with ``df.iloc`` and as a row number into
        the similarity matrix; under the default RangeIndex the two coincide.
    """
    titles = df['Title'].tolist()
    best = process.extractOne(course_title, titles)
    # extractOne yields None when there is nothing to choose from; unpacking
    # it directly would raise TypeError.
    if best is None:
        return None
    match, score = best
    if score < threshold:
        return None
    return titles.index(match)

Recommender


In [15]:
def cbf_recommend(course_title, df, sim_matrix, top_n=5):
    """Recommend the courses most similar to the one matching ``course_title``.

    Args:
        course_title: Free-text query, resolved to a row via
            ``find_course_index``.
        df: DataFrame containing at least the ``'Title'``, ``'Organization'``,
            ``'Skills'`` and ``'Difficulty'`` columns.
        sim_matrix: Square similarity matrix indexed by row position, where
            ``sim_matrix[i][j]`` is the similarity of rows ``i`` and ``j``.
        top_n: Maximum number of recommendations; values below zero are
            treated as zero.

    Returns:
        A DataFrame with the ``'Title'``, ``'Organization'``, ``'Skills'`` and
        ``'Difficulty'`` columns of the most similar rows, most similar first,
        or the string ``"Course not found!"`` when no course matches.
    """
    idx = find_course_index(course_title, df)
    if idx is None:
        return "Course not found!"

    # Exclude the query course by position rather than assuming it sorts
    # first: a duplicate or tied row at a lower position would otherwise push
    # the query course itself into the results.
    candidates = [
        (pos, score) for pos, score in enumerate(sim_matrix[idx]) if pos != idx
    ]
    # list.sort is stable, so ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_positions = [pos for pos, _ in candidates[:max(top_n, 0)]]

    return df.iloc[top_positions][[
        'Title', 'Organization', 'Skills', 'Difficulty'
    ]]


For Testing 

In [16]:
# The query need not be an exact title: find_course_index fuzzy-matches it
# against df['Title'].
course_title = "Back End"

print("-> Content-Based Filtering Recommendations:")
print(cbf_recommend(course_title, df, cosine_sim))

-> Content-Based Filtering Recommendations:
                                  Title Organization  \
11    ibm full stack software developer          ibm   
57       ibm applied devops engineering          ibm   
9       ibm devops software engineering          ibm   
59              ibm front end developer          ibm   
50  ibm full stack javascript developer          ibm   

                                               Skills    Difficulty  
11  cloud computing python programming cloud appli...      beginner  
57  devops software engineering cloud computing co...  intermediate  
9   devops software engineering cloud computing co...      beginner  
59  cloud application software engineering compute...      beginner  
50  software engineering computer programming soft...      beginner  
