In [None]:
!pip install thefuzz



In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from thefuzz import process


In [None]:
# Fetch the NLTK resources this notebook relies on (tokenizer data, WordNet
# for the lemmatizer, and the English stop-word list).
NLTK_RESOURCES = ("punkt", "wordnet", "punkt_tab", "stopwords")
download_status = [nltk.download(resource) for resource in NLTK_RESOURCES]
download_status[-1]  # cell output: status returned by the final download


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Shared text-cleaning resources used by clean_text():
#   lemmatizer - WordNet lemmatizer instance, created once and reused
#   stop_words - English stop words held in a set for fast membership tests
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words("english")}

In [None]:
def clean_text(text):
    """Normalize a piece of text for vectorization.

    Steps, in order: lowercase the text, replace every character that is not
    an ASCII letter or a space with a space, tokenize with NLTK, drop tokens
    found in the module-level ``stop_words`` set, and lemmatize the remaining
    tokens with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw text; must support ``.lower()``.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    letters_only = re.sub(r"[^a-zA-Z ]", " ", text.lower())
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if token not in stop_words
    )

In [None]:
# Load the course catalogue.
df = pd.read_csv("../dataset/coursera_course_dataset_v3.csv")

# Drop a leftover saved-index column if the CSV has one. pandas labels such a
# column 'Unnamed: 0', which the previous literal 'unnamed:_0' never matched,
# so both spellings are normalized before comparing.
index_like_cols = [
    col for col in df.columns
    if str(col).lower().replace(" ", "_").startswith("unnamed:")
]
df = df.drop(columns=index_like_cols)

Text Columns

In [None]:
# Free-text columns that feed the combined document used for vectorization.
text_cols = ['Title', 'Organization', 'Skills', 'course_description', 'Difficulty', 'Type', 'Duration']

# For every text column: fill missing values with '', force str, remove the
# stray "Ã‚" sequence (presumably an encoding artifact - confirm against the
# raw CSV), trim whitespace, then apply clean_text().
for column in text_cols:
    df[column] = (
        df[column]
        .fillna('')
        .astype(str)
        .str.replace("Ã‚", "", regex=False)
        .str.strip()
        .apply(clean_text)
    )

Numeric Columns


In [None]:
# Count-like columns: strip commas, parse as numbers (unparseable values
# become NaN), then fill the gaps with the column median.
for count_col in ('course_students_enrolled', 'Review Count'):
    without_commas = df[count_col].astype(str).str.replace(',', '', regex=False)
    parsed_counts = pd.to_numeric(without_commas, errors='coerce')
    df[count_col] = parsed_counts.fillna(parsed_counts.median())

# Ratings: parsed directly (no comma stripping), same median imputation.
parsed_ratings = pd.to_numeric(df['Ratings'], errors='coerce')
df['Ratings'] = parsed_ratings.fillna(parsed_ratings.median())

Combining Text Columns for TF-IDF and Bag-of-Words (BoW) Vectorization

In [None]:
# Build one space-separated document per course by concatenating the cleaned
# text columns, starting with Title and following the order listed below.
trailing_cols = ['Organization', 'Skills', 'course_description', 'Difficulty', 'Type', 'Duration']
df['combined_text'] = df['Title'].str.cat(df[trailing_cols], sep=" ")

TF-IDF and Count Vectorization for Content-Based Filtering (CBF)

In [None]:
# Fit two document-term representations on the same corpus so their
# recommendations can be compared: TF-IDF weights and raw term counts.
# Both vectorizers apply scikit-learn's built-in English stop-word list.
corpus = df['combined_text']

tfidf_vec = TfidfVectorizer(stop_words='english')
count_vec = CountVectorizer(stop_words='english')

tfidf_matrix = tfidf_vec.fit_transform(corpus)
count_matrix = count_vec.fit_transform(corpus)

In [None]:
def compute_similarity(matrix, metric="cosine"):
    """Compute a pairwise row-to-row similarity matrix.

    Parameters
    ----------
    matrix : scipy sparse matrix or 2-D array-like
        One row per item, one column per feature.
    metric : str, default "cosine"
        One of ``"cosine"``, ``"adjusted_cosine"`` or ``"euclidean"``.
        Case and surrounding whitespace are ignored.

        * ``cosine`` - plain cosine similarity between rows.
        * ``adjusted_cosine`` - cosine similarity after subtracting each
          row's own mean from that row.
        * ``euclidean`` - ``1 / (1 + distance)``, so identical rows score 1
          and the score decreases as the distance grows.

    Returns
    -------
    numpy.ndarray
        Square matrix whose entry ``[i, j]`` is the similarity of rows
        ``i`` and ``j``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    metric = str(metric).strip().lower()

    if metric == "cosine":
        return cosine_similarity(matrix)

    if metric == "adjusted_cosine":
        # Mean-centering destroys sparsity, so densify explicitly instead of
        # relying on implicit sparse-minus-dense broadcasting.
        if hasattr(matrix, "toarray"):
            dense = matrix.toarray()
        else:
            dense = np.asarray(matrix, dtype=float)
        centered = dense - dense.mean(axis=1, keepdims=True)
        return cosine_similarity(centered)

    if metric == "euclidean":
        # euclidean_distances comes from sklearn.metrics.pairwise.
        return 1 / (1 + euclidean_distances(matrix))

    raise ValueError("Invalid similarity metric!")

In [None]:
def get_sim_matrices(metric):
    """Compute similarity matrices for both text representations.

    Parameters
    ----------
    metric : str
        Similarity metric name, forwarded unchanged to compute_similarity().

    Returns
    -------
    tuple
        ``(tfidf_similarity, count_similarity)`` computed from the
        module-level ``tfidf_matrix`` and ``count_matrix`` respectively.
    """
    tfidf_similarity = compute_similarity(tfidf_matrix, metric)
    count_similarity = compute_similarity(count_matrix, metric)
    return tfidf_similarity, count_similarity

In [None]:
def find_course_index(course_title, df, threshold=50):
    """Fuzzy-match a course title and return the row position of the match.

    Parameters
    ----------
    course_title : str
        Title (or approximate title) typed by the user.
    df : pandas.DataFrame
        Course table; must contain a ``'Title'`` column.
    threshold : int, default 50
        Minimum fuzzy-match score for the best candidate to be accepted.

    Returns
    -------
    int or None
        Zero-based row position of the first row whose title equals the best
        match, or ``None`` when no candidate exists or the best score is
        below ``threshold``. A position (not an index label) is returned
        because callers use it to index the similarity matrix and
        ``df.iloc``; for a default RangeIndex the two coincide.
    """
    titles = df['Title'].tolist()

    best = process.extractOne(course_title, titles)
    if best is None:
        # extractOne yields None when there is nothing to match against.
        return None

    match, score = best[0], best[1]
    if score < threshold:
        return None

    # First occurrence of the matched title, as a positional index.
    return titles.index(match)

Recommender


In [None]:
def cbf_recommend(course_title, df, sim_matrix, top_n=5):
    """Recommend the courses most similar to the one matching ``course_title``.

    Parameters
    ----------
    course_title : str
        Title (or approximate title) of the reference course.
    df : pandas.DataFrame
        Course table whose row order matches the rows of ``sim_matrix``.
    sim_matrix : array-like
        Square pairwise similarity matrix (higher means more similar).
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``Title``, ``Organization``, ``Skills`` and ``Difficulty``
        columns of the recommended rows, most similar first; or the string
        ``"Course not found!"`` when no course matches the title.
    """
    idx = find_course_index(course_title, df)
    if idx is None:
        return "Course not found!"

    scores = np.asarray(sim_matrix[idx]).ravel()

    # Descending order; the stable sort keeps tied scores in row order.
    ranked = np.argsort(-scores, kind="stable")

    # Exclude the query course explicitly instead of assuming it sorts first:
    # with tied scores (e.g. duplicate courses) it may not be in position 0.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return df.iloc[ranked][['Title', 'Organization', 'Skills', 'Difficulty']]

For Testing 

In [29]:
# Interactive smoke test: ask for a course title and a similarity metric,
# then print the recommendations from both text representations.
course_title = input("Enter Course Title: ")
metric_choice = input("Choose similarity (cosine / adjusted_cosine / euclidean): ").strip()

tfidf_sim, count_sim = get_sim_matrices(metric_choice)

report_sections = (
    ("\n======================= TF-IDF RESULTS =======================", tfidf_sim),
    ("\n==================== COUNTVECTORIZER RESULTS =====================", count_sim),
)
for banner, similarity in report_sections:
    print(banner)
    print(cbf_recommend(course_title, df, similarity))


                                 Title                 Organization  \
115       learn sql basic data science  university california davis   
14                ibm data engineering                          ibm   
3                     ibm data science                          ibm   
41         ibm data warehouse engineer                          ibm   
369  applied data science data analyst                   databricks   

                                                Skills    Difficulty  
115  database sql data management data analysis big...      beginner  
14   data management database database administrati...      beginner  
3    python programming data science machine learni...      beginner  
41   data management database administration databa...      beginner  
369  machine learning machine learning algorithm al...  intermediate  

                            Title                 Organization  \
3                ibm data science                          ibm   
140   applied

Pickling Code:

In [None]:
import pickle

# Bundle the cleaned data together with both fitted vectorizers and their
# document-term matrices, then serialize the bundle to a single file.
cbf_artifacts = {
    "df": df,
    "tfidf_vec": tfidf_vec,
    "tfidf_matrix": tfidf_matrix,
    "count_vec": count_vec,
    "count_matrix": count_matrix,
}

with open("cbf_model.pkl", "wb") as f:
    pickle.dump(cbf_artifacts, f)

print(" Model and data successfully saved as cbf_model.pkl")
