In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('Datasets/online_courses_updated.csv')
df = df.drop(columns=['Unnamed: 0'])

In [3]:
max_enrollments, min_enrollments = df['enrollment_numbers'].max(), df['enrollment_numbers'].min()
threshold_score = max_enrollments * 0.80
threshold_score

39999.200000000004

In [4]:
df = df[df['enrollment_numbers']>threshold_score]
df.shape

(20041, 16)

In [5]:
df['course_name'].unique(), df['course_name'].nunique(), df['instructor'].unique(), df['instructor'].nunique()

(array(['Project Management Fundamentals',
        'Networking and System Administration',
        'Photography and Video Editing', 'Python for Beginners',
        'Fitness and Nutrition Coaching', 'Graphic Design with Canva',
        'Data Visualization with Tableau', 'Advanced Machine Learning',
        'Stock Market and Trading Strategies',
        'Cybersecurity for Professionals',
        'DevOps and Continuous Deployment',
        'Mobile App Development with Swift',
        'Personal Finance and Wealth Building',
        'Game Development with Unity', 'AI for Business Leaders',
        'Ethical Hacking Masterclass', 'Cloud Computing Essentials',
        'Public Speaking Mastery',
        'Blockchain and Decentralized Applications',
        'Fundamentals of Digital Marketing'], dtype=object),
 20,
 array(['Benjamin Lewis', 'Dr. Robert Davis', 'Daniel White',
        'Charlotte King', 'Prof. Emily Johnson', 'James Clark',
        'Olivia Taylor', 'Michael Brown', 'Jessica Martinez

In [6]:
df['text_features'] = df['course_name'] + ' with ' + df['instructor']
df['text_features'] = df['text_features'].str.lower().str.strip()

In [7]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text_features'])

In [8]:
collab_features = ['rating', 'course_price', 'feedback_score', 'time_spent_hours']
scaler = MinMaxScaler()
numeric_matrix = scaler.fit_transform(df[collab_features])

In [9]:
from scipy.sparse import hstack
combined_features = hstack([tfidf_matrix, numeric_matrix])
combined_features = combined_features.tocsr()

In [10]:
combined_features

<20041x104 sparse matrix of type '<class 'numpy.float64'>'
	with 182308 stored elements in Compressed Sparse Row format>

In [11]:
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(combined_features)

In [22]:
def recommend_courses_1st(course_name, instructor, top_n=5):
    input_str = f"{course_name.strip()} with {instructor.strip()}".lower()
    ## match the course
    match = df[df['text_features'] == input_str]
    if match.empty:
        return "Course not found."

    idx = match.index[0]
    distances, indices = knn.kneighbors(combined_features[idx], n_neighbors=len(df))

    ## Flatten and remove self
    recommended_indices = indices.flatten()
    recommended_indices = recommended_indices[recommended_indices != idx]

    ## Build unique recommendations by course name + instructor
    seen = set()
    unique_recommendations = []
    for rec_idx in recommended_indices:
        row = df.iloc[rec_idx]
        key = (row['course_name'].strip().lower(), row['instructor'].strip().lower())
        if key != (course_name.strip().lower(), instructor.strip().lower()) and key not in seen:
            seen.add(key)
            unique_recommendations.append(row)
        if len(unique_recommendations) == top_n:
            break

    rec_df =  pd.DataFrame(unique_recommendations)[[
        'course_name','instructor','course_duration_hours','certification_offered','difficulty_level',	'rating','enrollment_numbers','course_price','feedback_score','study_material_available','time_spent_hours','previous_courses_taken','course_images','instructor_images'
    ]]
    return rec_df.sort_values(by='rating', ascending=False).head(top_n)


def recommend_courses_2nd(course_name, instructor, top_n=5):
    # Normalize input string
    input_str = f"{course_name.strip()} with {instructor.strip()}".lower()

    # Match the course in the DataFrame
    match = df[df['text_features'] == input_str]
    if match.empty:
        return "Course not found."

    # Get index of the matched course
    idx = match.index[0]

    # Compute KNN distances (ensure 2D input by slicing)
    distances, indices = knn.kneighbors(combined_features[idx:idx+1], n_neighbors=len(df))

    # Flatten and exclude the course itself
    recommended_indices = indices.flatten()
    recommended_indices = recommended_indices[recommended_indices != idx]

    # Collect unique recommendations
    seen = set()
    unique_recommendations = []
    for rec_idx in recommended_indices:
        row = df.iloc[rec_idx]
        key = (row['course_name'].strip().lower(), row['instructor'].strip().lower())

        if key != (course_name.strip().lower(), instructor.strip().lower()) and key not in seen:
            seen.add(key)
            unique_recommendations.append(row)

        if len(unique_recommendations) == top_n:
            break

    # Build DataFrame from recommendations
    rec_df = pd.DataFrame(unique_recommendations)[[
        'course_name',
        'instructor',
        'course_duration_hours',
        'certification_offered',
        'difficulty_level',
        'rating',
        'enrollment_numbers',
        'course_price',
        'feedback_score',
        'study_material_available',
        'time_spent_hours',
        'previous_courses_taken',
        'course_images',
        'instructor_images'
    ]]

    return rec_df.sort_values(by='rating', ascending=False).head(top_n)


def recommend_courses_3rd(course_name, instructor, top_n=5):
    input_str = f"{course_name.strip()} with {instructor.strip()}".lower()
    
    # Match the input course
    match = df[df['text_features'] == input_str]
    if match.empty:
        return "Course not found."

    idx = match.index[0]

    # Get distances and indices from the KNN model
    distances, indices = knn.kneighbors(combined_features[idx], n_neighbors=len(df))

    # Flatten and exclude the input course itself
    recommended_indices = [i for i in indices.flatten() if i != idx]

    # Track seen course names and instructors
    seen_course_names = set([course_name.strip().lower()])
    seen_instructors = set([instructor.strip().lower()])

    # Build recommendations
    unique_recommendations = []
    for rec_idx in recommended_indices:
        row = df.iloc[rec_idx]
        course = row['course_name'].strip().lower()
        teacher = row['instructor'].strip().lower()

        if course not in seen_course_names and teacher not in seen_instructors:
            seen_course_names.add(course)
            seen_instructors.add(teacher)
            unique_recommendations.append(row)

        if len(unique_recommendations) == top_n:
            break

    if not unique_recommendations:
        return "No unique recommendations found."

    # Convert to DataFrame
    rec_df = pd.DataFrame(unique_recommendations)[[
        'course_name','instructor','course_duration_hours','certification_offered','difficulty_level',
        'rating','enrollment_numbers','course_price','feedback_score','study_material_available',
        'time_spent_hours','previous_courses_taken','course_images','instructor_images'
    ]]

    return rec_df.sort_values(by='rating', ascending=False).head(top_n)


In [20]:
recommend_courses_1st('Mobile App Development with Swift', 'Sophia Anderson', top_n=5)

Unnamed: 0,course_name,instructor,course_duration_hours,certification_offered,difficulty_level,rating,enrollment_numbers,course_price,feedback_score,study_material_available,time_spent_hours,previous_courses_taken,course_images,instructor_images
44177,Graphic Design with Canva,Alexander Young,10.5,No,Intermediate,5.0,49439,178.31,1.0,Yes,47.55,4,https://images.unsplash.com/photo-1547658719-d...,https://images.unsplash.com/photo-150064876779...
55986,Graphic Design with Canva,Jessica Martinez,30.5,Yes,Beginner,5.0,47500,338.14,0.897,Yes,29.01,8,https://images.unsplash.com/photo-1547658719-d...,https://images.pexels.com/photos/1181686/pexel...
4523,Graphic Design with Canva,Ethan Hall,58.3,Yes,Beginner,4.9,46747,237.21,0.822,Yes,30.02,7,https://images.unsplash.com/photo-1547658719-d...,https://images.pexels.com/photos/1043471/pexel...
8887,Graphic Design with Canva,James Clark,22.2,Yes,Advanced,4.7,41206,250.9,0.976,Yes,26.32,4,https://images.unsplash.com/photo-1547658719-d...,https://images.pexels.com/photos/1222271/pexel...
13463,Graphic Design with Canva,Sophia Anderson,41.1,Yes,Advanced,4.7,49332,272.64,0.89,Yes,28.15,6,https://images.unsplash.com/photo-1547658719-d...,https://images.unsplash.com/photo-143876168103...


In [26]:
recommend_courses_2nd('Graphic Design with Canva', 'Sophia Anderson', top_n=5)

Unnamed: 0,course_name,instructor,course_duration_hours,certification_offered,difficulty_level,rating,enrollment_numbers,course_price,feedback_score,study_material_available,time_spent_hours,previous_courses_taken,course_images,instructor_images
35808,Fitness and Nutrition Coaching,Dr. Mia Walker,30.9,Yes,Intermediate,3.8,41200,485.3,0.643,Yes,14.69,3,https://images.unsplash.com/photo-157101961345...,https://images.pexels.com/photos/733872/pexels...
30422,Fitness and Nutrition Coaching,Dr. John Smith,97.1,Yes,Intermediate,3.8,46114,496.12,0.717,Yes,19.51,2,https://images.unsplash.com/photo-157101961345...,https://images.pexels.com/photos/428333/pexels...
64361,Fitness and Nutrition Coaching,Dr. Robert Davis,39.2,Yes,Beginner,3.6,47764,468.14,0.739,Yes,12.83,6,https://images.unsplash.com/photo-157101961345...,https://images.unsplash.com/photo-1545167622-3...
5208,Fitness and Nutrition Coaching,Sarah Lee,37.5,Yes,Beginner,3.6,41330,497.27,0.69,Yes,11.53,4,https://images.unsplash.com/photo-157101961345...,https://images.unsplash.com/photo-152450438894...
35507,Fitness and Nutrition Coaching,Liam Adams,72.1,Yes,Intermediate,3.3,43222,495.11,0.788,Yes,9.59,5,https://images.unsplash.com/photo-157101961345...,https://images.pexels.com/photos/220453/pexels...
