In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
courses_dataset_path = r"C:\Users\sudhe\Projects\project_for_moeny\data\courses_dataset.csv"
courses_dataset = pd.read_csv(courses_dataset_path)
courses_dataset

Unnamed: 0,Student ID,Interests,Career Goals,Matched Courses
0,1,Chemistry,Computer Engineering,"Chemical & Biochemical Engineering, Cybersecurity"
1,2,Computer Architecture,Computer Science,"Computer Engineering, Cybersecurity"
2,3,Computer Architecture,Engineering Management,"Computer Engineering, Cybersecurity"
3,4,Network Security,Cybersecurity,Cybersecurity
4,5,Strategic Planning,Information Systems,"Engineering Management, Software Engineering"
...,...,...,...,...
1995,1996,Health Informatics,Computer Science,"Health Information Technology, Human-Centered ..."
1996,1997,Statistics,Health Information Technology,"Data Science, Computer Science"
1997,1998,Artificial Intelligence,Engineering Management,"Computer Science, Health Information Technology"
1998,1999,Environmental Science,Engineering Management,"Environmental Engineering, Computer Engineering"


In [3]:
print(courses_dataset['Career Goals'].nunique())
print(courses_dataset['Interests'].nunique())
print(courses_dataset['Matched Courses'].nunique())

12
33
144


### Cosine-similarity Approach

In [11]:
# Combine 'Interests' and 'Career Goals' into one string per student
courses_dataset['profile_text'] = courses_dataset['Interests'] + " " + courses_dataset['Career Goals']

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
profile_vectors = vectorizer.fit_transform(courses_dataset['profile_text'])
course_vectors = vectorizer.fit_transform(courses_dataset['Matched Courses'])


In [12]:
# Calculate cosine similarity between each student profile and course description
similarity_matrix = cosine_similarity(profile_vectors, course_vectors)

# Generate course recommendations based on highest similarity
top_course_indices = similarity_matrix.argsort(axis=1)[:, -1:]  # Top 3 courses for each student
course_recommendations = [[courses_dataset['Matched Courses'].iloc[idx] for idx in reversed(indices)] for indices in top_course_indices]

courses_dataset['Recommended Courses'] = course_recommendations
courses_dataset[['Interests', 'Career Goals', 'Recommended Courses']]

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 60 while Y.shape[1] == 18

### Inference

In [8]:
def recommend_courses(interests, career_goals, vectorizer, course_df, course_vectors):
    # Combine the student's interests and career goals into a single profile text
    profile_text = interests + " " + career_goals
    
    # Vectorize the new student profile using the already fitted vectorizer
    profile_vector = vectorizer.transform([profile_text])
    
    # Calculate cosine similarity between the student profile and all course descriptions
    similarity_scores = cosine_similarity(profile_vector, course_vectors)
    
    # Get the indices of the top 3 courses
    top_indices = similarity_scores.argsort()[0][-1:]
    
    # Retrieve the course names based on these indices
    recommended_courses = [course_df['Matched Courses'].iloc[index] for index in reversed(top_indices)]
    
    return recommended_courses


In [9]:
# Example student data
new_student_interests = "Health Informatics"
new_student_career_goals = "Computer Science"

# Get course recommendations
recommendations = recommend_courses(new_student_interests, new_student_career_goals, vectorizer, courses_dataset, course_vectors)

print("Recommended Courses for the new student:", recommendations)


Recommended Courses for the new student: ['Health Information Technology, Computer Science']
