In [1]:
# Course Recommendation System
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pickle
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
students_df = pd.read_csv('data/students.csv')
courses_df = pd.read_csv('data/courses.csv')
enrollments_df = pd.read_csv('data/enrollments.csv')
submissions_df = pd.read_csv('data/submissions.csv')

print(f"Loaded {len(students_df)} students, {len(courses_df)} courses")

# Prepare course features
def prepare_course_features():
    # Fill missing values
    courses_df['description'] = courses_df['description'].fillna('')
    courses_df['department_name'] = courses_df['department_name'].fillna('General')
    
    # Create text features
    courses_df['text_features'] = (
        courses_df['title'] + ' ' + 
        courses_df['description'] + ' ' + 
        courses_df['department_name']
    )
    
    # TF-IDF vectorization
    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    course_features = tfidf.fit_transform(courses_df['text_features'])
    
    return course_features, tfidf

# Create user-course interaction matrix
def create_interaction_matrix():
    # Convert completion status to numeric first
    status_map = {'not_started': 1, 'in_progress': 2, 'completed': 3}
    enrollments_df['completion_numeric'] = enrollments_df['completion_status'].map(status_map)
    
    # Create user-course matrix with numeric values
    user_course_matrix = enrollments_df.pivot_table(
        index='student_id', 
        columns='course_id', 
        values='completion_numeric',
        aggfunc='max',  # Use max in case of duplicate entries
        fill_value=0
    )
    
    # Add grade information
    if not submissions_df.empty:
        avg_grades = submissions_df.groupby(['student_id', 'course_id'])['grade'].mean().reset_index()
        for _, row in avg_grades.iterrows():
            student_id, course_id, grade = row['student_id'], row['course_id'], row['grade']
            if student_id in user_course_matrix.index and course_id in user_course_matrix.columns:
                # Boost score based on grade (grade/20 to normalize to 0-5 scale)
                user_course_matrix.loc[student_id, course_id] += grade / 20
    
    return user_course_matrix

# Collaborative filtering
def collaborative_filtering(user_course_matrix, target_student_id, n_recommendations=5):
    if target_student_id not in user_course_matrix.index:
        return []
    
    # Calculate user similarities
    user_similarity = cosine_similarity(user_course_matrix)
    user_similarity_df = pd.DataFrame(
        user_similarity, 
        index=user_course_matrix.index, 
        columns=user_course_matrix.index
    )
    
    # Find similar users
    similar_users = user_similarity_df[target_student_id].sort_values(ascending=False)[1:11]
    
    # Get courses liked by similar users
    target_user_courses = set(user_course_matrix.loc[target_student_id]
                             [user_course_matrix.loc[target_student_id] > 0].index)
    
    recommendations = {}
    
    for similar_user, similarity_score in similar_users.items():
        similar_user_courses = user_course_matrix.loc[similar_user]
        for course_id, rating in similar_user_courses.items():
            if course_id not in target_user_courses and rating > 1:  # Not enrolled and rating > 1
                if course_id not in recommendations:
                    recommendations[course_id] = 0
                recommendations[course_id] += similarity_score * rating
    
    # Sort and return top recommendations
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
    return [course_id for course_id, _ in sorted_recommendations[:n_recommendations]]

# Content-based filtering
def content_based_filtering(course_features, user_course_matrix, target_student_id, n_recommendations=5):
    if target_student_id not in user_course_matrix.index:
        return []
    
    # Get user's enrolled courses
    user_courses = user_course_matrix.loc[target_student_id]
    enrolled_courses = user_courses[user_courses > 0].index.tolist()
    
    if not enrolled_courses:
        return []
    
    # Calculate course similarities
    course_similarity = cosine_similarity(course_features)
    
    # Find similar courses
    similar_courses = {}
    
    for enrolled_course in enrolled_courses:
        if enrolled_course in courses_df['id'].values:
            course_idx = courses_df[courses_df['id'] == enrolled_course].index[0]
            similarities = course_similarity[course_idx]
            
            for idx, similarity_score in enumerate(similarities):
                course_id = courses_df.iloc[idx]['id']
                if course_id not in enrolled_courses:
                    if course_id not in similar_courses:
                        similar_courses[course_id] = 0
                    similar_courses[course_id] += similarity_score
    
    # Sort and return top recommendations
    sorted_recommendations = sorted(similar_courses.items(), key=lambda x: x[1], reverse=True)
    return [course_id for course_id, _ in sorted_recommendations[:n_recommendations]]

# Hybrid recommendation
def hybrid_recommendations(target_student_id, n_recommendations=3):
    cf_recommendations = collaborative_filtering(user_course_matrix, target_student_id, n_recommendations)
    cb_recommendations = content_based_filtering(course_features, user_course_matrix, target_student_id, n_recommendations)
    
    # Combine recommendations (give equal weight)
    all_recommendations = list(set(cf_recommendations + cb_recommendations))
    
    # If not enough recommendations, add popular courses
    if len(all_recommendations) < n_recommendations:
        enrolled_courses = set(user_course_matrix.loc[target_student_id]
                              [user_course_matrix.loc[target_student_id] > 0].index) if target_student_id in user_course_matrix.index else set()
        
        popular_courses = enrollments_df['course_id'].value_counts().head(10).index.tolist()
        for course_id in popular_courses:
            if course_id not in enrolled_courses and course_id not in all_recommendations:
                all_recommendations.append(course_id)
                if len(all_recommendations) >= n_recommendations:
                    break
    
    return all_recommendations[:n_recommendations]

# Prepare data
print("Preparing course features...")
course_features, tfidf = prepare_course_features()

print("Creating interaction matrix...")
user_course_matrix = create_interaction_matrix()

# Test recommendation
print("Testing recommendations...")
if not user_course_matrix.empty:
    sample_student = user_course_matrix.index[0]
    recommendations = hybrid_recommendations(sample_student, 3)
    print(f"Sample recommendations for student {sample_student}: {recommendations}")

# Save models
print("Saving models...")
models = {
    'course_features': course_features,
    'tfidf': tfidf,
    'user_course_matrix': user_course_matrix,
    'courses_df': courses_df,
    'students_df': students_df,
    'hybrid_recommendations': hybrid_recommendations
}

with open('recommendation_model.pkl', 'wb') as f:
    pickle.dump(models, f)

print("Model saved successfully!")

Loading data...
Loaded 7 students, 16 courses
Preparing course features...
Creating interaction matrix...
Testing recommendations...
Sample recommendations for student 9: [8, 11, 4]
Saving models...
Model saved successfully!
