In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# Load dataset
data_path = r'C:\Users\VarshaPriyadarshini\Desktop\Final project\DataScience\CourseraDataset.csv'
df = pd.read_csv(data_path)

# Feature extraction: TF-IDF on course titles and skills gained
tfidf_vectorizer = TfidfVectorizer()
title_tfidf = tfidf_vectorizer.fit_transform(df['Course Title'])
skill_tfidf = tfidf_vectorizer.fit_transform(df['Skill gain'])

# Check available columns
print(df.columns)

# Assuming the actual name of the 'Level Encoded' column needs to be verified and possibly updated
feature_columns = ['Rating', 'Duration to complete (Approx.)', 'Number of Review']  # Update as needed

# Combine TF-IDF features with existing features
X = pd.concat([pd.DataFrame(title_tfidf.toarray()), pd.DataFrame(skill_tfidf.toarray()), df[feature_columns]], axis=1)
y = df['Next Course ID']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)

# Function to recommend courses based on input
def recommend_next_courses(course_title, skill_gain, n_recommendations=3):
    # Process input course title and skill gain
    input_data = pd.DataFrame({
        'Course Title': [course_title],
        'Skill gain': [skill_gain]
    })
    
    # Transform input data with TF-IDF
    title_tfidf = tfidf_vectorizer.transform(input_data['Course Title'])
    skill_tfidf = tfidf_vectorizer.transform(input_data['Skill gain'])
    
    # Combine with numerical features (you can add dummy values for rating, duration, etc. if needed)
    input_features = pd.concat([pd.DataFrame(title_tfidf.toarray()), pd.DataFrame(skill_tfidf.toarray()), pd.DataFrame([[0, 0, 0]])], axis=1)  # Adjust if necessary
    
    # Scale the input features
    input_features_scaled = scaler.transform(input_features)
    
    # Find the nearest neighbors
    distances, indices = knn_model.kneighbors(input_features_scaled, n_neighbors=n_recommendations + 1)  # +1 to skip the input course
    
    # Get recommended course IDs (excluding the first one)
    recommended_course_ids = df['Course ID'].iloc[indices[0][1:n_recommendations + 1]].values
    
    # Retrieve course titles for recommended course IDs
    recommended_courses = df[df['Course ID'].isin(recommended_course_ids)]['Course Title'].values
    
    return recommended_courses

# Example usage
recommended_courses = recommend_next_courses('Data Structures', 'Understanding algorithms', n_recommendations=3)
print("Recommended courses:")
for course in recommended_courses:
    print(course)


Index(['Course Title', 'Rating', 'Level', 'Schedule', 'What you will learn',
       'Skill gain', 'Modules', 'Instructor', 'Offered By', 'Keyword',
       'Course Url', 'Duration to complete (Approx.)', 'Number of Review'],
      dtype='object')


KeyError: 'Next Course ID'