In [15]:
import pandas as pd
import tensorflow as tf


In [16]:
# Load the raw faculty table from CSV into `df`.
# NOTE(review): this is an absolute, machine-specific path (a Windows user
# folder mounted under /mnt/c), so the cell only runs on the author's machine.
# Consider a configurable DATA_DIR / relative path.
df = pd.read_csv("/mnt/c/Users/vichr/OneDrive/Desktop/ml projects/FFCS_Buddy/faculty.csv")
# Show the first rows as the cell output to sanity-check the load.
df.head()

Unnamed: 0,faculty_id,faculty_name,department,rating,style_tags
0,101.0,Dr. Anand R.,SCOPE,4.3,"project-based,chill,good-notes"
1,102.0,Dr. Priya K.,SCOPE,4.8,"strict,theory-heavy,tough-grader"
2,103.0,Dr. Suresh M.,SELECT,3.9,"helpful,strict"
3,104.0,Dr. Karthik S.,SCOPE,4.1,"engaging,helpful"
4,105.0,Dr. Meena L.,SENSE,4.6,"good-notes,helpful"


In [17]:
from sklearn.preprocessing import MultiLabelBinarizer

In [18]:
def _split_style_tags(raw):
    """Turn one comma-separated tag string into a clean list of tag names."""
    # Missing cells (NaN/None) have no .split(); treat them as "no tags".
    if pd.isna(raw):
        return []
    # Strip padding so "a, b" yields "b" rather than " b", and drop empty
    # pieces so a blank cell or trailing comma does not create a "tag_" column.
    return [tag.strip() for tag in str(raw).split(',') if tag.strip()]


# 1. Convert the 'style_tags' column to a list of tags
df['style_tags_list'] = df['style_tags'].apply(_split_style_tags)

# 2. One-hot encode the tag lists: one 0/1 column per distinct tag
mlb = MultiLabelBinarizer()
encoded_tags = mlb.fit_transform(df['style_tags_list'])

# 3. Create a new DataFrame with the encoded tags.
# The column names are the unique tags prefixed with "tag_".
# Reuse df's index so the axis=1 concat below lines rows up correctly even
# when df does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_tags,
    columns=[f"tag_{cls}" for cls in mlb.classes_],
    index=df.index,
)

# 4. Combine the original data with the new encoded columns
final_df = pd.concat([df.drop(['style_tags', 'style_tags_list'], axis=1), encoded_df], axis=1)

# 5. Save the final, numerically-encoded data to a new CSV file
final_df.to_csv("faculty_encoded.csv", index=False)

In [19]:
# Preview the first rows of the encoded frame (original columns plus tag_* columns).
final_df.head()


Unnamed: 0,faculty_id,faculty_name,department,rating,tag_boring,tag_chill,tag_engaging,tag_fast-paced,tag_good-notes,tag_helpful,tag_lenient-grading,tag_project-based,tag_strict,tag_theory-heavy,tag_tough-grader
0,101.0,Dr. Anand R.,SCOPE,4.3,0,1,0,0,1,0,0,1,0,0,0
1,102.0,Dr. Priya K.,SCOPE,4.8,0,0,0,0,0,0,0,0,1,1,1
2,103.0,Dr. Suresh M.,SELECT,3.9,0,0,0,0,0,1,0,0,1,0,0
3,104.0,Dr. Karthik S.,SCOPE,4.1,0,0,1,0,0,1,0,0,0,0,0
4,105.0,Dr. Meena L.,SENSE,4.6,0,0,0,0,1,1,0,0,0,0,0


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- 1. Load the FINAL, numerically encoded data ---
try:
    # Use the file that already has the 0s and 1s
    faculty_df = pd.read_csv("faculty_encoded.csv")
except FileNotFoundError:
    print("Error: 'faculty_encoded.csv' not found.")
    print("Make sure you have run the 'encode_data.py' script first.")
    # Re-raise instead of calling exit(): in a notebook kernel exit() does not
    # stop the current cell, so execution would fall through to the code below
    # and fail with a confusing NameError on `faculty_df`.
    raise

# --- 2. Prepare for Recommendation ---

# Get the list of all possible tag columns (e.g., 'tag_chill', 'tag_strict', etc.)
tag_columns = [col for col in faculty_df.columns if col.startswith('tag_')]

# Separate the numerical tag data from the rest of the info.
# Rows follow faculty_df's row order; columns follow tag_columns' order.
faculty_tag_matrix = faculty_df[tag_columns].values


# --- 3. The New Recommendation Function ---

def recommend_faculty(user_preferences: list, top_n=5):
    """
    Recommend the top N faculty whose style tags best match the user's preferences.

    The preferences are turned into a 0/1 vector over the module-level
    ``tag_columns``, compared against every row of ``faculty_tag_matrix`` with
    cosine similarity, and the faculty are ranked by similarity (ties broken by
    higher ``rating``).

    The module-level ``faculty_df`` is only read, never modified, so repeated
    calls do not leave a stale ``similarity_score`` column behind.

    Args:
        user_preferences (list): Preferred tag names without the ``tag_``
            prefix (e.g., ['chill', 'project-based']). Names that have no
            matching ``tag_*`` column are ignored.
        top_n (int): The number of recommendations to return.

    Returns:
        pandas.DataFrame: The top N rows with columns ``faculty_name``,
        ``department``, ``rating``, ``style_tags`` (comma-joined tag names
        rebuilt from the encoded columns) and ``similarity_score``.
    """
    # Resolve each tag column's position once instead of scanning the list
    # twice (membership test + .index) for every preference.
    column_positions = {col: idx for idx, col in enumerate(tag_columns)}

    # Build the user's 0/1 profile in the same column order as the faculty matrix.
    user_vector = np.zeros(len(tag_columns))
    for pref in user_preferences:
        col_index = column_positions.get(f"tag_{pref}")
        if col_index is not None:
            user_vector[col_index] = 1

    # cosine_similarity expects 2D inputs, hence the reshape to a single row;
    # flatten() turns the (1, n_faculty) result into one score per faculty row.
    cosine_similarities = cosine_similarity(user_vector.reshape(1, -1), faculty_tag_matrix).flatten()

    # assign() returns a new frame, leaving the shared faculty_df untouched.
    scored = faculty_df.assign(similarity_score=cosine_similarities)

    # Rank by similarity, then rating (both descending), and keep only the rows
    # that will be returned before doing any per-row work.
    top = scored.sort_values(by=['similarity_score', 'rating'], ascending=False).head(top_n)

    # Rebuild a readable 'style_tags' string from the encoded columns.
    # Slice off only the leading "tag_" so a tag that itself contains "tag_"
    # is not mangled.
    prefix_len = len('tag_')
    tag_names = [col[prefix_len:] for col in tag_columns]
    style_tags = [
        ','.join(name for name, flag in zip(tag_names, flags) if flag == 1)
        for flags in top[tag_columns].to_numpy()
    ]
    top = top.assign(style_tags=style_tags)

    return top[['faculty_name', 'department', 'rating', 'style_tags', 'similarity_score']]


# --- 4. Example Usage ---

# Demo of recommend_faculty. In a notebook kernel __name__ is "__main__",
# so this block runs every time the cell is executed.
if __name__ == "__main__":
    # Define a sample user's preferences (tag names without the "tag_" prefix)
    user_prefs = ["project-based", "helpful", "lenient-grading"]
    print(f"Finding recommendations for user with preferences: {user_prefs}\n")

    # Get the recommendations; top_n is left at its default of 5
    top_faculty = recommend_faculty(user_prefs)

    # Print the results. The "Top 5" in the header is hard-coded and matches
    # the default top_n; update it if a different top_n is passed above.
    print("--- Top 5 Recommended Faculty ---")
    print(top_faculty)

Finding recommendations for user with preferences: ['project-based', 'helpful', 'lenient-grading']

--- Top 5 Recommended Faculty ---
         faculty_name department  rating  \
453    Dr. Aadhya Ali      SCOPE     4.9   
449   Dr. Zara Pillai      SCOPE     4.8   
492    Dr. Krish Khan     SELECT     4.8   
524   Dr. Vivaan Iyer      SCOPE     4.8   
536  Dr. Vihaan Gupta      SCOPE     4.8   

                                style_tags  similarity_score  
453  helpful,lenient-grading,project-based          1.000000  
449          lenient-grading,project-based          0.816497  
492                helpful,lenient-grading          0.816497  
524                  helpful,project-based          0.816497  
536                  helpful,project-based          0.816497  
