# Student recommender system using Skills, Interests and Red flags

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import pickle

In [2]:
users = pd.read_csv('data/students.csv', sep=',', encoding='latin-1', on_bad_lines='skip')

In [5]:
users.head(21)

Unnamed: 0,Student-ID,Skills,Interests,Red-Flags
0,657c6cfc080a390090b50fc4,"Angular,Node.js,MongoDB,Express.js,JavaScript,...","Cats,Painting,Morning Person,Art,Sports","Smoking,Parties,Drinking"
1,657c6d9a080a390090b50fc5,"Python,Machine Learning,Django,Data Science,Da...","Parties,Drinking,Music,Smoking","Morning Person,Cats"
2,657c6e55080a390090b50fc6,"React,Next.js,JavaScript,Node.js,NoSQL,MongoDB","Cats,Cinema,Blogging,Photographing,Smoking",Dogs
3,657c6f14080a390090b50fc7,"Symfony,PHP,PostgreSQL,MySQL,Angular,SQL Serve...","Basketball,Dance,Sports,Fast Food,Travel","Drinking,Cats,Dogs"
4,657c6f9f080a390090b50fc8,"PHP,Symfony,PostgreSQL,MySQL,Angular,SQL","Sports,Painting,Writing,Reading","Drinking,Smoking"
5,657c7064080a390090b50fc9,"Cloud,DevOps,Power Automate,Testing,Automation...","Cinema,Writing,Fast Food","Drinking,Smoking"
6,657c710a080a390090b50fca,"Angular,NoSQL,MongoDB,Next.js,Power BI,Network","Cats,Fast Food,Swimming,Social Media","Drinking,Smoking"
7,657c715a080a390090b50fcb,"Power Automate,Machine Learning,Data Science,P...","Football,Morning Person,Art,Yoga","Social Media,Smoking"
8,657c72c5080a390090b50fcc,"Cyber Security,Network,Automation,Data Science...","Writing,Basketball,Dance,Yoga,Fishing","Parties,Drinking"
9,657c733b080a390090b50fcd,"Power BI,Data Science,SQL,NoSQL,Data Analysis","Cinema,Tennis,Cats","Drinking,Smoking"


In [7]:
users.columns

Index(['Student-ID', 'Skills', 'Interests', 'Red-Flags'], dtype='object')

In [8]:
users.rename(columns={
    "Student-ID": "student_id",
    "Skills": "skills",
    "Interests": "interests",
    "Red-Flags": "red_flags"
}, inplace=True)

In [9]:
users.head()

Unnamed: 0,student_id,skills,interests,red_flags
0,657c6cfc080a390090b50fc4,"Angular,Node.js,MongoDB,Express.js,JavaScript,...","Cats,Painting,Morning Person,Art,Sports","Smoking,Parties,Drinking"
1,657c6d9a080a390090b50fc5,"Python,Machine Learning,Django,Data Science,Da...","Parties,Drinking,Music,Smoking","Morning Person,Cats"
2,657c6e55080a390090b50fc6,"React,Next.js,JavaScript,Node.js,NoSQL,MongoDB","Cats,Cinema,Blogging,Photographing,Smoking",Dogs
3,657c6f14080a390090b50fc7,"Symfony,PHP,PostgreSQL,MySQL,Angular,SQL Serve...","Basketball,Dance,Sports,Fast Food,Travel","Drinking,Cats,Dogs"
4,657c6f9f080a390090b50fc8,"PHP,Symfony,PostgreSQL,MySQL,Angular,SQL","Sports,Painting,Writing,Reading","Drinking,Smoking"


In [10]:
users.fillna('', inplace=True)

In [11]:
users.head()

Unnamed: 0,student_id,skills,interests,red_flags
0,657c6cfc080a390090b50fc4,"Angular,Node.js,MongoDB,Express.js,JavaScript,...","Cats,Painting,Morning Person,Art,Sports","Smoking,Parties,Drinking"
1,657c6d9a080a390090b50fc5,"Python,Machine Learning,Django,Data Science,Da...","Parties,Drinking,Music,Smoking","Morning Person,Cats"
2,657c6e55080a390090b50fc6,"React,Next.js,JavaScript,Node.js,NoSQL,MongoDB","Cats,Cinema,Blogging,Photographing,Smoking",Dogs
3,657c6f14080a390090b50fc7,"Symfony,PHP,PostgreSQL,MySQL,Angular,SQL Serve...","Basketball,Dance,Sports,Fast Food,Travel","Drinking,Cats,Dogs"
4,657c6f9f080a390090b50fc8,"PHP,Symfony,PostgreSQL,MySQL,Angular,SQL","Sports,Painting,Writing,Reading","Drinking,Smoking"


In [12]:
users['skills'] = users['skills'].apply(lambda x: x.split(','))
users['interests'] = users['interests'].apply(lambda x: x.split(','))
users['red_flags'] = users['red_flags'].apply(lambda x: x.split(','))

In [13]:
print(users.sample(4))

                  student_id  \
14  657c76ec080a390090b50fd2   
9   657c733b080a390090b50fcd   
3   657c6f14080a390090b50fc7   
10  657c7466080a390090b50fce   

                                               skills  \
14  [Machine Learning, Power Automate, Automation,...   
9   [Power BI, Data Science, SQL, NoSQL, Data Anal...   
3   [Symfony, PHP, PostgreSQL, MySQL, Angular, SQL...   
10  [React, Next.js, Node.js, Express.js, JavaScri...   

                                            interests               red_flags  
14                           [Painting, Dogs, Travel]        [Drinking, Dogs]  
9                              [Cinema, Tennis, Cats]     [Drinking, Smoking]  
3      [Basketball, Dance, Sports, Fast Food, Travel]  [Drinking, Cats, Dogs]  
10  [Smoking, Drinking, Parties, Fast Food, Photog...        [Morning Person]  


In [14]:
mlb = MultiLabelBinarizer()
skills_onehot = pd.DataFrame(mlb.fit_transform(users['skills']), columns=mlb.classes_)
interests_onehot = pd.DataFrame(mlb.fit_transform(users['interests']), columns=mlb.classes_)
red_flags_onehot = pd.DataFrame(mlb.fit_transform(users['red_flags']), columns=mlb.classes_)

In [15]:
skills_onehot

Unnamed: 0,Android Studio,Angular,Automation,C#,Cloud,Cyber Security,Data Analysis,Data Science,Dev Mobile,DevOps,...,Power BI,Python,React,SQL,SQL Server,Security,Spring Boot,Symfony,Testing,Xamarin
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
5,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
6,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,1,0,0,...,1,0,0,1,0,0,0,0,0,0


In [16]:
encoded_features = pd.concat([skills_onehot, interests_onehot, red_flags_onehot], axis=1)

In [17]:
skills_matrix = csr_matrix(encoded_features)

In [18]:
model = NearestNeighbors(algorithm='brute')
model.fit(skills_matrix)

In [19]:
pickle.dump(model, open('artifacts/model.pkl', 'wb'))
pickle.dump(users['student_id'], open('artifacts/students_id.pkl', 'wb'))
pickle.dump(users, open('artifacts/students.pkl', 'wb'))
pickle.dump(skills_matrix, open('artifacts/skills_matrix.pkl', 'wb'))

In [28]:
def recommend_student(student_id):
    if student_id not in users['student_id'].values:
        print(f"Student with ID {student_id} not found.")
        return

    student_index = np.where(users['student_id'] == student_id)[0][0]
    current_student_red_flags = users.loc[student_index, 'red_flags']
    
    # Find red flags to exclude
    exclude_red_flags = set(flag for flag in current_student_red_flags if 
                            (users['interests'].apply(lambda x: flag in x)).sum() >= 1)
    
    # Find students to exclude
    exclude_students = set()
    for index, row in users.iterrows():
        if any(flag in row['interests'] for flag in exclude_red_flags):
            exclude_students.add(row['student_id'])
    
    # Find similar students using the model
    distance, suggestion = model.kneighbors(skills_matrix[student_index, :].reshape(1, -1), n_neighbors=len(users))
    
    # Calculate Jaccard similarity and sort recommended students in descending order
    def jaccard_similarity(set1, set2):
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return intersection / union if union != 0 else 0

    selected_student_skills = set(users.loc[student_index, 'skills'])
    recommended_students = []

    # Append recommended student IDs, excluding those with common red_flags
    for i in range(len(suggestion)):
        similar_students = users['student_id'].iloc[suggestion[i]]
        for j in similar_students:
            if j not in exclude_students:
                recommended_students.append(j)

    # Sort recommended students by Jaccard similarity in descending order
    recommended_students.sort(key=lambda x: jaccard_similarity(selected_student_skills, set(users.loc[users['student_id'] == x, 'skills'].iloc[0])), reverse=True)

    # Print the IDs of recommended students
    print("Recommended Students IDs:")
    for student_id in recommended_students:
        print(student_id)

# Example usage
recommend_student("your_student_id")


Student with ID your_student_id not found.


In [29]:
student_id_to_recommend = "657c6cfc080a390090b50fc4"
recommend_student(student_id_to_recommend)

Recommended Students IDs:
657c6cfc080a390090b50fc4
657d61ee5930ff12d40dba61
657d6b345930ff12d40dba62
657c710a080a390090b50fca
657c75ee080a390090b50fd1
65a3a30440b7be1d2c8bc6a5
65a24ba65798c60d3e286f56
657c6f9f080a390090b50fc8
657c7064080a390090b50fc9
657c6f14080a390090b50fc7
657c74db080a390090b50fcf
657c7539080a390090b50fd0
657c733b080a390090b50fcd
657c76ec080a390090b50fd2
657c715a080a390090b50fcb
657c72c5080a390090b50fcc
