# Student recommender system using Skills, Interests and Red flags

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import pickle

In [8]:
users = pd.read_csv('data/students.csv', sep=',', encoding='latin-1', on_bad_lines='skip')

In [9]:
users.head(30)

Unnamed: 0,Student-ID,Skills,Interests,Red-Flags
0,65787026f9984c068f184133,"Java,Angular","hhhh,jjjjj,hhdjdjdj","hahahah,hdhsj"
1,65787045f9984c068f184134,"NodeJs,Angular","oooo,rrrrr,qqqqqqq","hahahah,hdhsj"
2,6578832b0abd9e53cb847bf0,"NodeJs,Angular","cats,dogs,qqqqqqq","hahahah,hdhsj"


In [10]:
users.columns

Index(['Student-ID', 'Skills', 'Interests', 'Red-Flags'], dtype='object')

In [11]:
users.rename(columns={
    "Student-ID": "student_id",
    "Skills": "skills",
    "Interests": "interests",
    "Red-Flags": "red_flags"
}, inplace=True)

In [12]:
users.head()

Unnamed: 0,student_id,skills,interests,red_flags
0,65787026f9984c068f184133,"Java,Angular","hhhh,jjjjj,hhdjdjdj","hahahah,hdhsj"
1,65787045f9984c068f184134,"NodeJs,Angular","oooo,rrrrr,qqqqqqq","hahahah,hdhsj"
2,6578832b0abd9e53cb847bf0,"NodeJs,Angular","cats,dogs,qqqqqqq","hahahah,hdhsj"


In [13]:
users.fillna('', inplace=True)

In [14]:
users.head()

Unnamed: 0,student_id,skills,interests,red_flags
0,65787026f9984c068f184133,"Java,Angular","hhhh,jjjjj,hhdjdjdj","hahahah,hdhsj"
1,65787045f9984c068f184134,"NodeJs,Angular","oooo,rrrrr,qqqqqqq","hahahah,hdhsj"
2,6578832b0abd9e53cb847bf0,"NodeJs,Angular","cats,dogs,qqqqqqq","hahahah,hdhsj"


In [15]:
users['skills'] = users['skills'].apply(lambda x: x.split(','))
users['interests'] = users['interests'].apply(lambda x: x.split(','))
users['red_flags'] = users['red_flags'].apply(lambda x: x.split(','))

In [16]:
print(users.sample(3))

                 student_id             skills                interests  \
1  65787045f9984c068f184134  [NodeJs, Angular]   [oooo, rrrrr, qqqqqqq]   
2  6578832b0abd9e53cb847bf0  [NodeJs, Angular]    [cats, dogs, qqqqqqq]   
0  65787026f9984c068f184133    [Java, Angular]  [hhhh, jjjjj, hhdjdjdj]   

          red_flags  
1  [hahahah, hdhsj]  
2  [hahahah, hdhsj]  
0  [hahahah, hdhsj]  


In [17]:
mlb = MultiLabelBinarizer()
skills_onehot = pd.DataFrame(mlb.fit_transform(users['skills']), columns=mlb.classes_)
interests_onehot = pd.DataFrame(mlb.fit_transform(users['interests']), columns=mlb.classes_)
red_flags_onehot = pd.DataFrame(mlb.fit_transform(users['red_flags']), columns=mlb.classes_)

In [18]:
skills_onehot

Unnamed: 0,Angular,Java,NodeJs
0,1,1,0
1,1,0,1
2,1,0,1


In [28]:
encoded_features = pd.concat([skills_onehot, interests_onehot, red_flags_onehot], axis=1)

In [29]:
skills_matrix = csr_matrix(encoded_features)

In [42]:
model = NearestNeighbors(algorithm='brute')
model.fit(skills_matrix)

In [43]:
pickle.dump(model, open('artifacts/model.pkl', 'wb'))
pickle.dump(users['student_id'], open('artifacts/students_id.pkl', 'wb'))
pickle.dump(users, open('artifacts/students.pkl', 'wb'))
pickle.dump(skills_matrix, open('artifacts/skills_matrix.pkl', 'wb'))

In [44]:
def recommend_student(student_id):
    if student_id not in users['student_id'].values:
        print(f"Student with ID {student_id} not found.")
        return

    student_index = np.where(users['student_id'] == student_id)[0][0]
    current_student_red_flags = users.loc[student_index, 'red_flags']
    
    # Find red flags to exclude
    exclude_red_flags = set(flag for flag in current_student_red_flags if 
                            (users['interests'].apply(lambda x: flag in x)).sum() >= 3)
    
    # Find students to exclude
    exclude_students = set()
    for index, row in users.iterrows():
        if any(flag in row['interests'] for flag in exclude_red_flags):
            exclude_students.add(row['student_id'])
    
    # Find similar students using the model
    distance, suggestion = model.kneighbors(skills_matrix[student_index, :].reshape(1, -1), n_neighbors=3)
    
    # Print recommended student IDs, excluding those with common red_flags
    for i in range(len(suggestion)):
        similar_students = users['student_id'].iloc[suggestion[i]]
        for j in similar_students:
            if j not in exclude_students:
                print(j)

In [45]:
student_id_to_recommend = "65787045f9984c068f184134"
recommend_student(student_id_to_recommend)

65787045f9984c068f184134
6578832b0abd9e53cb847bf0
65787026f9984c068f184133
