# Student recommender system using Skills, Interests and Red flags

In [368]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import pickle

In [369]:
users = pd.read_csv('data/students_v1.csv', sep=';', encoding='latin-1', on_bad_lines='skip')

In [370]:
users.head(30)

Unnamed: 0,Student-ID,Skills,Interests,Red-Flags
0,1,"Angular, Spring Boot, Java","Football, Cats, Dogs",Drinking
1,2,"React, NodeJS, ExpressJS, MongoDB","Painting, Drinking,Parties",Sports
2,3,"Power BI, Spark, Haddoop","Sports, Cinema, Morning Person","Drinking,Cats, Dogs, Parties"
3,4,"JavaScript, Angular, JEE, MySQL","Camping, Football, Fast Food",Cats
4,5,"Angular, Spring Boot, MongoDB","Painting, Blogging, Writing",Morning Person
5,6,"NextJS, MongoDB, NodeJS, ExpressJS","Football, Travel, Smooking, Parties","Photographing, Social Media, Dogs"
6,7,"PowerBI, Python, SQL, Machine Learning","Cinema, Blogging, Cats",Drinking
7,8,"Machine Learning, Python","Reading, Art, Dogs",Cats
8,9,"Machine Learning, Python, NoSQL","Yoga, Dance, Drinking, Parties","Dogs, Cats"
9,10,"Machine Learning, Python, NoSQL","Painting, Basketball","Smoking, Drinking"


In [371]:
users.columns

Index(['Student-ID', 'Skills', 'Interests', 'Red-Flags'], dtype='object')

In [372]:
users.rename(columns={
    "Student-ID": "student_id",
    "Skills": "skills",
    "Interests": "interests",
    "Red-Flags": "red_flags"
}, inplace=True)

In [373]:
users.head()

Unnamed: 0,student_id,skills,interests,red_flags
0,1,"Angular, Spring Boot, Java","Football, Cats, Dogs",Drinking
1,2,"React, NodeJS, ExpressJS, MongoDB","Painting, Drinking,Parties",Sports
2,3,"Power BI, Spark, Haddoop","Sports, Cinema, Morning Person","Drinking,Cats, Dogs, Parties"
3,4,"JavaScript, Angular, JEE, MySQL","Camping, Football, Fast Food",Cats
4,5,"Angular, Spring Boot, MongoDB","Painting, Blogging, Writing",Morning Person


In [374]:
users.fillna('', inplace=True)

In [375]:
users.head()

Unnamed: 0,student_id,skills,interests,red_flags
0,1,"Angular, Spring Boot, Java","Football, Cats, Dogs",Drinking
1,2,"React, NodeJS, ExpressJS, MongoDB","Painting, Drinking,Parties",Sports
2,3,"Power BI, Spark, Haddoop","Sports, Cinema, Morning Person","Drinking,Cats, Dogs, Parties"
3,4,"JavaScript, Angular, JEE, MySQL","Camping, Football, Fast Food",Cats
4,5,"Angular, Spring Boot, MongoDB","Painting, Blogging, Writing",Morning Person


In [376]:
users['skills'] = users['skills'].apply(lambda x: x.split(','))
users['interests'] = users['interests'].apply(lambda x: x.split(','))
users['red_flags'] = users['red_flags'].apply(lambda x: x.split(','))

In [377]:
print(users.sample(3))

    student_id                                   skills  \
5            6  [NextJS,  MongoDB,  NodeJS,  ExpressJS]   
2            3             [Power BI,  Spark,  Haddoop]   
10          11     [Machine Learning,  Python,  Django]   

                                        interests  \
5        [Football,  Travel,  Smooking,  Parties]   
2              [Sports,  Cinema,  Morning Person]   
10  [Cinema,  Blogging,  Photographing,  Parties]   

                                red_flags  
5   [Photographing,  Social Media,  Dogs]  
2       [Drinking, Cats,  Dogs,  Parties]  
10                              [Smoking]  


In [378]:
mlb = MultiLabelBinarizer()
skills_onehot = pd.DataFrame(mlb.fit_transform(users['skills']), columns=mlb.classes_)
interests_onehot = pd.DataFrame(mlb.fit_transform(users['interests']), columns=mlb.classes_)
red_flags_onehot = pd.DataFrame(mlb.fit_transform(users['red_flags']), columns=mlb.classes_)

In [379]:
skills_onehot

Unnamed: 0,Flutter,Angular,Automation,C#,Cloud,Data Analysis,Dev Mobile,Django,ExpressJS,Haddoop,...,Flutter.1,JavaScript,Machine Learning,NextJS,Power BI,PowerBI,React,Symfony,Testing,Unit Test
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [380]:
encoded_features = pd.concat([skills_onehot, interests_onehot, red_flags_onehot], axis=1)

In [381]:
skills_matrix = csr_matrix(encoded_features)

In [382]:
model = NearestNeighbors(algorithm='brute')
model.fit(skills_matrix)

In [383]:
pickle.dump(model, open('artifacts/model.pkl', 'wb'))
pickle.dump(students['student_id'], open('artifacts/students_id.pkl', 'wb'))
pickle.dump(students, open('artifacts/students.pkl', 'wb'))

In [384]:
def recommend_student(student_id):
    if student_id not in users['student_id'].values:
        print(f"Student with ID {student_id} not found.")
        return

    student_index = np.where(users['student_id'] == student_id)[0][0]
    current_student_red_flags = users.loc[student_index, 'red_flags']
    
    # Find red flags to exclude
    exclude_red_flags = set(flag for flag in current_student_red_flags if 
                            (users['interests'].apply(lambda x: flag in x)).sum() >= 3)
    
    # Find students to exclude
    exclude_students = set()
    for index, row in users.iterrows():
        if any(flag in row['interests'] for flag in exclude_red_flags):
            exclude_students.add(row['student_id'])
    
    # Find similar students using the model
    distance, suggestion = model.kneighbors(skills_matrix[student_index, :].reshape(1, -1), n_neighbors=10)
    
    # Print recommended student IDs, excluding those with common red_flags
    for i in range(len(suggestion)):
        similar_students = users['student_id'].iloc[suggestion[i]]
        for j in similar_students:
            if j not in exclude_students:
                print(j)

In [388]:
student_id_to_recommend = 11
recommend_student(student_id_to_recommend)

11
10
7
8
9
22
24
23
12
20
