In [185]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [186]:
df  = pd.read_csv('../dataset/user_cleaned.csv')

In [187]:
df.head()

Unnamed: 0,id,language,currentEducation,experienceYears,interest_list,skill_list,learningResource_list,tools_list
0,05bad4a0-5d10-4b4f-ab27-0e54403bdda4,English,Elementary,1,"['Cognitive Computing', 'Neural Networks', 'AI...","['Recommender Systems', 'AI in Education']","['Stanford Online', 'Harvard Online', 'MIT Ope...","['DataRobot', 'Microsoft Azure AI', 'OpenAI GP..."
1,05cd7644-56a9-4de9-83b2-37909a575869,Thai,Master Degree,4,"['Speech Recognition', 'AI in Finance', 'AI Et...","['Neural Networks', 'Reinforcement Learning', ...","['Udemy', 'YouTube', 'edX', 'DeepLearning.ai',...","['PyTorch', 'Amazon SageMaker', 'RapidMiner', ..."
2,06a5167b-429c-40d7-af95-8c08e3ae8d3c,Mandarin,Undergraduate Degree,3,"['AI in Education', 'Cognitive Computing', 'Au...","['Deep Learning', 'Cognitive Computing', 'Reco...","['DeepLearning.ai', '', 'Coursera', 'MIT OpenC...","['Weka', 'Hugging Face Transformers', 'Keras',..."
3,0a7c5ce8-da53-4462-98b4-15b48159a72c,Arabic,Undergraduate Degree,3,"['Recommender Systems', 'AI Planning and Optim...","['Autonomous Vehicles', '', 'Cognitive Computi...","['MIT OpenCourseWare', 'Stanford Online']","['Scikit-learn', '', 'Weka', 'Microsoft Azure ..."
4,0d1bfc79-4a6b-453c-a275-66924a1c8b5e,Portuguese,Master Degree,4,"['Expert Systems', 'Generative AI', 'Computer ...","['Robotics', 'Reinforcement Learning', 'Natura...","['GitHub', 'Stanford Online']","['RapidMiner', '', 'OpenAI GPT', 'TensorFlow',..."


In [188]:
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_cat = encoder.fit_transform(df[['language', 'currentEducation']]).toarray()

# Binarize the array columns
mlb_interest = MultiLabelBinarizer()
interest_binarized = mlb_interest.fit_transform(df['interest_list'])

mlb_skill = MultiLabelBinarizer()
skill_binarized = mlb_skill.fit_transform(df['skill_list'])

mlb_learning = MultiLabelBinarizer()
learning_binarized = mlb_learning.fit_transform(df['learningResource_list'])

mlb_tools = MultiLabelBinarizer()
tools_binarized = mlb_tools.fit_transform(df['tools_list'])

# Combine all the features into one matrix
X = np.hstack([encoded_cat, interest_binarized, skill_binarized, learning_binarized, tools_binarized, 
               df[['experienceYears']].values])

# Create and fit the model
knn = NearestNeighbors(n_neighbors=3, metric='euclidean')
knn.fit(X)

In [189]:
# Input data for which you want the k nearest neighbors
input_data = pd.DataFrame({
    'language': ['Thai'], 
    'currentEducation': ['Masters Degree'],
    'experienceYears': [5],
    'interest_list': [['Cognitive Computing']],
    'skill_list': [['Recommender Systems']],
    'learningResource_list': [['Stanford Online']],
    'tools_list': [['RapidMiner']]
})

# Preprocess the input data
input_cat = encoder.transform(input_data[['language', 'currentEducation']]).toarray()
input_interest = mlb_interest.transform(input_data['interest_list'])
input_skill = mlb_skill.transform(input_data['skill_list'])
input_learning = mlb_learning.transform(input_data['learningResource_list'])
input_tools = mlb_tools.transform(input_data['tools_list'])

# Combine the processed input
input_combined = np.hstack([input_cat, input_interest, input_skill, input_learning, input_tools, 
                             input_data[['experienceYears']].values])

# Find the nearest neighbors
distances, indices = knn.kneighbors(input_combined)

print(indices)

# Get the nearest neighbor IDs
nearest_ids = df['id'].iloc[indices[0]].values

print("Nearest IDs:", nearest_ids)

[[14 86 56]]
Nearest IDs: ['20034b71-437b-4792-abbc-68ec02023b48'
 'b55b1cb8-e217-4aaa-aa2c-4b932d915692'
 '7937a2ec-700e-4427-b7ac-f3f4d29571b7']




In [191]:
import pickle

# Assuming you have your fitted model and transformers as follows:
# knn, encoder, mlb_interest, mlb_skill, mlb_learning, mlb_tools

pathname = "../pickle/"

# Save the model
with open(pathname + 'knn_model.pkl', 'wb') as model_file:
    pickle.dump(knn, model_file)

# Save the encoders and binarizers
with open(pathname + 'encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

with open(pathname + 'mlb_interest.pkl', 'wb') as mlb_interest_file:
    pickle.dump(mlb_interest, mlb_interest_file)

with open(pathname + 'mlb_skill.pkl', 'wb') as mlb_skill_file:
    pickle.dump(mlb_skill, mlb_skill_file)

with open(pathname + 'mlb_learning.pkl', 'wb') as mlb_learning_file:
    pickle.dump(mlb_learning, mlb_learning_file)

with open(pathname + 'mlb_tools.pkl', 'wb') as mlb_tools_file:
    pickle.dump(mlb_tools, mlb_tools_file)