In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle

# Read the dataset
df = pd.read_csv("dataset/csv/survey_results_public.csv")

# Columns of interest
columns_of_interest = [
    'LanguageWantToWorkWith',
    'DatabaseWantToWorkWith',
    'PlatformWantToWorkWith',
    'WebframeWantToWorkWith',
    'DevType',
]

# Select columns of interest
df_selected = df[columns_of_interest]

# Reduce the Dataset Size
df_selected = df_selected.head(5000)

# Aggregate skills
df_selected['AggregatedSkills'] = df_selected.apply(lambda row: ' '.join(str(x) for x in row if pd.notna(x)), axis=1)
df_selected['AggregatedSkills'] = df_selected['AggregatedSkills'].fillna('')

# Create a DataFrame for the user
user_skills = {
    'LanguageHaveWorkedWith': 'Python',
    'DatabaseHaveWorkedWith': 'PostgreSQL',
    'PlatformHaveWorkedWith': 'AWS',
    'WebframeHaveWorkedWith': 'Django',
    'DevType': 'Developer, full-stack',
}

user_df = pd.DataFrame(user_skills, index=[0])
user_combined_df = pd.concat([df_selected, user_df]).reset_index(drop=True)
user_combined_df['AggregatedSkills'] = user_combined_df['AggregatedSkills'].fillna('')

# Print memory usage
# print(user_combined_df.memory_usage(deep=True))

# Create a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the skills column
tfidf_matrix = tfidf_vectorizer.fit_transform(user_combined_df['AggregatedSkills'])

# Calculate the cosine similarity between skills
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Get the index of the user in the DataFrame
user_index = user_combined_df.shape[0] - 1

# Get the similarity scores for the user
user_similarity_scores = cosine_sim[user_index]

# Get the indices of the top N similar users
top_similar_users = user_similarity_scores.argsort()[:-6:-1]

# Display the top similar users
# print("Top 5 Similar Users:")
# print(user_combined_df.loc[top_similar_users, 'DevType'])

# Define categories
categories = [
    'LanguageWantToWorkWith',
    'DatabaseWantToWorkWith',
    'PlatformWantToWorkWith',
    'WebframeWantToWorkWith',
]

# Create a dictionary to store recommended skills for each category
recommended_skills_dict = {category: set() for category in categories}

# Loop through the top similar users and update the recommended skills dictionary
for index in top_similar_users:
    for category in categories:
        skills = str(user_combined_df.loc[index, category]).lower()
        recommended_skills_dict[category].update(skills.split())

# Remove the skills the user already has for each category
user_skills_dict = {category: set(str(user_combined_df.loc[user_index, category]).lower().split()) for category in categories}
for category in categories:
    recommended_skills_dict[category] -= user_skills_dict[category]

# Display the top 3 recommended skills for each category
for category in categories:
    print(f"\nTop 3 Recommended {category}:")
    recommended_skills = list(recommended_skills_dict[category])[:3]
    print(", ".join(recommended_skills))

# Save the models and data
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

with open('cosine_sim.pkl', 'wb') as file:
    pickle.dump(cosine_sim, file)

with open('user_combined_df.pkl', 'wb') as file:
    pickle.dump(user_combined_df, file)

# Load the models and data
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf_vectorizer = pickle.load(file)

with open('cosine_sim.pkl', 'rb') as file:
    loaded_cosine_sim = pickle.load(file)

with open('user_combined_df.pkl', 'rb') as file:
    loaded_user_combined_df = pickle.load(file)



Top 3 Recommended LanguageWantToWorkWith:
assembly;c;c#;c++;javascript;typescript, shells);javascript;python;typescript, (all

Top 3 Recommended DatabaseWantToWorkWith:
realtime, cassandra;mysql;postgresql, database;sqlite

Top 3 Recommended PlatformWantToWorkWith:
ocean;heroku;netlify;vmware, (aws);digital, now

Top 3 Recommended WebframeWantToWorkWith:
django;react;solid.js;spring, rails, angular;deno;ruby
