In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Read the dataset
df = pd.read_csv("dataset/csv/survey_results_public.csv")

# Columns of interest
columns_of_interest = [
    'LanguageWantToWorkWith',
    'DatabaseWantToWorkWith',
    'PlatformWantToWorkWith',
    'WebframeWantToWorkWith',
    'DevType',
]

# Select columns of interest
df_selected = df[columns_of_interest]

# Reduce the Dataset Size
df_selected = df_selected.head(2000)

# Aggregate skills
df_selected['AggregatedSkills'] = df_selected.apply(lambda row: ' '.join(str(x) for x in row if pd.notna(x)), axis=1)
df_selected['AggregatedSkills'] = df_selected['AggregatedSkills'].fillna('')

# Create a DataFrame for the user
user_skills = {
    'LanguageHaveWorkedWith': 'Python',
    'DatabaseHaveWorkedWith': 'PostgreSQL',
    'PlatformHaveWorkedWith': 'AWS',
    'WebframeHaveWorkedWith': 'Django',
    'DevType': 'Developer, full-stack',
}

user_df = pd.DataFrame(user_skills, index=[0])
user_combined_df = pd.concat([df_selected, user_df]).reset_index(drop=True)
user_combined_df['AggregatedSkills'] = user_combined_df['AggregatedSkills'].fillna('')

# Print memory usage
# print(user_combined_df.memory_usage(deep=True))

# Create a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the skills column
tfidf_matrix = tfidf_vectorizer.fit_transform(user_combined_df['AggregatedSkills'])

# Calculate the cosine similarity between skills
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Get the index of the user in the DataFrame
user_index = user_combined_df.shape[0] - 1

# Get the similarity scores for the user
user_similarity_scores = cosine_sim[user_index]

# Get the indices of the top N similar users
top_similar_users = user_similarity_scores.argsort()[:-6:-1]

# Display the top similar users
# print("Top 5 Similar Users:")
# print(user_combined_df.loc[top_similar_users, 'DevType'])

# Get the recommended skills based on the most similar users
recommended_skills = set()
for index in top_similar_users:
    skills = user_combined_df.loc[index, 'AggregatedSkills'].lower().split()
    recommended_skills.update(skills)

# Remove the skills the user already has
user_skills_set = set(user_combined_df.loc[user_index, 'AggregatedSkills'].lower().split())
recommended_skills -= user_skills_set

print("\nRecommended Skills:")
print(recommended_skills)


Recommended Skills:
{'shells);html/css;javascript;python', 'mobile', 'typescript', 'database;sqlite', 'oracle;postgresql;sqlite', 'c++;elixir;go;java;kotlin;matlab;python', 'angular;angularjs', 'developer,', 'firebase', 'django;fastapi;laravel;vue.js', 'c++;javascript;lua;php;python;ruby;typescript', '(all', 'realtime', 'bash/shell', 'django;vue.js', 'full-stack'}
