In [2]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-linux_x86_64.whl size=3195756 sha256=3d672c4a59223e9add52919ac3a5432725998e9afd6a57f84ea6b6e6a49d2fc9
  Stored in directory: /root/.cache/pip/wheels/c6/3a/46/9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [60]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, train_test_split

In [61]:
# Select a random sample of 10% of the job offers and job applications
job_offers = pd.read_csv('FinalDataSetJobOffers.csv').sample(frac=0.1, random_state=42)
job_seekers= pd.read_csv('CvDatasetFinal_3.csv').sample(frac=0.1, random_state=42)

In [62]:
# Check for missing values in the job offers dataframe
print("Missing values in job_offers:")
print(job_offers.isna().sum())

# Check for missing values in the job applications dataframe
print("Missing values in job_applications:")
print(job_seekers.isna().sum())

Missing values in job_offers:
Job post               0
Company name           0
Job description        0
Required skills        0
Location               0
Company rating         0
Company review         0
Experience required    0
dtype: int64
Missing values in job_applications:
Category             0
Name                 0
Email                0
Phone                0
Education            0
Skills               0
Experience           2
Experience_Rating    0
dtype: int64


In [63]:
# Drop rows with missing values in job_offers and job_applications dataframes
job_offers.dropna(inplace=True)
job_seekers.dropna(inplace=True)


In [64]:
# Preprocess job offers data
tfidf = TfidfVectorizer(stop_words='english')
job_offers_matrix = tfidf.fit_transform(job_offers['Job description'])


In [65]:
# Define a function for generating content-based recommendations
def content_based_recommendations(user_profile, job_offers_matrix, job_offers, n_recommendations=10):
    user_profile_matrix = tfidf.transform([user_profile])
    similarities = cosine_similarity(user_profile_matrix, job_offers_matrix)
    similar_jobs_indices = similarities.argsort()[0][::-1]
    recommended_jobs = job_offers.iloc[similar_jobs_indices][:n_recommendations]
    return recommended_jobs

In [66]:
def content_recommendations(profile, offers_df, top_n):
    # Compute the cosine similarity between the job descriptions and the user profile
    profile_vector = tfidf.transform([profile])
    cosine_similarities = cosine_similarity(profile_vector, job_offers_matrix).flatten()
    
    # Sort the job offers by their cosine similarity to the user profile
    offers_df['cosine_similarity'] = cosine_similarities
    offers_df = offers_df.sort_values(by='cosine_similarity', ascending=False)
    
    # Return the top N job offers
    top_offers = offers_df.head(top_n)
    top_offers = top_offers[['job_id', 'description', 'cosine_similarity']]
    return top_offers

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def hybrid_recommendations(user_profile, job_offers_matrix, job_offers, top_n=5):
    # create a new CountVectorizer with the same vocabulary as the TfidfVectorizer for the user profile
    cv = CountVectorizer(vocabulary=tfidf.vocabulary_)
    
    # transform the job descriptions to a count matrix
    job_desc_count = cv.transform(job_offers['Job description'])
    
    # concatenate the user profile and job description count matrices
    combined_matrix = scipy.sparse.vstack([tfidf_matrix, job_desc_count])
    
    # calculate the cosine similarity between the user profile and job offers
    cosine_sim = cosine_similarity(combined_matrix)
    
    # get the indices of the top_n job offers
    job_indices = cosine_sim[0].argsort()[-top_n-1:-1][::-1]
    
    # get the corresponding job IDs and descriptions
    job_ids = job_offers.iloc[job_indices]['job_id'].values
    job_descs = job_offers.iloc[job_indices]['description'].values
    
    print( list(zip(job_ids, job_descs)))


In [70]:
print(job_offers.columns)

Index(['Job post', 'Company name', 'Job description', 'Required skills',
       'Location', 'Company rating', 'Company review', 'Experience required'],
      dtype='object')


In [71]:
vectorizer = TfidfVectorizer()
job_offers_matrix = vectorizer.fit_transform(job_offers['Job description'])

feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['00' '000' '0009' ... 'zuora' 'zweig' 'âmanhattan']


In [72]:
print(job_offers_matrix.getnnz(axis=0))

[ 9 52  1 ...  1  1  1]


In [79]:
print("Shape of job_offers_matrix:", job_offers_matrix.shape)
print("Shape of user_profile_matrix.T:", user_profile_matrix.T.shape)


Shape of job_offers_matrix: (3357, 8041)
Shape of user_profile_matrix.T: (8041, 1)


In [59]:
user_profile = 'I am a software engineer with experience in Java, Python, and SQL'
num_words = len(user_profile.split())
print(num_words)


12


In [85]:
# Set the number of top job offers to recommend to the user
n = 5

# Create a new user profile string
user_profile = "python machine learning data analysis"

# Transform the user profile string into a matrix
user_profile_matrix = vectorizer.transform([user_profile])

# Calculate the cosine similarity between the user profile and job offers
cosine_sim = cosine_similarity(user_profile_matrix, job_offers_matrix)

# Get the indices of the top_n job offers
top_n = cosine_sim.argsort()[0][-n:]

# Get the corresponding job IDs and descriptions
job_ids = job_offers.iloc[top_n]['Job post'].values
job_descs = job_offers.iloc[top_n]['Job description'].values

# Print the top_n job offers
for job_id, job_desc in zip(job_ids, job_descs):
    print(job_id)
    print(job_desc)
    print('---------------------')


data scientist
optim candid respons naval nuclear laboratori seek data scientist join team dedic collect transform data naval nuclear fleet analyt throughout laboratori present in-depth analysi clear concis manner data-ori approach problem solv decis make build predict model character variou system process refin verifi integr data use analysi data scientist display great collabor interperson skill written verbal lead improv project provid regular updat organ prefer skill includ good script program skill experi python r. experi data visual packag tableau sa rstudio understand machine-learn applic profici work databas
---------------------
consult analyt der
experi data scienc machin learn python r. advanc program ...
---------------------
data scientist
look data scientist analyz larg amount raw inform find pattern help improv compani reli build data product extract valuabl busi insight role highli analyt knack analysi math statist critic think problem-solv skill essenti interpret data 