# **00 Import Library**

In [22]:
pip install faiss-cpu==1.11.0.post1

Collecting faiss-cpu==1.11.0.post1
  Using cached faiss_cpu-1.11.0.post1-cp39-cp39-win_amd64.whl (14.9 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Note: you may need to restart the kernel to use updated packages.


In [29]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text Vectorization
import gensim
from gensim.models import Word2Vec, FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.utils import simple_preprocess

# Modeling
from sklearn.metrics.pairwise import cosine_similarity
from gensim.similarities import WmdSimilarity
from sentence_transformers import SentenceTransformer

# **01 Load Dataset**

In [4]:
df_job = pd.read_csv(r"C:\Users\hp\COURSE\SISTECH\PP_MachineLearningOperations_TalithaRahmadewatiW\FINAL PROJECT\dataset\cleaned_jobstreet.csv")
df_courses = pd.read_csv(r"C:\Users\hp\COURSE\SISTECH\PP_MachineLearningOperations_TalithaRahmadewatiW\FINAL PROJECT\dataset\cleaned_classentral.csv")

In [15]:
df_job

Unnamed: 0,title,category,tokens,w2v_vec
0,data scientist,mathematics statistics information sciences sc...,"['data', 'scientist', 'mathematics', 'statisti...","[-0.10176187, 0.0332709, 0.15051332, 0.0856378..."
1,senior data scientist genai hybrid working,other information communication technology,"['senior', 'data', 'scientist', 'genai', 'hybr...","[-0.097883716, 0.036603168, 0.15064253, 0.0839..."
2,data scientist,mathematics statistics information sciences sc...,"['data', 'scientist', 'mathematics', 'statisti...","[-0.097248085, 0.023463484, 0.15020867, 0.0985..."
3,data science engineer,engineering software information communication...,"['data', 'science', 'engineer', 'engineer', 's...","[-0.106991224, 0.045524247, 0.16322128, 0.1059..."
4,data scientist,engineering software information communication...,"['data', 'scientist', 'engineer', 'software', ...","[-0.11201963, 0.044731632, 0.15488832, 0.08253..."
...,...,...,...,...
571,bi developer,developersprogrammers information communicatio...,"['bi', 'developer', 'developersprogrammers', '...","[-0.08755468, 0.046252754, 0.1457336, 0.082536..."
572,senior sales account managersales manager data...,account relationship management sales,"['senior', 'sale', 'account', 'managersales', ...","[-0.11049973, 0.032507837, 0.14785658, 0.08316..."
573,project manager erp digital transformation,programme project management information commu...,"['project', 'manager', 'erp', 'digital', 'tran...","[-0.102200195, 0.04617636, 0.1531078, 0.076464..."
574,software engineer java,developersprogrammers information communicatio...,"['software', 'engineer', 'java', 'developerspr...","[-0.107833296, 0.05134064, 0.15680124, 0.08837..."


In [6]:
df_courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6919 entries, 0 to 6918
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     6883 non-null   object
 1   category  6919 non-null   object
 2   tokens    6919 non-null   object
dtypes: object(3)
memory usage: 162.3+ KB


In [7]:
# Gabungkan semua token jadi satu list untuk Word2Vec
all_sentences = df_job["tokens"].tolist() + df_courses["tokens"].tolist()

# **02 Text Vectorization**

## Word2Vec

In [32]:
w2v_model = Word2Vec(sentences=all_sentences, vector_size=100, window=5, min_count=2, workers=4)

def sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df_job["w2v_vec"] = df_job["tokens"].apply(lambda x: sentence_vector(x, w2v_model))
df_courses["w2v_vec"] = df_courses["tokens"].apply(lambda x: sentence_vector(x, w2v_model))


<!-- ## Sentence Tranformers -->

In [33]:
# MODEL_NAME = "all-MiniLM-L6-v2"
# model_st = SentenceTransformer(MODEL_NAME)

# # Embed and normalize text vector
# job_embeddings = model_st.encode(df_job["tokens"].tolist(), convert_to_numpy=True)
# course_embeddings = model_st.encode(df_courses["tokens"].tolist(), convert_to_numpy=True)

# # Normalize embeddings for cosine similarity
# job_embeddings = job_embeddings / np.linalg.norm(job_embeddings, axis=1, keepdims=True)
# course_embeddings = course_embeddings / np.linalg.norm(course_embeddings, axis=1, keepdims=True)


# **03 Labelling & Similarity Mapping**

## Cosine Similarity

In [34]:
# Hitung vektor rata-rata Word2Vec
def sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Buat vektor rata-rata
df_job["w2v_vec"] = df_job["tokens"].apply(lambda x: sentence_vector(x, w2v_model))
df_courses["w2v_vec"] = df_courses["tokens"].apply(lambda x: sentence_vector(x, w2v_model))

# Fungsi mapping job → course via Cosine
def map_job_to_courses_cosine(df_jobs, df_courses, top_k=3):
    course_matrix = np.stack(df_courses["w2v_vec"].values)
    results = []

    for idx, row in df_jobs.iterrows():
        job_vec = row["w2v_vec"].reshape(1, -1)
        sims = cosine_similarity(job_vec, course_matrix).flatten()
        top_indices = sims.argsort()[-top_k:][::-1]

        matches = [{
            "course_title": df_courses.iloc[i]["title"],
            "score": round(sims[i], 4)
        } for i in top_indices]

        results.append({
            "job_title": row["title"],
            "job_category": row["category"],
            "recommended_courses": matches
        })
    return results

job_course_sim_cosine = map_job_to_courses_cosine(df_job, df_courses, top_k=3)


<!-- ## WMD Similarity -->

In [21]:
# # Inisialisasi indeks WMD dari course
# wmd_index = WmdSimilarity(df_courses["tokens"].tolist(), w2v_model.wv, num_best=3)

# # Mapping job → course via WMD
# def map_job_to_courses_wmd(df_jobs, df_courses, wmd_index, top_k=3):
#     results = []

#     for idx, row in df_jobs.iterrows():
#         sims = wmd_index[row["tokens"]][:top_k]

#         matches = [{
#             "course_title": df_courses.iloc[i]["title"],
#             "score": round(score, 4)
#         } for i, score in sims]

#         results.append({
#             "job_title": row["title"],
#             "job_category": row["category"],
#             "recommended_courses": matches
#         })
#     return results

# job_course_sim_wmd = map_job_to_courses_wmd(df_job, df_courses, wmd_index, top_k=3)

# 04 Recommendation Modeling

### a. Cosine + W2Vec

In [37]:
# ============================
# COSINE SIMILARITY MODELING
# ============================
def recommend_courses_cosine(df_jobs, df_courses, top_k=3):
    course_matrix = np.stack(df_courses["w2v_vec"].values)
    recommendations = []

    for idx, row in df_jobs.iterrows():
        job_vec = row["w2v_vec"].reshape(1, -1)
        sims = cosine_similarity(job_vec, course_matrix).flatten()
        top_indices = sims.argsort()[-top_k:][::-1]

        matches = [{
            "course_title": df_courses.iloc[i]["title"],
            "score": round(sims[i], 4)
        } for i in top_indices]

        recommendations.append({
            "job": row["title"],
            "category": row["category"],
            "matches": matches
        })

    return recommendations

<!-- ### e. WMD + W2Vec -->

In [None]:
# def map_jobs_to_courses_st(df_jobs, df_courses, job_vecs, course_vecs, top_k=3):
#     results = []
#     for i, job in enumerate(df_jobs.itertuples()):
#         sim = cosine_similarity(job_vecs[i].reshape(1, -1), course_vecs).flatten()
#         top_indices = sim.argsort()[-top_k:][::-1]

#         recommendations = [{
#             "course_title": df_courses.iloc[j]["title"],
#             "score": round(sim[j], 4)
#         } for j in top_indices]

#         results.append({
#             "job_title": job.title,
#             "category": job.category,
#             "recommended_courses": recommendations
#         })
#     return results

# st_recommendations = map_jobs_to_courses_st(df_job, df_courses, job_embeddings, course_embeddings)

In [39]:
def print_recommendation_results(results, method="Cosine + W2V"):
    for res in results:
        print(f"[{method}] Job: {res['job']}")
        print(f"Category: {res['category']}")
        print("Rekomendasi Course:")
        for match in res['matches']:
            print(f"   - {match['course_title']:<50} | Score: {match['score']:.4f}")
        print()
        
# 1. Cosine
results_cosine = recommend_courses_cosine(df_job, df_courses, top_k=5)
print_recommendation_results(results_cosine[:2], method="Cosine + Word2Vec")

# # 2. WMD
# from gensim.similarities import WmdSimilarity
# wmd_index = WmdSimilarity(df_courses["tokens"].tolist(), w2v_model.wv, num_best=3)

# results_wmd = recommend_courses_wmd(df_job, df_courses, wmd_index, top_k=3)
# print_recommendation_results(results_wmd[:2], method="WMD + Word2Vec")

[Cosine + Word2Vec] Job: data scientist
Category: mathematics statistics information sciences science technology
Rekomendasi Course:
   - machine learning devops engineer                   | Score: 0.9992
   - presto on aws journey lessons learned and performance optimization | Score: 0.9990
   - introduction to the internet of things and embedded systems | Score: 0.9990
   - python for data science immersive live online      | Score: 0.9990
   - the power of machine learning boost business accumulate clicks fight fraud and deny deadbeats | Score: 0.9990

[Cosine + Word2Vec] Job: senior data scientist genai hybrid working
Category: other information communication technology
Rekomendasi Course:
   - empowering fearless data engineering with rust     | Score: 0.9991
   - data science job interview full mock interview     | Score: 0.9990
   - data science in production crossing the chasm      | Score: 0.9989
   - will gen ai replace data jobs a study on its impact | Score: 0.9989
   - pep

### Sample Text

In [30]:
# sample_text = "mathematics statistics information sciences science technology"

# # Tokenisasi ringan (tanpa NLTK)
# def basic_tokenize(text):
#     return text.lower().split()

# # Representasi Word2Vec rata-rata
# def sentence_vector(tokens, model):
#     vectors = [model.wv[word] for word in tokens if word in model.wv]
#     return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# def recommend_courses_from_text_cosine(user_text, df_courses, model, top_k=3):
#     tokens = basic_tokenize(user_text)
#     user_vec = sentence_vector(tokens, model).reshape(1, -1)

#     course_matrix = np.stack(df_courses["w2v_vec"].values)
#     similarities = cosine_similarity(user_vec, course_matrix).flatten()
#     top_indices = similarities.argsort()[-top_k:][::-1]

#     return [{
#         "course_title": df_courses.iloc[i]["title"],
#         "score": round(similarities[i], 4)
#     } for i in top_indices]

# def recommend_courses_from_text_wmd(user_text, wmd_index, top_k=3):
#     tokens = basic_tokenize(user_text)
#     sims = wmd_index[tokens][:top_k]

#     return [{
#         "course_title": df_courses.iloc[i]["title"],
#         "score": round(score, 4)
#     } for i, score in sims]

# # Sample user input
# sample_text = "mathematics statistics information sciences science technology"

# # COSINE RECOMMENDATION
# recommendations_cosine = recommend_courses_from_text_cosine(sample_text, df_courses, w2v_model, top_k=3)

# # WMD RECOMMENDATION (make sure wmd_index sudah dibuat)
# wmd_index = WmdSimilarity(df_courses["tokens"].tolist(), w2v_model.wv, num_best=3)
# recommendations_wmd = recommend_courses_from_text_wmd(sample_text, wmd_index, top_k=3)

# # Output: Cosine
# print("💼 [COSINE] User Query: Data Analyst")
# print("📚 Recommended Courses:")
# for rec in recommendations_cosine:
#     print(f"   - {rec['course_title']} | Score: {rec['score']}")

# print("\n")

# # Output: WMD
# print("💼 [WMD] User Query: Data Analyst")
# print("📚 Recommended Courses:")
# for rec in recommendations_wmd:
#     print(f"   - {rec['course_title']} | Score: {rec['score']}")


## Saving Vectorization & Similarity Results