# **00 Import Library**

In [51]:
pip install -U sentence-transformers

Collecting sentence-transformersNote: you may need to restart the kernel to use updated packages.

  Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
     -------------------------------------- 470.2/470.2 kB 1.4 MB/s eta 0:00:00
Collecting transformers<5.0.0,>=4.41.0
  Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
     ---------------------------------------- 10.8/10.8 MB 2.0 MB/s eta 0:00:00
Collecting huggingface-hub>=0.20.0
  Downloading huggingface_hub-0.33.4-py3-none-any.whl (515 kB)
     -------------------------------------- 515.3/515.3 kB 1.2 MB/s eta 0:00:00
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl (2.5 MB)
     ---------------------------------------- 2.5/2.5 MB 2.4 MB/s eta 0:00:00
Collecting safetensors>=0.4.3
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl (308 kB)
     -------------------------------------- 308.9/308.9 kB 1.1 MB/s eta 0:00:00
Installing collected packages: safetensors,

In [52]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text Vectorization
import gensim
from gensim.models import Word2Vec, FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.utils import simple_preprocess

# Modeling
from sklearn.metrics.pairwise import cosine_similarity
from gensim.similarities import WmdSimilarity
from sentence_transformers import SentenceTransformer




# **01 Load Dataset**

In [6]:
df_job = pd.read_csv("cleaned_jobstreet.csv")
df_courses = pd.read_csv("cleaned_classentral.csv")

In [7]:
df_job.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   url          480 non-null    object
 1   country      480 non-null    object
 2   title        480 non-null    object
 3   company      480 non-null    object
 4   location     480 non-null    object
 5   category     480 non-null    object
 6   work_type    480 non-null    object
 7   description  480 non-null    object
 8   text         480 non-null    object
 9   text_clean   480 non-null    object
 10  tokens       480 non-null    object
dtypes: object(11)
memory usage: 41.4+ KB


In [8]:
df_courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           10 non-null     object 
 1   provider        10 non-null     object 
 2   language        10 non-null     object 
 3   certificate     10 non-null     object 
 4   average rating  10 non-null     float64
 5   price type      10 non-null     object 
 6   reviews         10 non-null     object 
 7   overview        10 non-null     object 
 8   skills          10 non-null     object 
 9   text            10 non-null     object 
 10  text_clean      10 non-null     object 
 11  tokens          10 non-null     object 
dtypes: float64(1), object(11)
memory usage: 1.1+ KB


In [36]:
# Gabungkan semua token jadi satu list untuk Word2Vec
all_sentences = df_job["tokens"].tolist() + df_courses["tokens"].tolist()

# **02 Text Vectorization**

## Word2Vec

In [65]:
w2v_model = Word2Vec(sentences=all_sentences, vector_size=100, window=5, min_count=2, workers=4)

def sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df_job["w2v_vec"] = df_job["tokens"].apply(lambda x: sentence_vector(x, w2v_model))
df_courses["w2v_vec"] = df_courses["tokens"].apply(lambda x: sentence_vector(x, w2v_model))


## Sentence Tranformers

In [66]:
# model_st = SentenceTransformer('all-MiniLM-L6-v2')  # ringan & cepat

# # Encode kolom 'text_clean'
# df_job["st_vec"] = list(model_st.encode(df_job["text_clean"].tolist(), show_progress_bar=True))
# df_courses["st_vec"] = list(model_st.encode(df_courses["text_clean"].tolist(), show_progress_bar=True))

# **03 Labelling & Similarity Mapping**

## Cosine Similarity

In [67]:
# Hitung vektor rata-rata Word2Vec
def sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Buat vektor rata-rata
df_job["w2v_vec"] = df_job["tokens"].apply(lambda x: sentence_vector(x, w2v_model))
df_courses["w2v_vec"] = df_courses["tokens"].apply(lambda x: sentence_vector(x, w2v_model))

# Fungsi mapping job → course via Cosine
def map_job_to_courses_cosine(df_jobs, df_courses, top_k=3):
    course_matrix = np.stack(df_courses["w2v_vec"].values)
    results = []

    for idx, row in df_jobs.iterrows():
        job_vec = row["w2v_vec"].reshape(1, -1)
        sims = cosine_similarity(job_vec, course_matrix).flatten()
        top_indices = sims.argsort()[-top_k:][::-1]

        matches = [{
            "course_title": df_courses.iloc[i]["title"],
            "score": round(sims[i], 4)
        } for i in top_indices]

        results.append({
            "job_title": row["title"],
            "job_category": row["category"],
            "recommended_courses": matches
        })
    return results

job_course_sim_cosine = map_job_to_courses_cosine(df_job, df_courses, top_k=3)


## WMD Similarity

In [68]:
# Inisialisasi indeks WMD dari course
wmd_index = WmdSimilarity(df_courses["tokens"].tolist(), w2v_model.wv, num_best=3)

# Mapping job → course via WMD
def map_job_to_courses_wmd(df_jobs, df_courses, wmd_index, top_k=3):
    results = []

    for idx, row in df_jobs.iterrows():
        sims = wmd_index[row["tokens"]][:top_k]

        matches = [{
            "course_title": df_courses.iloc[i]["title"],
            "score": round(score, 4)
        } for i, score in sims]

        results.append({
            "job_title": row["title"],
            "job_category": row["category"],
            "recommended_courses": matches
        })
    return results

job_course_sim_wmd = map_job_to_courses_wmd(df_job, df_courses, wmd_index, top_k=3)

# 04 Recommendation Modeling

### b. Cosine + W2Vec

In [69]:
# ============================
# COSINE SIMILARITY MODELING
# ============================
def recommend_courses_cosine(df_jobs, df_courses, top_k=3):
    course_matrix = np.stack(df_courses["w2v_vec"].values)
    recommendations = []

    for idx, row in df_jobs.iterrows():
        job_vec = row["w2v_vec"].reshape(1, -1)
        sims = cosine_similarity(job_vec, course_matrix).flatten()
        top_indices = sims.argsort()[-top_k:][::-1]

        matches = [{
            "course_title": df_courses.iloc[i]["title"],
            "score": round(sims[i], 4)
        } for i in top_indices]

        recommendations.append({
            "job": row["title"],
            "category": row["category"],
            "matches": matches
        })

    return recommendations

### e. WMD + W2Vec

In [70]:
# ============================
# WMD SIMILARITY MODELING
# ============================
def recommend_courses_wmd(df_jobs, df_courses, wmd_index, top_k=3):
    recommendations = []

    for idx, row in df_jobs.iterrows():
        sims = wmd_index[row["tokens"]][:top_k]

        matches = [{
            "course_title": df_courses.iloc[i]["title"],
            "score": round(score, 4)
        } for i, score in sims]

        recommendations.append({
            "job": row["title"],
            "category": row["category"],
            "matches": matches
        })

    return recommendations

In [85]:
def print_recommendation_results(results, method="Cosine + W2V"):
    for res in results:
        print(f"[{method}] Job: {res['job']}")
        print(f"Category: {res['category']}")
        print("Rekomendasi Course:")
        for match in res['matches']:
            print(f"   - {match['course_title']:<50} | Score: {match['score']:.4f}")
        print()
        
# 1. Cosine
results_cosine = recommend_courses_cosine(df_job, df_courses, top_k=5)
print_recommendation_results(results_cosine[:2], method="Cosine + Word2Vec")

# 2. WMD
from gensim.similarities import WmdSimilarity
wmd_index = WmdSimilarity(df_courses["tokens"].tolist(), w2v_model.wv, num_best=3)

results_wmd = recommend_courses_wmd(df_job, df_courses, wmd_index, top_k=3)
print_recommendation_results(results_wmd[:2], method="WMD + Word2Vec")

[Cosine + Word2Vec] Job: data scientist, financial conglomerates supervision
Category: analysis & reporting (banking & financial services)
Rekomendasi Course:
   - cs50's computer science for lawyers                | Score: 0.9880
   - cs50's computer science for business professionals | Score: 0.9874
   - introduction to electrical engineering and computer science i | Score: 0.9833
   - cs50's introduction to computer science            | Score: 0.9829
   - introduction to computer science and programming using python | Score: 0.9765

[Cosine + Word2Vec] Job: data scientist
Category: mathematics, statistics & information sciences (science & technology)
Rekomendasi Course:
   - cs50's computer science for lawyers                | Score: 0.9913
   - cs50's introduction to computer science            | Score: 0.9894
   - harvard cs50 – full computer science university course | Score: 0.9851
   - cs50's computer science for business professionals | Score: 0.9851
   - introduction to elect

### Sample Text

In [84]:
sample_text = "data scientist skilled in Python and machine learning"

# Tokenisasi ringan (tanpa NLTK)
def basic_tokenize(text):
    return text.lower().split()

# Representasi Word2Vec rata-rata
def sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

def recommend_courses_from_text_cosine(user_text, df_courses, model, top_k=3):
    tokens = basic_tokenize(user_text)
    user_vec = sentence_vector(tokens, model).reshape(1, -1)

    course_matrix = np.stack(df_courses["w2v_vec"].values)
    similarities = cosine_similarity(user_vec, course_matrix).flatten()
    top_indices = similarities.argsort()[-top_k:][::-1]

    return [{
        "course_title": df_courses.iloc[i]["title"],
        "score": round(similarities[i], 4)
    } for i in top_indices]

def recommend_courses_from_text_wmd(user_text, wmd_index, top_k=3):
    tokens = basic_tokenize(user_text)
    sims = wmd_index[tokens][:top_k]

    return [{
        "course_title": df_courses.iloc[i]["title"],
        "score": round(score, 4)
    } for i, score in sims]

# Sample user input
sample_text = "data scientist skilled in Python and machine learning"

# COSINE RECOMMENDATION
recommendations_cosine = recommend_courses_from_text_cosine(sample_text, df_courses, w2v_model, top_k=3)

# WMD RECOMMENDATION (make sure wmd_index sudah dibuat)
wmd_index = WmdSimilarity(df_courses["tokens"].tolist(), w2v_model.wv, num_best=3)
recommendations_wmd = recommend_courses_from_text_wmd(sample_text, wmd_index, top_k=3)

# Output: Cosine
print("💼 [COSINE] User Query: Data Analyst")
print("📚 Recommended Courses:")
for rec in recommendations_cosine:
    print(f"   - {rec['course_title']} | Score: {rec['score']}")

print("\n")

# Output: WMD
print("💼 [WMD] User Query: Data Analyst")
print("📚 Recommended Courses:")
for rec in recommendations_wmd:
    print(f"   - {rec['course_title']} | Score: {rec['score']}")


💼 [COSINE] User Query: Data Analyst
📚 Recommended Courses:
   - cs50's computer science for lawyers | Score: 0.0
   - functional programming principles in scala | Score: 0.0
   - computational social science methods | Score: 0.0


💼 [WMD] User Query: Data Analyst
📚 Recommended Courses:


## Saving Vectorization & Similarity Results