# 00 Import Library

In [5]:
import nltk
# Fetch the NLTK resources one by one, in the same order as before.
for nltk_package in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'wordnet',
    'omw-1.4',
):
    nltk.download(nltk_package)
# Kept as the cell's last expression so its return status is still displayed.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloadi

True

In [6]:
import pandas as pd
import numpy as np
import re
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
from deep_translator import GoogleTranslator

# Stopword Removal & Stemming (Bahasa Indonesia)
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import stanza
from wordcloud import WordCloud

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Word Embedding
import gensim
from gensim.models import Word2Vec, FastText
from gensim.utils import simple_preprocess

# Modeling
from scipy import stats
from numpy import dot
from numpy.linalg import norm
from ast import literal_eval
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity

# 01 Load Dataset

In [7]:
# The CSV stores every token list as its string repr (e.g. "['role', 'purpose', ...]").
# Parse it back into a real Python list while loading; otherwise any code that iterates
# over df["tokens"] walks the string character by character instead of token by token.
# Empty cells become an empty token list rather than raising inside literal_eval.
df = pd.read_csv(
    'cleaned_jobstreet.csv',
    converters={'tokens': lambda cell: literal_eval(cell) if cell else []},
)
df

Unnamed: 0,title,tokens
0,"data scientist, financial conglomerates superv...","['role', 'purpose', 'execute', 'suptech', 'dat..."
1,data scientist,"['job', 'description', 'responsibility', 'data..."
2,data annotator,"['job', 'description', 'key', 'responsibility'..."
3,data scientist (artificial intelligence),"['join', 'dcap', 'dynamic', 'fastgrowing', 'te..."
4,data scientist,"['key', 'responsibility', 'data', 'exploration..."
...,...,...
475,data scientist,"['job', 'description', 'work', 'closely', 'int..."
476,data scientist - pricing,"['people', 'job', 'description', 'data', 'scie..."
477,data analyst,"['tugas', 'tanggung', 'jawab', 'mengumpulkan',..."
478,data analyst – pricing staff,"['key', 'responsibility', 'collect', 'process'..."


# 02 Text Preprocessing

In [None]:
# --- 1. Lowercase Function (Multi-column) ---
def lowercase_columns(df, cols):
    """Lower-case the given columns of ``df`` in place.

    Each listed column is first cast to ``str`` and then lower-cased, so
    non-string values end up as their lower-cased string form.

    Args:
        df: DataFrame to modify (it is mutated, not copied).
        cols: Iterable of column names to convert.

    Returns:
        The same ``df`` object, to allow call chaining.
    """
    for column_name in cols:
        as_text = df[column_name].astype(str)
        df[column_name] = as_text.str.lower()
    return df

# --- 2. Clean Text Function ---
def clean_text(text):
    """Strip URLs, mentions, punctuation, digits and non-ASCII characters from ``text``.

    The substitutions run in a fixed order:
      1. drop URLs (anything starting with ``http`` or ``www``),
      2. turn newlines / carriage returns / tabs into a space,
      3. drop ``@mention`` tokens,
      4. drop every character that is neither a word character nor whitespace,
      5. drop digits,
      6. drop non-ASCII characters,
      7. collapse whitespace runs into one space and trim both ends.

    Args:
        text: Raw text. Non-string values (e.g. NaN from an empty cell) are
            treated as missing instead of raising ``TypeError``.

    Returns:
        The cleaned string, or ``np.nan`` if the input is not a string or
        nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return np.nan

    substitutions = (
        # "https..." already matches "http\S+", so no separate alternative is needed.
        (r"http\S+|www\S+", ""),
        (r"[\n\r\t]+", " "),
        (r"@\w+", ""),
        # Removes all punctuation, so no later pass for ",.!?" is required.
        (r"[^\w\s]", ""),
        (r"\d+", ""),
        (r"[^\x00-\x7F]+", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    text = text.strip()
    return text if text else np.nan

# --- 3. Translate Indonesian Texts to English ---
def translate_text(text):
    """Translate ``text`` to English with ``GoogleTranslator`` (source auto-detected).

    This is best-effort: if the translation call fails for any reason the
    original value is returned unchanged.

    Args:
        text: Text to translate.

    Returns:
        The translated text, or ``text`` itself when it is not a string, is
        blank, or the translation raised an error.
    """
    # Nothing to translate: skip the remote call for blank / non-string values.
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still stop a long-running translation loop.
        return text  # fallback

# --- 4. Replace Slang ---
# Slang lookup table: slang.csv provides a 'slang' column and its 'formal' replacement.
df_slang = pd.read_csv("slang.csv")
additional_slang = {}  # Add extra slang -> formal pairs here if needed
# Entries from additional_slang override those loaded from the CSV.
slang_dict = {
    **dict(zip(df_slang['slang'], df_slang['formal'])),
    **additional_slang,
}

def replace_slang(text):
    """Replace every whitespace-separated word found in ``slang_dict``.

    Args:
        text: Input text; anything that is not a ``str`` yields ``""``.

    Returns:
        The words of ``text`` joined by single spaces, with each word swapped
        for its ``slang_dict`` entry when one exists.
    """
    if not isinstance(text, str):
        return ""
    normalized = (slang_dict.get(token, token) for token in text.split())
    return " ".join(normalized)

# --- 5. Tokenizing ---
def tokenizing_text(text):
    """Tokenize ``text`` by delegating to NLTK's ``word_tokenize``.

    Args:
        text: String to split into tokens.

    Returns:
        Whatever ``word_tokenize`` returns for ``text``.
    """
    token_list = word_tokenize(text)
    return token_list

# --- 6. Remove Stopwords ---
# Manual stopword list: first column of stopword.csv (read without a header row), lower-cased.
stopword_manual = pd.read_csv("stopword.csv", header=None)
first_column_lower = stopword_manual.iloc[:, 0].str.lower()
custom_stopwords = set(first_column_lower)
custom_stopwords.update([])  # Add any extra manual stopwords here
# Additional stopword resources: NLTK's Indonesian list and the Sastrawi factory.
stopwords_nltk = set(stopwords.words('indonesian'))
factory_stopword = StopWordRemoverFactory()

def remove_manual_stopwords(tokens):
    """Drop tokens whose lower-cased form is in ``custom_stopwords``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the remaining tokens, original casing and order kept.
    """
    kept = []
    for token in tokens:
        if token.lower() in custom_stopwords:
            continue
        kept.append(token)
    return kept

# --- 7. Lemmatization ---
def lemmatize_flex(word):
    """Lemmatize ``word`` as a verb first, falling back to a noun lemma.

    The verb lemma is returned when it differs from ``word``; otherwise the
    noun lemma is returned (which may still equal ``word``).

    Args:
        word: Single token to lemmatize.

    Returns:
        The lemma as a string.
    """
    # Uses NLTK's WordNet lemmatizer directly: nltk is already a dependency of this
    # notebook and the 'wordnet' corpus is downloaded in the first cell, whereas the
    # previously used `Word` helper is not imported by the notebook's import cell.
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    lemma_v = lemmatizer.lemmatize(word, "v")
    return lemma_v if lemma_v != word else lemmatizer.lemmatize(word, "n")

# 03 Text Vectorization

## a. Word2Vec

In [None]:
# Training corpus: the token lists of df followed by those of df_kunci.
token_columns = [df["tokens"], df_kunci["tokens"]]
all_sentences = pd.concat(token_columns, ignore_index=True).tolist()

# Word2Vec (CBOW default)
w2v_model = Word2Vec(
    sentences=all_sentences,
    vector_size=100,
    window=7,
    min_count=1,
    workers=4,
)

def sentence_vector(tokens, model):
    """Average the embeddings of the tokens that ``model.wv`` contains.

    Args:
        tokens: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup) and
            ``vector_size``.

    Returns:
        The element-wise mean of the found vectors, or a zero vector of
        length ``model.vector_size`` when no token is found.
    """
    known = [token for token in tokens if token in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean([model.wv[token] for token in known], axis=0)

# Compute the sentence vector of every row with the Word2Vec model
for frame in (df, df_kunci):
    frame["w2v_vec"] = frame["tokens"].apply(lambda toks: sentence_vector(toks, w2v_model))


In [None]:
# NOTE(review): df2 is not defined in the visible part of this notebook (the cells
# above work on df) — confirm where it comes from before running on a fresh kernel.
tokens = df2.loc[0, "tokens"]

# Word2Vec vector matrix for the tokens (one row per token known to the model)
known_tokens_w2v = [word for word in tokens if word in w2v_model.wv]
matrix_w2v = [w2v_model.wv[word] for word in known_tokens_w2v]
matrix_w2v_df = pd.DataFrame(matrix_w2v, index=known_tokens_w2v)

print("=== Matriks Vektor Word2Vec (baris per kata) ===")
print(matrix_w2v_df.head())

## b. FastText

In [None]:
# Train FastText on the same corpus and with the same hyper-parameters as Word2Vec.
# (The statement was previously pasted twice on one line, which is a SyntaxError.)
ft_model = FastText(sentences=all_sentences, vector_size=100, window=7, min_count=1, workers=4)

In [None]:
def sentence_vector(tokens, model):
    """Return the mean embedding of the tokens present in ``model.wv``.

    This re-defines the helper of the same name from the Word2Vec section
    with the same behavior.

    Args:
        tokens: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup) and
            ``vector_size``.

    Returns:
        The element-wise mean of the found vectors, or a zero vector of
        length ``model.vector_size`` when none of the tokens is found.
    """
    embeddings = []
    for token in tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

# NOTE(review): df2 / df2_kunci are not defined in the visible part of this notebook,
# while ft_model is trained on tokens from df / df_kunci — confirm the intended frames.
for frame in (df2, df2_kunci):
    frame["ft_vec"] = frame["tokens"].apply(lambda toks: sentence_vector(toks, ft_model))

# FastText vector matrix for `tokens` (one row per token the model reports as present)
known_tokens_ft = [word for word in tokens if word in ft_model.wv]
matrix_ft = [ft_model.wv[word] for word in known_tokens_ft]
matrix_ft_df = pd.DataFrame(matrix_ft, index=known_tokens_ft)
print("=== Matriks Vektor FastText (baris per kata) ===")
print(matrix_ft_df.head())

# 04 Labelling & Similarity Mapping

# 05 Recommendation Modeling