# 00 Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from textblob import Word


# 01 Load Dataset

In [None]:
# Job-posting dataset (CSV fetched over HTTP).
url_data = "https://raw.githubusercontent.com/talitharhmd/jobstreet-scraper/main/jobstreet_data.csv"
df = pd.read_csv(url_data)

# Slang dictionary: maps the 'slang' column to the 'formal' column.
# Entries placed in `additional_slang` are applied last, so they override
# the ones coming from the CSV.
url_slang = "https://raw.githubusercontent.com/talitharhmd/jobstreet-scraper/main/slang.csv"
df_slang = pd.read_csv(url_slang)
additional_slang = {}
slang_dict = {
    **dict(zip(df_slang['slang'], df_slang['formal'])),
    **additional_slang,
}

# Manual stop-word list: the file is read without a header row and the
# lowercased values of its first column form the set.
url_stopwords = "https://raw.githubusercontent.com/talitharhmd/jobstreet-scraper/main/stopword.csv"
stopword_manual = pd.read_csv(url_stopwords, header=None)
first_column = stopword_manual.iloc[:, 0]
custom_stopwords = set(first_column.str.lower())
custom_stopwords.update([])  # extra stop words can be listed here

# 03 Preprocessing

In [3]:
# 1. lowercase
def lowercase_columns(df, cols):
    """Lowercase the given columns of *df* in place.

    Each column named in *cols* is cast to string and then lowercased; the
    result overwrites the column.

    Args:
        df: DataFrame to modify.
        cols: Iterable of column names to lowercase.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in cols:
        as_text = df[name].astype(str)
        df[name] = as_text.str.lower()
    return df

# 2. clean text
def clean_text(text):
    """Strip URLs, mentions, punctuation, digits and non-ASCII characters.

    Steps, in order: remove URLs (``http...`` / ``www...``), turn runs of
    newlines, carriage returns and tabs into a space, remove ``@mentions``,
    remove every character that is neither a word character nor whitespace,
    remove digits, remove non-ASCII characters, then collapse whitespace
    runs and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, or ``np.nan`` when *text* is not a string or
        when nothing is left after cleaning.
    """
    # Missing values (NaN / None) would make re.sub raise TypeError; report
    # them as missing instead of crashing.
    if not isinstance(text, str):
        return np.nan

    # `http\S+` already covers https URLs, so no separate branch is needed.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[\n\r\t]+", " ", text)
    text = re.sub(r"@\w+", "", text)
    # NOTE: punctuation is deleted, not replaced by a space, so
    # "data-driven" becomes "datadriven". Underscores survive because they
    # count as word characters.
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    # `\w` matches Unicode letters, so accented characters are only removed here.
    text = re.sub(r"[^\x00-\x7F]+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text if text else np.nan

# 3. translate to english
def translate_text(text):
    """Translate *text* to English with GoogleTranslator, best effort.

    Args:
        text: Text to translate; the source language is auto-detected.

    Returns:
        The translation, or *text* unchanged when it is not a non-blank
        string or when the translation call raises.
    """
    # Nothing to translate: skip the translator call entirely.
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception:
        # Deliberate best-effort fallback: keep the original text when the
        # translation fails. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate, so a long run can
        # still be interrupted.
        return text

# 4. replace slang
def replace_slang(text):
    """Swap each whitespace-separated word of *text* for its ``slang_dict`` entry.

    Words without an entry are kept as they are, and the words are joined
    back with single spaces. Non-string input yields an empty string.
    """
    if isinstance(text, str):
        return " ".join(slang_dict.get(token, token) for token in text.split())
    return ""

# 5. tokenize
def tokenizing_text(text):
    """Split *text* into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# 6. remove stopword
# Sastrawi stop-word remover factory.
# NOTE(review): not referenced again in the code visible here — confirm
# whether it is still needed.
factory_stopword = StopWordRemoverFactory()
# NLTK's English stop-word list, as a set.
# NOTE(review): also unused in the visible code; remove_manual_stopwords
# filters against custom_stopwords only — confirm this is intended.
stopwords_nltk = set(stopwords.words('english'))

def remove_manual_stopwords(tokens):
    """Return the tokens whose lowercase form is not in ``custom_stopwords``.

    The comparison is done on the lowercased token, but surviving tokens
    are returned with their original casing and order.
    """
    kept = []
    for token in tokens:
        if token.lower() in custom_stopwords:
            continue
        kept.append(token)
    return kept

# 7. lemmatization
def lemmatize_flex(word):
    """Lemmatize *word* as a verb, falling back to the noun lemma.

    The verb lemma is returned when it differs from *word*; otherwise the
    noun lemma is returned.
    """
    as_verb = Word(word).lemmatize("v")
    if as_verb == word:
        return Word(word).lemmatize("n")
    return as_verb


# 04 Pipeline

In [4]:
columns_to_clean = ['title', 'category', 'work_type', 'description']

# Lowercase, then clean, every text column. Rows where any of these columns
# is missing after cleaning are dropped.
df = lowercase_columns(df, columns_to_clean)
for col in columns_to_clean:
    df[col] = df[col].apply(clean_text)
df.dropna(subset=list(columns_to_clean), inplace=True)

# Translate the descriptions of the rows whose country is 'indonesia', then
# write the translated rows back into df.
# NOTE(review): 'country' is not one of the lowercased columns, so this
# comparison is case-sensitive against the raw value — confirm the data
# really stores it as lowercase 'indonesia'.
# NOTE(review): translation runs after cleaning and its output is not
# cleaned or lowercased again — confirm that is intended.
df_indo = df[df['country'] == 'indonesia'].copy()
df_indo['description'] = df_indo['description'].apply(translate_text)
df.update(df_indo)

# Each step is stored in its own column so intermediate results stay inspectable.
df['after_slang'] = df['description'].apply(replace_slang)
df["tokenizing"] = df["after_slang"].apply(tokenizing_text)
df["stopword_removed"] = df["tokenizing"].apply(remove_manual_stopwords)
df["lemmatization"] = df["stopword_removed"].apply(
    lambda toks: list(map(lemmatize_flex, toks))
)


## Saving Cleaned Dataset 

In [5]:
# Selection of every intermediate column; the result is not assigned.
df[["title", "description", "after_slang", "tokenizing", "stopword_removed", "lemmatization"]]

# Export only the title, category and final token list, without the index.
subset_df = df.loc[:, ["title", "category", "lemmatization"]]
subset_df.to_csv("cleaned_jobstreet.csv", index=False)

# Preview the first three exported rows.
subset_df.head(3)

Unnamed: 0,title,category,lemmatization
0,data scientist financial conglomerates supervi...,analysis reporting banking financial services,"[role, purpose, execute, suptech, data, analyt..."
1,data scientist,mathematics statistics information sciences sc...,"[job, description, responsibility, data, scien..."
2,data annotator,database development administration informatio...,"[job, description, key, responsibility, accura..."
