# **00 Import & Install Library**

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from textblob import Word

# **01 Load Dataset**

In [50]:
# job dataset
# NOTE(review): absolute, machine-specific path — as written the notebook only
# runs on the author's machine; consider a configurable data directory.
url_job = r"C:\Users\hp\COURSE\SISTECH\PP_MachineLearningOperations_TalithaRahmadewatiW\FINAL PROJECT\dataset\jobstreet.csv"
df_job = pd.read_csv(url_job)

# course dataset
url_courses = r"C:\Users\hp\COURSE\SISTECH\PP_MachineLearningOperations_TalithaRahmadewatiW\FINAL PROJECT\dataset\courses_classentral.csv"
df_courses = pd.read_csv(url_courses)


# Load slang dictionary (column 'slang' -> column 'formal')
url_slang = "https://raw.githubusercontent.com/talitharhmd/jobstreet-scraper/main/slang.csv"
df_slang = pd.read_csv(url_slang)
# Drop incomplete rows: a NaN key or value in the mapping would later make
# " ".join(...) in replace_slang fail on a non-string replacement.
df_slang = df_slang.dropna(subset=['slang', 'formal'])
slang_dict = dict(zip(df_slang['slang'], df_slang['formal']))
additional_slang = {}  # extra slang -> formal pairs, if needed
slang_dict.update(additional_slang)

# Load stopword list (single column, no header row)
url_stopwords = "https://raw.githubusercontent.com/talitharhmd/jobstreet-scraper/main/stopword.csv"
stopword_manual = pd.read_csv(url_stopwords, header=None)
# dropna() + astype(str) keep NaN out of the set when the file has blank or
# non-text cells.
custom_stopwords = set(stopword_manual.iloc[:, 0].dropna().astype(str).str.lower())
custom_stopwords.update([])  # extra stopwords, if needed

# **02 Overview** 

In [51]:
## Data set information for Jobstreet
print("**"*30)
print(" "*15, "Information Jobstreet dataset")
print("**"*30)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line after the summary.
df_job.info()

************************************************************
                Information Jobsteet dataset
************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Title                 576 non-null    object
 1   Company               576 non-null    object
 2   Country               576 non-null    object
 3   Location              576 non-null    object
 4   Category              576 non-null    object
 5   Work Type             576 non-null    object
 6   Salary                576 non-null    object
 7   Requirements          576 non-null    object
 8   Description           576 non-null    object
 9   Cleaned Title         576 non-null    object
 10  Cleaned Company       576 non-null    object
 11  Cleaned Country       576 non-null    object
 12  Cleaned Location      5

In [52]:
## Data set information for Classentral
print("**"*30)
print(" "*15, "Information Classentral dataset")
print("**"*30)
print(df_courses.info())

************************************************************
                Information Classentral dataset
************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6919 entries, 0 to 6918
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Title                6919 non-null   object 
 1   Category             6919 non-null   object 
 2   Provider             6919 non-null   object 
 3   Language             6919 non-null   object 
 4   Certificate          6919 non-null   object 
 5   Average Rating       6919 non-null   float64
 6   Price Type           6919 non-null   object 
 7   Reviews              6919 non-null   object 
 8   Duration             6622 non-null   object 
 9   Overview             6919 non-null   object 
 10  Link                 6919 non-null   object 
 11  Cleaned Title        6882 non-null   object 
 12  Cleaned Category  

In [53]:
# Preview the first three rows of the course dataset.
df_courses.head(3)
# df_job.head(3)

Unnamed: 0,Title,Category,Provider,Language,Certificate,Average Rating,Price Type,Reviews,Duration,Overview,Link,Cleaned Title,Cleaned Category,Cleaned Provider,Cleaned Language,Cleaned Certificate,Cleaned Price Type,Cleaned Overview
0,Introduction to Data Science in Python,Data Science,Coursera,English,Certificate Available,2.4,Free,46 reviews,1 day 10 hours 52 minutes,"Learn Python fundamentals, data manipulation w...",https://www.classcentral.comhttps://www.classc...,introduct data scienc python,data scienc,coursera,english,certif avail,free,learn python fundament data manipul panda basi...
1,A Crash Course in Data Science,Data Science,Coursera,English,Certificate Available,3.6,Free,25 reviews,6 hours 4 minutes,"Rapid introduction to data science essentials,...",https://www.classcentral.comhttps://www.classc...,crash cours data scienc,data scienc,coursera,english,certif avail,free,rapid introduct data scienc essenti cover key ...
2,Python for Data Science,Data Science,edX,English,No Certificate,4.4,Free,48 reviews,"10 weeks, 8-10 hours a week","Learn to use powerful, open-source, Python too...",https://www.classcentral.comhttps://www.classc...,python data scienc,data scienc,edx,english,certif,free,learn use power opensourc python tool includ p...


# **03 Text Preprocessing**

In [54]:
import re
import numpy as np
from nltk.tokenize import word_tokenize
from textblob import Word
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords

# === SETUP ===
# English stopword list from NLTK, held as a set for fast membership tests.
stopwords_nltk = {*stopwords.words('english')}

# NOTE(review): the two assignments below rebind `custom_stopwords` and
# `slang_dict`, so the values loaded from stopword.csv / slang.csv in the
# "Load Dataset" cell are discarded from here on — confirm this is intended.
custom_stopwords = set()  # fill in as needed
slang_dict = dict(
    u="you",
    pls="please",
    thx="thanks",
    btw="by the way",
    # add more entries as needed
)

# === 1. Lowercase column names and all text values of a dataframe ===
def lowercase_columns(df):
    """Return a copy of ``df`` with lowercase column names and lowercase text.

    Every object/string column has its non-missing values converted to ``str``
    and lowercased. Missing values stay missing instead of becoming the literal
    string "nan". Other columns are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The lowercased copy.
    """
    # Work on a copy so the caller's frame (and any earlier display of it)
    # is not mutated behind its back.
    df = df.copy()
    df.columns = df.columns.str.lower()
    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        # Cover both classic object columns and pandas' dedicated string dtype.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            lowered = series.astype(str).str.lower()
            # Keep missing entries as they are; replace everything else.
            df[col] = series.where(series.isna(), lowered)
    return df

# === 2. Clean text ===
def clean_text(text):
    """Strip URLs, mentions, punctuation, digits and non-ASCII characters.

    Returns "" for non-string input, ``np.nan`` when nothing is left after
    cleaning, and the cleaned, whitespace-normalised string otherwise.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),  # URLs
        (r"[\n\r\t]+", " "),               # line breaks and tabs
        (r"@\w+", ""),                     # @mentions
        (r"[^\w\s]", ""),                  # punctuation
        (r"\d+", ""),                      # digits
        (r"[^\x00-\x7F]+", ""),            # non-ASCII characters
        (r"\s+", " "),                     # collapse whitespace runs
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    if cleaned:
        return cleaned
    return np.nan

# === 3. Translate (optional) ===
def translate_text(text):
    """Translate ``text`` to English with GoogleTranslator, best effort.

    The source language is auto-detected. If the input is not a string, or the
    translation call fails for any reason, the input is returned unchanged.
    """
    # Nothing to translate; skip the translator call altogether.
    if not isinstance(text, str):
        return text
    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception:
        # Deliberate best-effort fallback. Catch Exception rather than using a
        # bare except so KeyboardInterrupt / SystemExit still propagate.
        return text

# === 4. Replace slang ===
def replace_slang(text):
    """Swap every whitespace-separated token found in ``slang_dict`` for its mapped form.

    Non-string input yields "". Tokens without an entry are kept as they are,
    and the result is re-joined with single spaces.
    """
    if not isinstance(text, str):
        return ""
    replaced = []
    for token in text.split():
        replaced.append(slang_dict.get(token, token))
    return " ".join(replaced)

# === 5. Final pipeline ===
def _lemmatize_token(token):
    """Return the verb lemma of ``token`` if it differs from the token, else the noun lemma."""
    # Compute the verb lemma once instead of once for the test and once for the result.
    as_verb = Word(token).lemmatize("v")
    if as_verb != token:
        return as_verb
    return Word(token).lemmatize("n")


def preprocess_pipeline(text):
    """Clean, de-slang, tokenize, remove stopwords and lemmatize ``text``.

    Parameters
    ----------
    text : str
        Raw text. Any non-string input yields "".

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    # clean_text may return NaN for text that ends up empty; replace_slang
    # turns any non-string back into "".
    text = clean_text(text)
    text = replace_slang(text)
    tokens = word_tokenize(text)
    kept = [w for w in tokens if w not in stopwords_nltk and w not in custom_stopwords]
    return " ".join(_lemmatize_token(w) for w in kept)

# === Apply to the DataFrames ===
df_job = lowercase_columns(df_job)
df_courses = lowercase_columns(df_courses)

# Text columns that feed the model, per dataset (defined once, used for both
# the per-column cleaning and the combined text).
text_columns = [(df_job, ['title', 'category', 'description']),
                (df_courses, ['title', 'category', 'overview'])]

for df, cols in text_columns:
    # === Clean the individual columns ===
    for col in cols:
        # fillna("") first: astype(str) on its own would turn a missing value
        # into the literal token "nan", which would then end up in the text.
        df[col] = df[col].fillna("").astype(str).apply(clean_text).apply(replace_slang)

    # === Build the combined column and clean it for modeling ===
    df['text'] = df[cols].astype(str).apply(lambda x: ' '.join(x), axis=1)
    df['text_clean'] = df['text'].apply(preprocess_pipeline)

# === Tokenize for modeling ===
df_job["tokens"] = df_job["text_clean"].apply(word_tokenize)
df_courses["tokens"] = df_courses["text_clean"].apply(word_tokenize)


In [55]:
# Display the course frame, now including the text / text_clean / tokens columns.
df_courses

Unnamed: 0,title,category,provider,language,certificate,average rating,price type,reviews,duration,overview,...,cleaned title,cleaned category,cleaned provider,cleaned language,cleaned certificate,cleaned price type,cleaned overview,text,text_clean,tokens
0,introduction to data science in python,data science,coursera,english,certificate available,2.4,free,46 reviews,1 day 10 hours 52 minutes,learn python fundamentals data manipulation wi...,...,introduct data scienc python,data scienc,coursera,english,certif avail,free,learn python fundament data manipul panda basi...,introduction to data science in python data sc...,introduction data science python data science ...,"[introduction, data, science, python, data, sc..."
1,a crash course in data science,data science,coursera,english,certificate available,3.6,free,25 reviews,6 hours 4 minutes,rapid introduction to data science essentials ...,...,crash cours data scienc,data scienc,coursera,english,certif avail,free,rapid introduct data scienc essenti cover key ...,a crash course in data science data science ra...,crash course data science data science rapid i...,"[crash, course, data, science, data, science, ..."
2,python for data science,data science,edx,english,no certificate,4.4,free,48 reviews,"10 weeks, 8-10 hours a week",learn to use powerful opensource python tools ...,...,python data scienc,data scienc,edx,english,certif,free,learn use power opensourc python tool includ p...,python for data science data science learn to ...,python data science data science learn use pow...,"[python, data, science, data, science, learn, ..."
3,data science math skills,data science,coursera,english,certificate available,4.1,free,12 reviews,13 hours 20 minutes,master essential math concepts for data scienc...,...,data scienc math skill,data scienc,coursera,english,certif avail,free,master essenti math concept data scienc set th...,data science math skills data science master e...,data science math skill data science master es...,"[data, science, math, skill, data, science, ma..."
4,programming for data science with python,data science,udacity,english,certificate available,4.8,paid,28 reviews,2 months 2 weeks 4 days 8 hours 43 minutes,master sql python and version control for data...,...,program data scienc python,data scienc,udac,english,certif avail,paid,master sql python version control data analysi...,programming for data science with python data ...,program data science python data science maste...,"[program, data, science, python, data, science..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6914,mastering generative ai for software development,computer science,edx,english,certificate available,0.0,free,0 reviews,"3 weeks, 2-3 hours a week",build jobready ai skills for software developm...,...,master gener ai softwar develop,comput scienc,edx,english,certif avail,free,build jobreadi ai skill softwar develop autom ...,mastering generative ai for software developme...,master generative ai software development comp...,"[master, generative, ai, software, development..."
6915,computer science programming in java,computer science,study.com,english,certificate available,0.0,paid,0 reviews,12 hours 48 minutes,learn java programming fundamentals from data ...,...,comput scienc program java,comput scienc,studycom,english,certif avail,paid,learn java program fundament data type method ...,computer science programming in java computer ...,computer science program java computer science...,"[computer, science, program, java, computer, s..."
6916,mlops tools mlflow and hugging face,computer science,edx,english,certificate available,0.0,free,0 reviews,"4 weeks, 3-6 hours a week",enhance your mlops journey explore mlflow and ...,...,mlop tool mlflow hug face,comput scienc,edx,english,certif avail,free,enhanc mlop journey explor mlflow hug face str...,mlops tools mlflow and hugging face computer s...,mlops tool mlflow hug face computer science en...,"[mlops, tool, mlflow, hug, face, computer, sci..."
6917,working with deepseek in python,computer science,datacamp,english,certificate available,0.0,paid,0 reviews,3 hours,discover what all of the deepseek hype was rea...,...,work deepseek python,comput scienc,datacamp,english,certif avail,paid,discov deepseek hype realli build applic use d...,working with deepseek in python computer scien...,work deepseek python computer science discover...,"[work, deepseek, python, computer, science, di..."


## Pipeline

In [56]:
# === JOB DATA ===
# 'tokens' holds Python lists, so a row emptied by preprocessing contains []
# rather than NaN and dropna() would never remove it. Filter on the list length
# instead; .str.len() gives NaN for a missing value, which also fails "> 0".
df_job_filtered = df_job.loc[
    df_job['tokens'].str.len() > 0,
    ['title', 'category', 'tokens']
].copy()

# === COURSE DATA ===
# Same rule: keep only rows that still have at least one token.
df_courses_filtered = df_courses.loc[
    df_courses['tokens'].str.len() > 0,
    ['title', 'category', 'tokens']
].copy()

In [57]:
# Preview the reduced course frame (title, category, tokens).
df_courses_filtered.head()

Unnamed: 0,title,category,tokens
0,introduction to data science in python,data science,"[introduction, data, science, python, data, sc..."
1,a crash course in data science,data science,"[crash, course, data, science, data, science, ..."
2,python for data science,data science,"[python, data, science, data, science, learn, ..."
3,data science math skills,data science,"[data, science, math, skill, data, science, ma..."
4,programming for data science with python,data science,"[program, data, science, python, data, science..."


## Saving Cleaned Dataset 

In [58]:
# Write both reduced frames to CSV files (paths relative to the working
# directory), without the index column.
# NOTE(review): the 'tokens' column holds Python lists; CSV stores them as text,
# so a reader will presumably need to parse them back (e.g. ast.literal_eval).
df_job_filtered.to_csv("cleaned_jobstreet.csv", index=False)
df_courses_filtered.to_csv("cleaned_classentral.csv", index=False)