# Job Recommendations 

This notebook creates a model, to recommend job positions given a position requirements description . This is done only for IT jobs. 

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, auc, roc_curve, roc_auc_score
import re

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\INES\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\INES\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
data = pd.read_excel(r"C:\ikhsan\UNAIR\MBKM\Bangkit\1. Capstone\data website indeed Jakarta (data 30 May 2023).xlsx", "clean data")
print(data.columns)
# selecting only IT Jobs
df = data
# selecting 
cols = ['Kualifikasi', 'Posisi', 'Deskripsi Pekerjaan ', 'Tipe Pekerjaan']
df = df[cols]
df.head(5)

Index(['Posisi', 'Perusahaan ', 'Lokasi', 'Salary (minimum per bulan)',
       'Tipe Pekerjaan', 'Kualifikasi', 'Deskripsi Pekerjaan ', 'Link'],
      dtype='object')


Unnamed: 0,Kualifikasi,Posisi,Deskripsi Pekerjaan,Tipe Pekerjaan
0,Wanita\nUsia maksimal 30 tahun\nMemiliki penga...,Talent Host Streaming ( Freelance ),Melakukan penjualan produk secara live di Tik ...,Part-time
1,Technical requirements to perform the work\nAc...,Data Analyst,Work up to 20 hours per week.\nEarn a competit...,Part-time
2,Pria & Wanita \n18 - 25 tahun \nMin. pengalama...,Staff Operational,,Full-time
3,Pria atau Wanita\nUsia maksimal 26 tahun\nTing...,Server Tasty Kitchen (Area Jabodetabek),,Part-time
4,Usia Maksimal 30 tahun\nPendidikan Minimal Lul...,Admin Online (Cengkareng),,Full-time


# Modifying Job Titles
Selecting only top 21 job titles, to manage class imbalance

In [4]:
classes = df['Posisi'].value_counts()
keys = classes.keys().to_list()

df = df[df['Posisi'].isin(keys)]
df['Posisi'].value_counts()

Posisi
Telemarketing                  5
Staff Gudang                   5
Cook Helper                    4
Kasir                          3
Beautician                     2
                              ..
KYC Agent                      1
Talent / Model [Min LD 100]    1
Guru Geografi                  1
SPG / SPB Elektronik           1
Staff Logistik (SLO-03)        1
Name: count, Length: 287, dtype: int64

Change job titles to base title. For exmaple, chaning Senior Java Developer to Java Developer.   

In [5]:
posisiFix = []
for i in df['Posisi']:
    i = re.sub("\(.*?\)|\[.*?\]|-.*", "", i)
    # print(i)
    posisiFix.append(i)

# print(posisiFix)
posisiFix = pd.DataFrame(posisiFix)
# posisiFix.head(5)
df['Posisi'] = posisiFix[0].values
df['Posisi'].value_counts()

Posisi
Telemarketing                          5
Staff Gudang                           5
Cook Helper                            4
Kasir                                  3
Sales Advisor                          2
                                      ..
Housekeeping                           1
Customer Service Specialist            1
FREELANCE Live Streamer Marketplace    1
KYC Agent                              1
Staff Logistik                         1
Name: count, Length: 282, dtype: int64

# Building custom tokenizer to process text

In [6]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        # lemmatize text - convert to base form 
        self.wnl = WordNetLemmatizer()
        # creating stopwords list, to ignore lemmatizing stopwords 
        self.stopwords = stopwords.words('indonesian')
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.stopwords]

# removing new line characters, and certain hypen patterns                  
df['Kualifikasi'] = df['Kualifikasi'].apply(lambda x: x.replace('\n', ' ').replace('\r', '').replace('- ', ''). replace(' - ', ' to '))

# Featurizing Text

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
# train features and labels 
y = df['Posisi']
X = df['Kualifikasi']

# allPreferensi = []
# for i in range(0, 3):
#     preferensi = input("Masukkan preferensi: ")
#     allPreferensi.append(preferensi)
#     if i < 2:
#         answer = input("Mau tambah preferensi?")
#     if answer != ("Y" or "Yes" or "y" or "ya"):
#         break
# strPreferensi = ', '.join(allPreferensi)
# preferensi = [strPreferensi]
# X = pd.DataFrame(preferensi, columns = ["Preferensi"])

# tdif feature rep 
vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words=stopwords.words('indonesian'))
vectorizer.fit(X)

# transoforming text to tdif features
tfidf_matrix = vectorizer.transform(X)

# sparse matrix to dense matrix for training
X_tdif = tfidf_matrix.toarray()

# encoding text labels in categories 
enc = LabelEncoder() 
enc.fit(y.values)
y_enc=enc.transform(y.values)

X_train_words, X_test_words, y_train, y_test = train_test_split(X, y_enc, test_size=0.3, random_state=10)

X_train = vectorizer.transform(X_train_words)
X_train = X_train.toarray()

X_test = vectorizer.transform(X_test_words)
X_test = X_test.toarray()




# Training Naive Bayes
Looks pretty overfit

In [47]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
gnb = GaussianNB()
train_preds = gnb.fit(X_train, y_train).predict(X_train)
test_preds = gnb.predict(X_test)

print('Train acc: {0}'.format(accuracy_score(y_train, train_preds)))
print('Test acc: {0}'.format(accuracy_score(y_test, test_preds)))


Train acc: 0.944954128440367
Test acc: 0.02127659574468085


# Training Logistic Regression
By modifiying the maximum number of iterations, and regularization, C, the above experienced overfitting was reduced significantly 


In [48]:
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression(max_iter=100,verbose=1, C=60)

train_preds = logistic.fit(X_train, y_train).predict(X_train)
test_preds = logistic.predict(X_test)

print('Train acc: {0}'.format(accuracy_score(y_train, train_preds)))
print('Test acc: {0}'.format(accuracy_score(y_test, test_preds)))


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Train acc: 0.944954128440367
Test acc: 0.031914893617021274


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s finished


# Creating Job Recommendations 
Recommends 2 job position alternatives given a job requirement. By obtaining probability of class predictions, and picking the top N predictions, other than true label, N closest recommendations can be got

In [49]:
preds_data = {'Current Position Requirments': [], 'Current Position': [], 'Alternative 1': [], 'Alternative 2': []}
y_preds_proba = logistic.predict_proba(X_test)

counter = 0 
for idx, (pred_row, true_job_position) in enumerate(zip(y_preds_proba, y_test)):
    class_preds = np.argsort(pred_row)
    # delete true class
    for i in [-1, -2]:
        if class_preds[i] == true_job_position:
            class_preds=np.delete(class_preds,i)
    # getting other 2 highest job predictions         
    top_classes = class_preds[-2:]
    # obtaining class name string from int label 
    class_names = enc.inverse_transform(top_classes)
    true_job_position_name = enc.inverse_transform([true_job_position])
    # saving to dict
    preds_data['Current Position Requirments'].append(X_test_words.iloc[idx])
    preds_data['Current Position'].append(true_job_position_name[0])
    preds_data['Alternative 1'].append(class_names[1])
    preds_data['Alternative 2'].append(class_names[0])

    
    counter +=1

In [50]:
preds_df = pd.DataFrame.from_dict(preds_data)
preds_df.to_csv('Recommendations.csv', index=False)
preds_df


Unnamed: 0,Current Position Requirments,Current Position,Alternative 1,Alternative 2
0,"Pria, usia maks. 35 tahun Pendidikan Min. SMA/...",Merchandiser Staff,SALES AGENT / SPG / SPB,Posisi kasir untuk Jakarta
1,"Pendidikan SMK/SMA, D3, S1 Penempatan Sebakis ...",Admin Produksi,Desk Collection Recovery,Sales
2,Pria / Wanita Minimal Pendidikan SMA/SMK/Seder...,Sales Kartu Kredit Perbankan,Field Activator Reguler,SALES AGENT / SPG / SPB
3,Pendidikan SMA/SLTA Pengalaman minimal 1 tahu...,Anggota Security,Barista,Posisi kasir untuk Jakarta
4,"Wanita, minimal lulusan SMA/K Boleh Fresh Grad...",Customer Service,Admin Sosial media,SALES AGENT / SPG / SPB
...,...,...,...,...
89,Pria / Wanita Pendidikan minimal SMA / Sederaj...,Brand Content Creator,CSR,Ambassador SOKO FInancial Club
90,SMA/SMU/SMK (Diwajibkan) Pengalaman: Asisten A...,Asisten Apoteker,Marketing Sales,Ambassador SOKO FInancial Club
91,Experience: Sales: 1 year (Preferred),Dropshipper Elektronik,Beautician,COOK
92,Memiliki pengalaman minimal 1 tahun sebagai dr...,Driver,Marketing Sales,Host Livestream Specialist
