In [1]:
import numpy as np
import pandas as pd

In [2]:
file_url = 'https://raw.githubusercontent.com/wongwara/Jobseeker_Baymax/d9d7fc4753ca407eba5423da6e1101e042b216a8/dataset/final_cleaned.csv'

In [3]:
df = pd.read_csv(file_url)

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638 entries, 0 to 637
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             638 non-null    int64  
 1   jobClassification      638 non-null    int64  
 2   state                  638 non-null    int64  
 3   teaser                 540 non-null    object 
 4   workType               638 non-null    int64  
 5   min_salary             638 non-null    int64  
 6   max_salary             638 non-null    int64  
 7   isRightToWorkRequired  638 non-null    int64  
 8   desktopAdTemplate      537 non-null    object 
 9   Python                 638 non-null    int64  
 10  SQL                    638 non-null    int64  
 11  R                      638 non-null    int64  
 12  Tableau                638 non-null    int64  
 13  SAS                    638 non-null    int64  
 14  Matlab                 638 non-null    int64  
 15  Hadoop

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,jobClassification,state,teaser,workType,min_salary,max_salary,isRightToWorkRequired,desktopAdTemplate,Python,...,SAS,Matlab,Hadoop,Spark,Java,Scala,recruiter,state_encoded,salary_section,salary_section_enc
0,0,0,1,Fantastic organisation seeks experienced Insig...,2,90000,120000,0,\n \n \n \n Insights Analyst – Onl...,0,...,0,0,0,0,0,0,1,1,"(100000.0, 110000.0]",0.0
1,1,1,1,This role requires an individual with strong c...,2,90000,110000,0,\n \n \n \n Credit Risk Analyst \n...,0,...,1,0,0,0,0,0,1,1,"(90000.0, 100000.0]",1.0
2,2,1,1,One of Australia's leading financial service p...,2,110000,120000,1,\n \n \n Data Analytics Recruitment Sol...,1,...,1,0,0,0,0,0,1,1,"(110000.0, 120000.0]",2.0
3,3,2,0,Postdoctoral researcher in molecular evolution...,2,71509,90215,0,\n \n \n \n Postdoctoral Fellow \n...,0,...,0,0,0,0,1,0,0,0,"(80000.0, 90000.0]",3.0
4,4,3,0,Postdoctoral researcher in molecular evolution...,2,71509,90215,0,\n \n \n \n Postdoctoral Fellow \n...,0,...,0,0,0,0,1,0,0,0,"(80000.0, 90000.0]",3.0


### Clean text

In [7]:
df['teaser']= df['teaser'].fillna('')

In [8]:
df['desktopAdTemplate']= df['desktopAdTemplate'].fillna('')

In [9]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


# Clean the text data
df['teaser'] = df['teaser'].str.replace('[^\w\s]', '') # Remove punctuation
df['desktopAdTemplate'] = df['desktopAdTemplate'].str.replace('[^\w\s]', '') # Remove punctuation
df['teaser'] = df['teaser'].str.replace('\d+', '') # Remove digits
df['desktopAdTemplate'] = df['desktopAdTemplate'].str.replace('\d+', '') # Remove digits

# Normalize the text data
stop_words = set(stopwords.words('english'))
df['teaser'] = df['teaser'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in stop_words]))
df['desktopAdTemplate'] = df['desktopAdTemplate'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in stop_words]))

# Tokenize the text data
df['teaser'] = df['teaser'].apply(lambda x: word_tokenize(x))
df['desktopAdTemplate'] = df['desktopAdTemplate'].apply(lambda x: word_tokenize(x))

# Apply stemming
stemmer = PorterStemmer()
df['teaser'] = df['teaser'].apply(lambda x: [stemmer.stem(word) for word in x])
df['desktopAdTemplate'] = df['desktopAdTemplate'].apply(lambda x: [stemmer.stem(word) for word in x])

# # Create TF-IDF vectors
vectorizer = TfidfVectorizer()
teaser_tfidf = vectorizer.fit_transform(df['teaser'].apply(lambda x: ' '.join(x)))
desktopAdTemplate_tfidf = vectorizer.fit_transform(df['desktopAdTemplate'].apply(lambda x: ' '.join(x)))

# # Concatenate the TF-IDF vectors with the original dataframe
df = pd.concat([df.drop(['teaser', 'desktopAdTemplate'], axis=1), pd.DataFrame(teaser_tfidf.toarray()), pd.DataFrame(desktopAdTemplate_tfidf.toarray())], axis=1)

# Display the resulting dataframe
print(df.head())


   Unnamed: 0  jobClassification  state  workType  min_salary  max_salary  \
0           0                  0      1         2       90000      120000   
1           1                  1      1         2       90000      110000   
2           2                  1      1         2      110000      120000   
3           3                  2      0         2       71509       90215   
4           4                  3      0         2       71509       90215   

   isRightToWorkRequired  Python  SQL  R  ...  6803  6804  6805  6806  6807  \
0                      0       0    1  1  ...   0.0   0.0   0.0   0.0   0.0   
1                      0       0    1  1  ...   0.0   0.0   0.0   0.0   0.0   
2                      1       1    1  1  ...   0.0   0.0   0.0   0.0   0.0   
3                      0       0    0  0  ...   0.0   0.0   0.0   0.0   0.0   
4                      0       0    0  0  ...   0.0   0.0   0.0   0.0   0.0   

   6808  6809  6810  6811 6812  
0   0.0   0.0   0.0   0.0  0.

In [10]:
df.shape

(638, 7766)

In [11]:
df.columns

Index([           'Unnamed: 0',     'jobClassification',
                       'state',              'workType',
                  'min_salary',            'max_salary',
       'isRightToWorkRequired',                'Python',
                         'SQL',                     'R',
       ...
                          6803,                    6804,
                          6805,                    6806,
                          6807,                    6808,
                          6809,                    6810,
                          6811,                    6812],
      dtype='object', length=7766)

In [12]:
df = df.drop(['Unnamed: 0','state_encoded','min_salary','max_salary','salary_section'],axis=1)

In [13]:
df.columns

Index([    'jobClassification',                 'state',
                    'workType', 'isRightToWorkRequired',
                      'Python',                   'SQL',
                           'R',               'Tableau',
                         'SAS',                'Matlab',
       ...
                          6803,                    6804,
                          6805,                    6806,
                          6807,                    6808,
                          6809,                    6810,
                          6811,                    6812],
      dtype='object', length=7761)

In [14]:
df.shape

(638, 7761)

### Read csv file to the same split for other experiment

In [15]:
file_url_x_train = 'https://raw.githubusercontent.com/wongwara/Jobseeker_Baymax/main/Final/data%20splitted/X_train.csv'
file_url_x_test = 'https://raw.githubusercontent.com/wongwara/Jobseeker_Baymax/main/Final/data%20splitted/X_test.csv'
file_url_y_train = 'https://raw.githubusercontent.com/wongwara/Jobseeker_Baymax/main/Final/data%20splitted/y_train.csv'
file_url_y_test = 'https://raw.githubusercontent.com/wongwara/Jobseeker_Baymax/main/Final/data%20splitted/y_test.csv'

In [16]:
x_train = pd.read_csv(file_url_x_train)
x_test = pd.read_csv(file_url_x_test)
y_train = pd.read_csv(file_url_y_train)
y_test = pd.read_csv(file_url_y_test)

In [17]:
y_train = y_train.drop('Unnamed: 0', axis = 1)
y_test = y_test.drop('Unnamed: 0', axis = 1)

In [18]:
print(x_train.shape,x_test.shape, y_train.shape, y_test.shape)

(510, 7761) (128, 7761) (510, 1) (128, 1)


### Train model using Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)

LogisticRegression()

Accuracy score

In [20]:
from sklearn.metrics import accuracy_score
train_pred = log_reg.predict(x_train)
train_acc_score = accuracy_score(y_train, train_pred)
print('Training Accuracy Score :', train_acc_score)

Training Accuracy Score : 0.43333333333333335


In [21]:
test_pred = log_reg.predict(x_test)
test_acc_score = accuracy_score(y_test, test_pred)
print('Testing Accuracy Score :', test_acc_score)

Testing Accuracy Score : 0.375


F1 score

In [22]:
from sklearn.metrics import f1_score
train_f1_score = f1_score(y_train, train_pred, average='macro')
print('Training f1 Score :', train_f1_score)

Training f1 Score : 0.37464600890200045


In [23]:
test_f1_score = f1_score(y_test, test_pred, average='macro')
print('Testing f1 Score :', test_f1_score)

Testing f1 Score : 0.36236322198230747


### GridSearch

In [24]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.1, 1, 10],
              'penalty': ['l1', 'l2'],
              'max_iter': [100, 150, 200]
             }
grid_search_log = GridSearchCV(log_reg, param_grid = param_grid, scoring = 'f1_macro')
grid_search_log.fit(x_train, y_train)

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.1, 1, 10], 'max_iter': [100, 150, 200],
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [26]:
best_params_grid = grid_search_log.best_params_
print(best_params_grid)
print('Best Hyperparameters :', grid_search_log.best_params_)

{'C': 10, 'max_iter': 200, 'penalty': 'l2'}
Best Hyperparameters : {'C': 10, 'max_iter': 200, 'penalty': 'l2'}


In [28]:
best_model_grid = LogisticRegression(**best_params_grid)
best_model_grid.fit(x_train, y_train)

LogisticRegression(C=10, max_iter=200)

In [29]:
y_train_pred_grid = best_model_grid.predict(x_train)
train_acc_score_grid = accuracy_score(y_train, y_train_pred_grid)
print('Training Hyperparameter tuning - Grid Search', train_acc_score_grid)

Training Hyperparameter tuning - Grid Search 0.5078431372549019


In [30]:
y_test_pred_grid = best_model_grid.predict(x_test)
test_acc_score_grid = accuracy_score(y_test, y_test_pred_grid)
print('Testing Hyperparameter tuning - Grid Search', test_acc_score_grid)

Testing Hyperparameter tuning - Grid Search 0.4140625


In [31]:
train_f1_score_grid= f1_score(y_train, y_train_pred_grid, average='macro')
print('Training Hyperparameter tuning - Grid Search', train_f1_score_grid)

Training Hyperparameter tuning - Grid Search 0.45933912381324193


In [32]:
test_f1_score_grid = f1_score(y_test, y_test_pred_grid, average = 'macro')
print('Testing Hyperparameter tuning - Grid Search', test_f1_score_grid)

Testing Hyperparameter tuning - Grid Search 0.3865130831500084
