# PART III: CLASSIFICATION (pre-trained word2vec)

In [28]:
import gensim.downloader as api
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import plotly.figure_factory as ff
from joblib import dump
from joblib import load

In [29]:
# Load the pre-trained 'word2vec-google-news-300' embeddings through gensim's
# downloader API; `model` is used later to look up word vectors.
model = api.load('word2vec-google-news-300')

In [30]:
# Fetch the NLTK resources used below: 'punkt' for word_tokenize and
# 'stopwords' for the English stop-word list.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when tokenizing — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\temulenbd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\temulenbd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# English stop words held in a set; preprocess_text tests membership against it.
stop_words = set(stopwords.words('english'))

In [32]:
def preprocess_text(text):
    """Turn raw text into a list of cleaned tokens.

    The text is split with NLTK's ``word_tokenize`` and every token is
    lower-cased. A token is kept only when it is purely alphabetic (which
    drops punctuation and numbers) and is not in ``stop_words``.

    Returns the surviving tokens in their original order.
    """
    lowered = (token.lower() for token in word_tokenize(text))
    return [
        token
        for token in lowered
        if token.isalpha() and token not in stop_words
    ]

In [33]:
# Load the finalized job-ads dataset and tidy it in one chained expression.
df = (
    pd.read_csv('data_jobads_final.csv', index_col=None)
    # Flatten line breaks inside the job descriptions.
    .assign(job_description=lambda frame: frame['job_description'].str.replace('\n', ' '))
    # Discard rows with any missing value, then keep only the last two columns.
    .dropna()
    .iloc[:, -2:]
    .reset_index(drop=True)
)

df.head(2)

Unnamed: 0,job_description,label
0,silver stream healthcare group offer great emp...,registered_nurse
1,create a better future for yourself recruitne...,registered_nurse


In [34]:
# Run preprocess_text over every job description; each row of the new
# 'processed_text' column holds that description's list of cleaned tokens.
df['processed_text'] = df['job_description'].apply(preprocess_text)

# Display the frame to inspect the new column.
df

Unnamed: 0,job_description,label,processed_text
0,silver stream healthcare group offer great emp...,registered_nurse,"[silver, stream, healthcare, group, offer, gre..."
1,create a better future for yourself recruitne...,registered_nurse,"[create, better, future, recruitnet, internati..."
2,"access healthcare, one of irelands leading hea...",registered_nurse,"[access, healthcare, one, irelands, leading, h..."
3,are you a dedicated and compassionate staff nu...,registered_nurse,"[dedicated, compassionate, staff, nurse, looki..."
4,clinical research nurse cardiology cnm2 we a...,registered_nurse,"[clinical, research, nurse, cardiology, seekin..."
...,...,...,...
1161,the successful candidate will have exposure to...,data_analyst,"[successful, candidate, exposure, levels, firm..."
1162,sector: fintech you will be a datadriven indiv...,data_analyst,"[sector, fintech, datadriven, individual, leas..."
1163,our client are recognised as a market leader a...,data_analyst,"[client, recognised, market, leader, looking, ..."
1164,the role our operations analysts are responsib...,data_analyst,"[role, operations, analysts, responsible, mana..."


In [35]:
def document_vector(word2vec_model, doc):
    """Represent a tokenized document as the mean of its word vectors.

    Parameters
    ----------
    word2vec_model
        Object exposing ``key_to_index`` (vocabulary lookup), ``vector_size``
        and list-based indexing that returns one vector per requested word.
    doc
        Iterable of token strings.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of all in-vocabulary tokens, or
        an all-zero vector of length ``vector_size`` when no token of ``doc``
        is in the vocabulary.
    """
    vocabulary = word2vec_model.key_to_index
    known_tokens = [token for token in doc if token in vocabulary]
    if known_tokens:
        # Indexing with a list yields one row per token; average over rows.
        return np.mean(word2vec_model[known_tokens], axis=0)
    # Nothing to average: fall back to a zero vector of the right length.
    return np.zeros(word2vec_model.vector_size)

In [36]:
# Apply feature extraction to the processed text: one averaged word vector per
# document, collected into a 2-D array (rows = documents).
X = np.array([document_vector(model, doc) for doc in df['processed_text']])
X.shape

(1166, 300)

In [37]:
# Extract the raw class labels from the 'label' column as an array
y = df['label'].values

In [38]:
# Initialize the encoder
label_encoder = LabelEncoder()

# Fit and transform labels to encode them
Y = label_encoder.fit_transform(y)

# `Y` now holds the integer-encoded labels suitable for classification

In [39]:
# Stratified 70/15/15 split: hold out 30% of the data first, then split that
# hold-out in half into validation and test sets. Both calls use a fixed
# random_state and stratify on the encoded labels.
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.3, random_state=820, stratify=Y)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=820, stratify=y_temp)

In [40]:
# Fit a logistic-regression classifier on the training vectors.
# `fit` returns the estimator itself, so construction and training are chained.
classifier = LogisticRegression(random_state=630).fit(X_train, y_train)

# Score the classifier on the validation split.
predictions = classifier.predict(X_validation)
print(classification_report(y_validation, predictions))

# Map each original class name to the integer id the encoder assigned to it.
label_mapping = {
    class_name: class_id
    for class_name, class_id in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

print("Label Mapping:")
for label, encoded_num in label_mapping.items():
    print(f"{encoded_num}: {label}")

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        57
           1       1.00      0.95      0.98        22
           2       1.00      0.99      0.99        96

    accuracy                           0.99       175
   macro avg       0.99      0.98      0.98       175
weighted avg       0.99      0.99      0.99       175

Label Mapping:
0: data_analyst
1: electrician
2: registered_nurse


In [41]:
# Persist the trained classifier to disk with joblib.
dump(classifier, 'ft_word2vec_temuulen')

['ft_word2vec_temuulen']

## 4 Evaluating the model.

In [42]:
# Reload the persisted classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts you created or trust.
ttt = load('ft_word2vec_temuulen')

In [43]:
# Evaluate the reloaded classifier on the held-out test split.
predictions = ttt.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        56
           1       1.00      1.00      1.00        22
           2       1.00      1.00      1.00        97

    accuracy                           1.00       175
   macro avg       1.00      1.00      1.00       175
weighted avg       1.00      1.00      1.00       175



In [44]:
# Aliases for the test-set ground truth and the predictions made on it.
labels = y_test
preds = predictions

In [45]:
# Axis labels must follow the encoder's class order, because confusion_matrix
# arranges rows/columns by ascending encoded id (0, 1, 2, ...). Deriving them
# from the encoder keeps the tick labels in sync with the matrix; a hand-typed
# list in a different order silently mislabels every cell.
cm_labels = [class_name.replace('_', ' ') for class_name in label_encoder.classes_]
class_ids = np.arange(len(cm_labels))

# Pin the label set so the matrix keeps one row/column per class even when a
# class happens to be absent from this split.
cm_matrix = confusion_matrix(y_true=labels, y_pred=preds, labels=class_ids)
cm_title = "CONFUSION MATRIX: pre-trained 'word2vec-google-news-300' + logistic regression classifier"

fig = ff.create_annotated_heatmap(z=cm_matrix,
                                  x=cm_labels,
                                  y=cm_labels,
                                  colorscale='balance',
                                  showscale=True,
                                  annotation_text=cm_matrix)

# Heatmaps draw the first row at the bottom by default; reversing the y axis
# puts class 0 at the top so the diagonal runs top-left to bottom-right.
fig.update_layout(width=700,
                  height=700,
                  title=cm_title,
                  title_x=0.5,
                  xaxis=dict(title='Predicted Value', side='bottom'),
                  yaxis=dict(title='True Value', autorange='reversed'))

fig.update_yaxes(tickangle=-90)

fig.show()

# Build the detailed classification report for the test split as a dict
# (output_dict=True) so it can be post-processed rather than only printed.
report = classification_report(labels, preds, output_dict=True)
# The title names the model actually evaluated in this notebook: logistic
# regression over pre-trained word2vec document vectors.
report_title = "CLASSIFICATION REPORT: pre-trained 'word2vec-google-news-300' + logistic regression classifier"