# PART III: CLASSIFICATION (pre-trained word2vec)

In [21]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import plotly.figure_factory as ff
from joblib import dump
from joblib import load
import torch

In [22]:
# Download a compatible Pytorch version.
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Check whether CUDA is accessible.
cuda_available = torch.cuda.is_available()
# Query the device name only when CUDA is present: get_device_name raises on
# CPU-only machines, which would make the fallback branch below unreachable.
cuda_device = torch.cuda.get_device_name(0) if cuda_available else None

if cuda_available:
    print('CUDA was successfully installed and compiled on my device.')
    print('CUDA device name is:', cuda_device)
else:
    print('CUDA is not available')

CUDA was successfully installed and compiled on my device.
CUDA device name is: NVIDIA GeForce GTX 1650


In [2]:
# Load the pretrained Word2Vec model
# The file is read in binary word2vec format (binary=True); the result is bound to
# `pretrained_model` and later used as the source of the injected vectors.
pretrained_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [3]:
# Fetch the NLTK resources used below. 'stopwords' backs stopwords.words('english');
# 'punkt' is presumably the tokenizer data required by word_tokenize (confirm for the NLTK version in use).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\temulenbd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\temulenbd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# English stop words as a set; preprocess_text tests membership against it for every token.
stop_words = set(stopwords.words('english'))

In [5]:
def preprocess_text(text):
    """Turn a raw string into a list of cleaned, lower-cased word tokens.

    The text is tokenised with ``word_tokenize`` and lower-cased; tokens that
    are not purely alphabetic (punctuation, numbers, mixed tokens) and tokens
    found in the module-level ``stop_words`` set are discarded.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    # Single pass: keep alphabetic tokens that are not stop words.
    return [
        token
        for token in lowered_tokens
        if token.isalpha() and token not in stop_words
    ]

In [6]:
# Read the finalized job-ads dataset and tidy it in one chained expression:
# newlines in the description become spaces, rows with any missing value are
# dropped, only the last two columns are kept and the index is renumbered.
df = (
    pd.read_csv('data_jobads_final.csv', index_col=None)
    .assign(job_description=lambda frame: frame['job_description'].str.replace('\n', ' '))
    .dropna()
    .iloc[:, -2:]
    .reset_index(drop=True)
)

df.head(2)

Unnamed: 0,job_description,label
0,silver stream healthcare group offer great emp...,registered_nurse
1,create a better future for yourself recruitne...,registered_nurse


In [7]:
# Run preprocess_text on every job description; each cell of the new
# 'processed_text' column holds the list of cleaned tokens for that row.
df['processed_text'] = df['job_description'].apply(preprocess_text)

df.head(2)

Unnamed: 0,job_description,label,processed_text
0,silver stream healthcare group offer great emp...,registered_nurse,"[silver, stream, healthcare, group, offer, gre..."
1,create a better future for yourself recruitne...,registered_nurse,"[create, better, future, recruitnet, internati..."


In [8]:
# Build the two lookup tables between class names and integer ids.
# Ids follow the order in which each label first appears in the 'label' column.
labels = df['label'].unique().tolist()
num_labels = len(labels)

id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

print(id2label)
print(label2id)

{0: 'registered_nurse', 1: 'electrician', 2: 'data_analyst'}
{'registered_nurse': 0, 'electrician': 1, 'data_analyst': 2}


In [10]:
# Encode the 'label' column.
# Each label is stripped of surrounding whitespace and looked up in label2id.
# NOTE(review): label2id keys come from the unstripped column values, so a label that
# only exists with surrounding whitespace would raise KeyError here.
df['label_encoded'] = df.label.map(lambda x: label2id[x.strip()])

df.tail(2)

Unnamed: 0,job_description,label,processed_text,label_encoded
1164,the role our operations analysts are responsib...,data_analyst,"[role, operations, analysts, responsible, mana...",2
1165,insurance analyst permanent dublin negotiable ...,data_analyst,"[insurance, analyst, permanent, dublin, negoti...",2


In [12]:
# Hold out 30% of the rows as a test set, stratified on the label column so both parts
# keep the same class proportions; the fixed random_state makes the split repeatable.
train, test = train_test_split(df, test_size=0.3, random_state=630, stratify=df['label'])
print('TOTAL shape:', df.shape)
print('TRAINING shape:', train.shape)
print('TEST shape:', test.shape)

TOTAL shape: (1166, 4)
TRAINING shape: (816, 4)
TEST shape: (350, 4)


In [14]:
# Token lists of the training rows only; these are the sentences Word2Vec is built and trained on.
train_sentences  = train['processed_text'].tolist()

Creating new vectors

In [18]:
# Create an untrained Word2Vec model and build its vocabulary from the training sentences.
# min_count=1 keeps every token; vector_size=300 presumably matches the dimensionality
# of the pre-trained vectors that are injected later (confirm against the loaded file).
ft_model = Word2Vec(vector_size=300, min_count=1, epochs=10)
ft_model.build_vocab(train_sentences)

Before training the new model on my dataset, I merge the vocabularies and inject the pre-trained vectors. This step requires adjusting the initial weights for the overlapping vocabulary.

In [23]:
# Extend the vocabulary with every key of the pre-trained model so that their
# vectors can be injected below.
ft_model.build_vocab([list(pretrained_model.key_to_index.keys())], update=True)

# A lock factor of 1.0 leaves every vector fully trainable. gensim keeps its lock
# factors as float32, so the array is created with that dtype instead of NumPy's
# float64 default.
ft_model.wv.vectors_lockf = np.ones(len(ft_model.wv), dtype=np.float32)

# Copy the pre-trained vectors straight into the weight matrix, chunk by chunk.
# Assigning one key at a time through `ft_model.wv[word] = ...` is far too slow
# for a vocabulary of this size, and chunking keeps the temporary copies small.
ft_key_to_index = ft_model.wv.key_to_index
pretrained_key_to_index = pretrained_model.key_to_index
shared_words = [word for word in pretrained_key_to_index if word in ft_key_to_index]

chunk_size = 100_000
for start in range(0, len(shared_words), chunk_size):
    chunk = shared_words[start:start + chunk_size]
    target_rows = [ft_key_to_index[word] for word in chunk]
    source_rows = [pretrained_key_to_index[word] for word in chunk]
    ft_model.wv.vectors[target_rows] = pretrained_model.vectors[source_rows]

KeyboardInterrupt: 

Now, I can train (fine-tune) the model on my dataset. This training will update the vectors for the new words and potentially adjust the vectors for the old words based on the new context provided by my data.

In [24]:
# Fine-tune on the training sentences for the number of epochs configured on the model.
# The call's return value is shown as the cell output.
ft_model.train(train_sentences, total_examples=len(train_sentences), epochs=ft_model.epochs)

(1708640, 1708640)

In [None]:
# Persist the fine-tuned Word2Vec model; it is reloaded from this path in the next section.
ft_model.save('new_word2vec_temuulen')

## 4. Evaluating the model

In [None]:
# Reload the fine-tuned Word2Vec model saved above; `model` is the object the
# feature-extraction cells below operate on.
model = Word2Vec.load('new_word2vec_temuulen')

In [None]:
def feature_vec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one document.

    Args:
        words: iterable of tokens for a single document.
        model: object exposing ``model.wv.key_to_index`` (membership) and
            ``model.wv[word]`` (vector lookup), e.g. a gensim ``Word2Vec``.
        num_features: dimensionality of the word vectors.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the vectors
        of all tokens known to the model. If no token is known (or ``words``
        is empty) the zero vector is returned.
    """
    # Membership is tested against the existing key->index dict; building a new
    # set of the whole vocabulary on every call would cost O(vocabulary) per document.
    vocabulary = model.wv.key_to_index

    total = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            total = np.add(total, model.wv[word])

    # Guard against division by zero for documents without any known token.
    if nwords > 0:
        total = np.divide(total, nwords)
    return total

def get_avg_feature_vecs(texts, model, num_features):
    """Build the document-by-feature matrix of averaged word vectors.

    Row ``i`` of the returned float32 array is ``feature_vec`` applied to the
    ``i``-th entry of ``texts``; the array has shape
    ``(len(texts), num_features)``.
    """
    matrix = np.zeros((len(texts), num_features), dtype="float32")

    for row, tokens in enumerate(texts):
        matrix[row] = feature_vec(tokens, model, num_features)

    return matrix


In [None]:
# Build the feature matrix from the token lists in `df` using the reloaded Word2Vec `model`.
X = get_avg_feature_vecs(df['processed_text'], model, 300)  # 300 is the vector size of the Word2Vec model
# Targets are the integer ids produced with label2id.
y = df['label_encoded'].values

# Split the data into training and testing sets (50/50, fixed seed, no stratification requested)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=630)


In [None]:
# Fit a logistic-regression classifier on the averaged document vectors.
# `LogisticRegression` is already imported in the notebook's import cell, so it
# is not imported a second time here. max_iter is raised to 1000 iterations.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

In [None]:
# accuracy_score is not part of the notebook's import cell, so it is imported here.
from sklearn.metrics import accuracy_score, classification_report

# Score the classifier on the held-out half of the data.
predictions = classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))


In [None]:
def document_vector(word2vec_model, doc):
    """Return the mean word vector of the in-vocabulary tokens of ``doc``.

    Args:
        word2vec_model: either a full model exposing its vectors as ``.wv``
            (e.g. a gensim ``Word2Vec``) or the keyed vectors themselves.
        doc: iterable of tokens.

    Returns:
        A 1-D array of length ``vector_size``. If no token of ``doc`` is in
        the vocabulary, a zero vector is returned.
    """
    # A full model keeps its vocabulary and vectors under `.wv`; the vocabulary
    # lookup and list indexing below are keyed-vector operations, so unwrap it.
    keyed_vectors = getattr(word2vec_model, 'wv', word2vec_model)

    # Filter words in doc that are in the model's vocabulary
    known_words = [word for word in doc if word in keyed_vectors.key_to_index]
    if not known_words:
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(keyed_vectors[known_words], axis=0)

In [None]:
# Apply feature extraction to the processed text
# One averaged vector per row of df; X.shape is displayed as a sanity check.
# Note: this rebinds X, replacing the feature matrix built earlier in the notebook.
X = np.array([document_vector(model, doc) for doc in df['processed_text']])
X.shape

In [None]:
# Extract labels
# These are the raw string labels (not 'label_encoded'); they are encoded with LabelEncoder below.
y = df['label'].values

In [None]:
# Initialize the encoder
label_encoder = LabelEncoder()

# Fit and transform labels to encode them
Y = label_encoder.fit_transform(y)

# Now `Y` contains encoded labels suitable for classification.
# NOTE(review): this encoding is independent of the label2id mapping built earlier;
# the two id assignments are not guaranteed to agree.

In [None]:
# Two-stage stratified split: 70% train, then the remaining 30% is halved into
# validation and test sets (15% each of the full data).
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.3, random_state=820, stratify=Y)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=820, stratify=y_temp)

In [None]:
# Train a logistic regression classifier
# (default iteration limit here, unlike the max_iter=1000 used in the earlier cell)
classifier = LogisticRegression(random_state=630)
classifier.fit(X_train, y_train)

# Evaluate the classifier
# The validation split is used here; the test split is kept for the final evaluation below.
predictions = classifier.predict(X_validation)
print(classification_report(y_validation, predictions))

# Map each class name to the integer id assigned by the fitted LabelEncoder.
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print("Label Mapping:")
for label, encoded_num in label_mapping.items():
    print(f"{encoded_num}: {label}")

In [None]:
# Persist the fitted classifier with joblib. Despite the file name, this file holds the
# LogisticRegression classifier, not the Word2Vec model.
dump(classifier, 'ft_word2vec_temuulen')

## 4 Evaluating the model.

In [None]:
# NOTE(review): this binds the joblib `load` function itself to `ft_model` (there is no call),
# replacing the Word2Vec model previously held under that name. The statement looks
# unfinished; confirm what was meant to be loaded, or remove the cell.
ft_model = load

In [None]:
# Reload the classifier that was dumped to this path above; `ttt` is the fitted LogisticRegression.
ttt = load('ft_word2vec_temuulen')

In [None]:
# Final evaluation of the reloaded classifier on the held-out test split.
predictions = ttt.predict(X_test)
print(classification_report(y_test, predictions))


In [None]:
# Aliases used by the plotting cell below.
# Note: `labels` previously held the list of class names; it is rebound to the encoded test targets here.
labels = y_test
preds = predictions

In [None]:
# The test targets were encoded with LabelEncoder, which numbers classes in sorted
# order. The axis labels are therefore derived from `label_encoder.classes_` so they
# line up with the rows/columns of the confusion matrix (a hand-written list in a
# different order would mislabel the classes).
cm_labels = [class_name.replace('_', ' ') for class_name in label_encoder.classes_]
cm_matrix = confusion_matrix(labels, preds)
cm_title = "CONFUSION MATRIX: logistic regression on fine-tuned word2vec document vectors"

# Annotated heatmap: each cell shows the raw count from the confusion matrix.
fig = ff.create_annotated_heatmap(z=cm_matrix,
                                  x=cm_labels,
                                  y=cm_labels,
                                  colorscale='balance',
                                  showscale=True,
                                  annotation_text=cm_matrix)

fig.update_layout(width=700,
                  height=700,
                  title=cm_title,
                  title_x=0.5,
                  xaxis=dict(title='Predicted Value', side='bottom'),
                  yaxis_title='True Value')

# Rotate the y tick labels so long class names read along the axis.
fig.update_yaxes(tickangle=-90)

fig.show()

# Print detailed classification report.
report = classification_report(labels, preds, output_dict=True)
report_title = "CLASSIFICATION REPORT: logistic regression on fine-tuned word2vec document vectors"

In [None]:
# Read the text data from a .txt file
# Both files are parsed as tab-separated with no header row; the two columns are
# named 'labels' and 'text'.
benchmark_train = pd.read_csv('ag_news_train.txt', delimiter='\t', header=None, names=['labels', 'text'])

benchmark_test = pd.read_csv('ag_news_test.txt', delimiter='\t', header=None, names=['labels', 'text'])

In [None]:
# Apply the same preprocessing used for the job ads to both benchmark splits.
# NOTE(review): preprocess_text expects strings; missing values in 'text' are not dropped here.
benchmark_train['processed_text'] = benchmark_train['text'].apply(preprocess_text)

benchmark_test['processed_text'] = benchmark_test['text'].apply(preprocess_text)
