In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [2]:
# Load the resume / job-description pairs. The preview below shows an
# `Unnamed: 0` column alongside `resume_text`, `job_description_text`
# and `label`; it is left in place here.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(n=5)

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit


In [3]:
# Siamese-style text classifier: a resume and a job description are embedded
# and encoded by the SAME Embedding + LSTM, the two encodings are concatenated,
# and a small dense head predicts the fit label.

# --- Configuration -----------------------------------------------------------
SEED = 42            # used for the train/test split and for the Keras RNGs
VOCAB_SIZE = 10000   # Tokenizer keeps word indices 1..VOCAB_SIZE-1; 0 is the padding value
max_len = 100        # every document is padded/truncated to this many tokens

# Seed Python, NumPy and the Keras backend so weight initialisation, dropout and
# batch shuffling start from the same state on every run (some GPU kernels may
# still be non-deterministic).
set_random_seed(SEED)

# --- Labels and split --------------------------------------------------------
# NOTE: this overwrites the string labels in `df` with integer codes. Re-running
# this cell without reloading `df` re-fits the encoder on those integer codes.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
num_classes = len(label_encoder.classes_)  # drives the width of the softmax layer

X_train, X_test, y_train, y_test = train_test_split(
    df[['resume_text', 'job_description_text']],
    df['label'],
    test_size=0.2,
    random_state=SEED,
)

# --- Tokenisation ------------------------------------------------------------
# The vocabulary is fitted on training text only (both columns), so no
# test-set words influence the word index.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(pd.concat([X_train['resume_text'], X_train['job_description_text']]))


def _encode_texts(texts):
    """Convert raw strings to a (len(texts), max_len) integer array.

    Uses the fitted module-level `tokenizer` and pads/truncates each
    sequence to `max_len` with `pad_sequences` defaults.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)


X_train_resume = _encode_texts(X_train['resume_text'])
X_train_job_desc = _encode_texts(X_train['job_description_text'])
X_test_resume = _encode_texts(X_test['resume_text'])
X_test_job_desc = _encode_texts(X_test['job_description_text'])

# --- Model -------------------------------------------------------------------
input_resume = Input(shape=(max_len,))
input_job_desc = Input(shape=(max_len,))

# One Embedding instance applied to both inputs, so the weights are shared.
# `input_length` is omitted: the sequence length is already fixed by the
# Input layers and the argument is deprecated in recent Keras releases.
embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=128)

encoded_resume = embedding(input_resume)
encoded_job_desc = embedding(input_job_desc)

# Likewise a single LSTM encodes both documents.
shared_lstm = LSTM(64)

lstm_resume = shared_lstm(encoded_resume)
lstm_job_desc = shared_lstm(encoded_job_desc)

merged = Concatenate()([lstm_resume, lstm_job_desc])
dense = Dense(64, activation='relu')(merged)
dropout = Dropout(0.5)(dense)
output = Dense(num_classes, activation='softmax')(dropout)

model = Model(inputs=[input_resume, input_job_desc], outputs=output)

# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# --- Train and evaluate ------------------------------------------------------
model.fit(
    [X_train_resume, X_train_job_desc],
    y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.2,
)

loss, accuracy = model.evaluate([X_test_resume, X_test_job_desc], y_test)
print(f'Test Accuracy: {accuracy}')


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Accuracy: 0.7718174457550049
