In [38]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [39]:
# Load dataset
df = pd.read_csv(
    "fake_job_postings.csv",
    engine="python",
    on_bad_lines="skip"
)

In [40]:
# Select & merge important text columns
TEXT_COLS = [
    'title',
    'company_profile',
    'description',
    'requirements',
    'benefits'
]

df[TEXT_COLS] = df[TEXT_COLS].fillna("")

X = (
    df['title'] + " " +
    df['company_profile'] + " " +
    df['description'] + " " +
    df['requirements'] + " " +
    df['benefits']
).astype(str)

y = df['fraudulent'].astype(int).values

In [41]:
# Text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-z ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text

X = X.apply(clean_text).tolist()

In [42]:
# Tokenize text
MAX_WORDS = 20000
MAX_LEN = 300

tokenizer = Tokenizer(
    num_words=MAX_WORDS,
    oov_token="<OOV>"
)
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN)

In [43]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_pad, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [44]:
# Handle class imbalance
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))

In [45]:
# LSTM model
model = Sequential([
    Embedding(MAX_WORDS, 128, input_length=MAX_LEN),

    LSTM(128, return_sequences=True),
    Dropout(0.3),

    LSTM(64),
    Dropout(0.3),

    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)



In [46]:
# Early stopping
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

In [47]:
# Train
model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
    class_weight=class_weights
)

Epoch 1/20
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 26ms/step - accuracy: 0.7695 - loss: 0.6083 - val_accuracy: 0.0790 - val_loss: 0.7506
Epoch 2/20
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.7825 - loss: 0.5085 - val_accuracy: 0.8812 - val_loss: 0.3257
Epoch 3/20
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 31ms/step - accuracy: 0.9525 - loss: 0.1720 - val_accuracy: 0.9252 - val_loss: 0.2048
Epoch 4/20
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.9691 - loss: 0.0692 - val_accuracy: 0.9706 - val_loss: 0.1161
Epoch 5/20
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.9894 - loss: 0.0325 - val_accuracy: 0.9602 - val_loss: 0.1411
Epoch 6/20
[1m403/403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - accuracy: 0.9956 - loss: 0.0156 - val_accuracy: 0.9672 - val_loss: 0.1477
Epoch 7/20
[1m40

<keras.src.callbacks.history.History at 0x7ef9d1051d90>

In [48]:
# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", acc)

[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9684 - loss: 0.1052
Test Accuracy: 0.9711968898773193


In [49]:
# Test prediction
test_text = [
    "Work from home job. Earn money fast with no experience required."
]

test_text = [clean_text(test_text[0])]

test_seq = pad_sequences(
    tokenizer.texts_to_sequences(test_text),
    maxlen=MAX_LEN
)

pred = model.predict(test_seq)

print("Prediction score:", pred[0][0])
print("Fake Job" if pred[0][0] > 0.5 else "Real Job")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step
Prediction score: 0.99588144
Fake Job


In [52]:
model.save("lstm_model.keras")

In [53]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)