In [None]:
# LOAD THE DATASET

import pandas as pd

# Load the cleaned job scam dataset
df = pd.read_csv("Job_scam_cleaned_dataset.csv")

# Combine the relevant text columns into a single lower-cased field.
# Missing values are replaced with empty strings first: a bare astype(str)
# turns NaN into the literal word "nan", which any text model built on
# df['text'] would then learn as if it were a real token.
text_columns = ['title', 'department', 'industry', 'function']
df['text'] = (
    df[text_columns]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
    .str.lower()
)

# Define target variable
y = df['is_scam']


In [None]:
# NAIVE BAYES IMPLEMENTATION

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Train-test split on the RAW text, before any feature extraction.
# Splitting first guarantees the vectorizer never sees the held-out rows;
# fitting TF-IDF on the full corpus would leak test-set vocabulary and
# document frequencies into training and inflate the reported metrics.
# Same test_size / random_state as before, so the same rows land in each fold.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Vectorize text data: learn vocabulary + IDF weights from the training fold
# only, then apply that fitted transform unchanged to the test fold.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus feature matrix, kept so any later cell that refers to `X`
# still works. It is built with the train-only vocabulary/IDF.
X = tfidf.transform(df['text'])

# Train model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = nb_model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97      3335
           1       0.87      0.14      0.24       241

    accuracy                           0.94      3576
   macro avg       0.90      0.57      0.60      3576
weighted avg       0.94      0.94      0.92      3576



In [None]:
# LSTM IMPLEMENTATION

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Hyper-parameters shared by the tokenizer, the padding step and the model.
# Keeping them in one place stops the vocabulary size / sequence length from
# drifting apart between preprocessing and the Embedding layer.
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100

# Train-test split on the RAW text, before tokenization, so the tokenizer's
# vocabulary is learned from training rows only (no test-set leakage).
# Same test_size / random_state as before, so the same rows land in each fold.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Tokenization: fit on the training fold only; words that appear solely in
# the test fold are mapped to the "<OOV>" token at transform time.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(text_train)

# Full-corpus encodings, kept so any later cell that refers to
# `sequences` / `padded` still works (encoded with the train-only vocabulary).
sequences = tokenizer.texts_to_sequences(df['text'])
padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Encode each fold with the fitted tokenizer
X_train = pad_sequences(tokenizer.texts_to_sequences(text_train), maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_sequences(tokenizer.texts_to_sequences(text_test), maxlen=MAX_SEQUENCE_LENGTH)

# LSTM model
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3, and the sequence length is inferred from the data on the first fit.
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=64),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=3, batch_size=64, validation_split=0.1)

# Predictions and evaluation
# The sigmoid output is a probability; threshold at 0.5 to get class labels.
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


Epoch 1/3




[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 83ms/step - accuracy: 0.9185 - loss: 0.3138 - val_accuracy: 0.9350 - val_loss: 0.2131
Epoch 2/3
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 83ms/step - accuracy: 0.9448 - loss: 0.1930 - val_accuracy: 0.9469 - val_loss: 0.1839
Epoch 3/3
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 87ms/step - accuracy: 0.9504 - loss: 0.1638 - val_accuracy: 0.9462 - val_loss: 0.1876
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      3335
           1       0.80      0.30      0.44       241

    accuracy                           0.95      3576
   macro avg       0.88      0.65      0.70      3576
weighted avg       0.94      0.95      0.94      3576

