In [2]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import recall_score, precision_score, f1_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModel
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [3]:
# -----------------------------
# Data Targets
# -----------------------------
# Location of the cleaned CSV. Set the SCAM_JOB_DATA_PATH environment variable
# to read the file from somewhere else; the original local path stays as the
# fallback so existing setups keep working unchanged.
data_path = os.environ.get(
    "SCAM_JOB_DATA_PATH",
    '/Users/jonathankipping/code/syeda-tabassum-rahaman/scam-job-detector/raw_data/data_cleaned.csv',
)
df = pd.read_csv(data_path)
# Features are every column except "fraudulent"; "fraudulent" is the target.
X = df.drop(columns=["fraudulent"])
y = df["fraudulent"]
# -----------------------------
# Combine text columns
# -----------------------------
text_columns = [
    "title",
    "company_profile",
    "description",
    "requirements",
    "benefits"
]
# Missing fields become empty strings, then every cell is cast to str so a
# column that pandas parsed as numeric cannot make " ".join raise a TypeError.
# Each row ends up as one space-separated string.
combined_text = (
    X[text_columns]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
    .to_list()
)
# -----------------------------
# Train-test split
# -----------------------------
# 20% of the rows are held out for testing. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    combined_text, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
# -----------------------------
# Tokenizer
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-small")


def _tokenize_texts(texts, max_length=256):
    """Tokenize a list of strings with the module-level ``tokenizer``.

    The train and test sets must be encoded with identical settings, so both
    go through this single helper instead of two copy-pasted calls.

    Args:
        texts: list of strings to encode.
        max_length: length passed to the tokenizer; used together with
            ``padding="max_length"`` and ``truncation=True``.

    Returns:
        The tokenizer output requested with ``return_tensors="np"``. The
        "input_ids" and "attention_mask" entries are what the rest of the
        notebook reads from it.
    """
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="np",
    )


train_tokens = _tokenize_texts(X_train)
test_tokens = _tokenize_texts(X_test)
# -----------------------------
# Load BERT Small model
# -----------------------------
# Seed Python, NumPy and TensorFlow before the model is built. The sequence
# classification head is not part of the pretrained checkpoint and is
# initialised randomly, so without a seed every run starts from different
# weights and the reported metrics are not reproducible.
tf.keras.utils.set_random_seed(42)
model = TFAutoModelForSequenceClassification.from_pretrained(
    "prajjwal1/bert-small",
    num_labels=2,      # two output logits, one per class
    from_pt=True       # the checkpoint is loaded from PyTorch weights
)
# -----------------------------
# Compile Model
# -----------------------------
# The model outputs raw logits, hence from_logits=True on the loss. Labels are
# passed as integer class ids, which is what the sparse loss expects.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)
# -----------------------------
# Fit Model
# -----------------------------
# Only the token ids and the attention mask are handed to the model.
train_inputs = {
    key: train_tokens[key] for key in ("input_ids", "attention_mask")
}
# Training settings: 10% validation split, 3 epochs, batches of 16.
fit_options = dict(validation_split=0.1, epochs=3, batch_size=16)
history = model.fit(train_inputs, y_train.values, **fit_options)
# -----------------------------
# Evaluate
# -----------------------------
# evaluate() and predict() consume the same two arrays, so build the input
# dict once and reuse it.
test_inputs = {
    "input_ids": test_tokens["input_ids"],
    "attention_mask": test_tokens["attention_mask"],
}
test_loss, test_acc = model.evaluate(test_inputs, y_test.values)
print("Test Accuracy:", test_acc)

# Predict on test set
pred_logits = model.predict(test_inputs).logits

# Convert logits -> class labels (index of the larger logit per row)
y_pred = np.argmax(pred_logits, axis=1)

# Print metrics
# zero_division=0 keeps the score at 0 (the value scikit-learn already reports)
# but without an UndefinedMetricWarning when the model never predicts the
# positive class.
print(f"""
Model Performance
Recall:             {recall_score(y_test, y_pred, zero_division=0):.4f}
Precision:          {precision_score(y_test, y_pred, zero_division=0):.4f}
Balanced Accuracy:  {balanced_accuracy_score(y_test, y_pred):.4f}
F1 Score:           {f1_score(y_test, y_pred, zero_division=0):.4f}
""")


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should p

Epoch 1/3
123/805 [===>..........................] - ETA: 22:35 - loss: 0.2224 - accuracy: 0.9400

KeyboardInterrupt: 