In [62]:
import pandas as pd
import emoji, string
import nltk
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk.corpus import stopwords
nltk.download('stopwords')

df = pd.read_csv('DataSet.csv')
# Set pandas option to display all columns
pd.set_option('display.max_columns', None)

df.shape

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mervseah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(17880, 18)

## Exploratory Data Analysis

## Data Cleaning

In [63]:
# df.drop_duplicates(subset=['title','location', 'department' , 'salary_range', 'company_profile', 'description', 'requirements', 'benefits','telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function', 'fraudulent'], keep='first', inplace=True)
df.shape

(17880, 18)

## Train Validation Test Split (Text and Categorical)
Train-Test-Validation: 70-15-15

In [64]:
X = df.drop('fraudulent', axis=1, inplace=False)
y = df['fraudulent']  # Target variable to predict

## splitting into training (70%) and remaining (30%)
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size = 0.3, random_state=42)

## splitting remaining into test (15%) and validation (15%)
X_test, X_val, y_test, y_val = train_test_split(X_rem, y_rem, test_size = 0.15 / 0.85, random_state=42)

for dataset in [X_train, X_val, X_test]:
    display(dataset.shape)

(12516, 17)

(947, 17)

(4417, 17)

## Train Validation Test Split (No Combined Cleaned Text)


In [65]:
columns_to_drop = ['fraudulent', 'title', 'benefits', 'description', 'requirements', 'company_profile']
X_cat = df.drop(columns_to_drop, axis = 1)

## splitting into training (70%) and remaining (30%)
X_train_cat, X_rem_cat, y_train_cat, y_rem_cat = train_test_split(X_cat, y, test_size = 0.3, random_state=42)

## splitting remaining into test (15%) and validation (15%)
X_test_cat, X_val_cat, y_test_cat, y_val_cat = train_test_split(X_rem_cat, y_rem_cat, test_size = 0.15 / 0.85, random_state=42)

for dataset in [X_train_cat, X_val_cat, X_test_cat]:
    display(dataset.shape)

(12516, 12)

(947, 12)

(4417, 12)

## Feature Engineering

In [66]:
## TEXT PREPROCESSING FOR TEXTUAL COLUMNS

def clean_text_features(text):
    try:
        if pd.isnull(text):
            return ""
        
        # Remove HTML tags
        text = re.sub(r'<[^>]*>', '', text)
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove non-alphanumeric characters (except specified punctuation)
        text = re.sub(r'[^\w\s.,:;!?\'"-]', '', text)
        
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Remove emojis
        text = emoji.replace_emoji(text, replace="")
        
        # Remove stopwords
        stop_words = set(stopwords.words("english"))
        words = text.split()
        filtered_words = [word for word in words if word not in stop_words]
        text = " ".join(filtered_words)
        
        return text

    except Exception as e:
        print(f"Error cleaning text: {e}")
        return ""


text_columns = ['title', 'benefits', 'description', 'requirements', 'company_profile']

for col in text_columns:
    X_train[col] = X_train[col].apply(clean_text_features)
    X_val[col] = X_val[col].apply(clean_text_features)
    X_test[col] = X_test[col].apply(clean_text_features)

X_train['combined_text_data'] = (
    X_train['title'] + ' ' +
    X_train['description'] + ' ' +
    X_train['requirements'] + ' ' +
    X_train['company_profile'] + ' ' +
    X_train['benefits']
)

X_val['combined_text_data'] = (
    X_val['title'] + ' ' +
    X_val['description'] + ' ' +
    X_val['requirements'] + ' ' +
    X_val['company_profile'] + ' ' +
    X_val['benefits']
)

X_test['combined_text_data'] = (
    X_test['title'] + ' ' +
    X_test['description'] + ' ' +
    X_test['requirements'] + ' ' +
    X_test['company_profile'] + ' ' +
    X_test['benefits']
)

In [77]:
## encoding categorical columns

categorical_columns = ['location', 'department', 'employment_type', 'required_experience', 
                       'required_education', 'industry', 'function']

encoders = {} ## storing this here first if we wanna use them later

for column in categorical_columns:
    if column in X_train.columns:

        le = LabelEncoder()
        encoders[column] = le

        for dataset in [X_train, X_train_cat]:

            dataset[column] = dataset[column].fillna('Unknown') 
            dataset[f"{column}_encoded"] = le.fit_transform(dataset[column])

        def transform_with_unknown(encoder, series):
            return series.apply(lambda x: encoder.transform([x])[0] if x in encoder.classes_ else -1)

        for dataset in [X_val, X_test, X_val_cat, X_test_cat]:

            dataset[column] = dataset[column].fillna('Unknown') 
            dataset[f"{column}_encoded"] = transform_with_unknown(le, dataset[column]) 
            

In [78]:
## encodign binary columns

'''
important: idk if we should keep in_balanced_dataset
'''
binary_columns = ['telecommuting', 'has_company_logo', 'has_questions', 'fraudulent', 'in_balanced_dataset']

for column in binary_columns:
    if column in X_train.columns:
        for dataset in [X_train, X_val, X_test, X_train_cat, X_val_cat, X_test_cat]:
            dataset[column] = dataset[column].apply(lambda x: 1 if x == 't' else (0 if x == 'f' else None))

In [79]:
## numerical columns

columns_to_normalize = ['salary_lower', 'salary_upper', 'salary_average', 'salary_range_diff']

for dataset in [X_train, X_val, X_test, X_train_cat, X_val_cat, X_test_cat]:
    dataset[['salary_lower', 'salary_upper']] = dataset['salary_range'].str.split('-', expand=True).astype(float)
    dataset['salary_average'] = (dataset['salary_lower'] + dataset['salary_upper']) / 2
    dataset['salary_range_diff'] = dataset['salary_upper'] - dataset['salary_lower']
    
    scaler = MinMaxScaler()
    dataset[columns_to_normalize] = scaler.fit_transform(dataset[columns_to_normalize])

In [70]:
# DROPPING RAW TEXTUAL COLUMNS THAT ARE UNUSED

columns_to_drop = ['title', 'benefits', 'description', 'requirements', 'company_profile']
for dataset in [X_train, X_val, X_test]:
    dataset.drop(columns_to_drop, axis = 1, inplace = True)


# df.drop(columns_to_drop, axis = 1, inplace = True)

# Embeddings for textual data

In [80]:
#Count Vectorizer Embedding 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X_train['combined_text_data'])
X_train_count =  count_vect.transform(X_train['combined_text_data'])
X_val_count = count_vect.transform(X_val['combined_text_data'])
X_test_count = count_vect.transform(X_test['combined_text_data'])

#TFIDF Embedding 
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['combined_text_data'])
X_train_tfidf = tfidf_vect.transform(X_train['combined_text_data'])
X_val_tfidf = tfidf_vect.transform(X_val['combined_text_data'])
X_test_tfidf = tfidf_vect.transform(X_test['combined_text_data'])

In [73]:
display(X_train_count.shape)
display(X_val_count.shape)
display(X_test_count.shape)

(12516, 60665)

(947, 60665)

(4417, 60665)

In [74]:
display(X_train_tfidf.shape)
display(X_val_tfidf.shape)
display(X_test_tfidf.shape)

(12516, 60598)

(947, 60598)

(4417, 60598)

## Machine Learning Models


### Log Reg

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

model = LogisticRegression(max_iter= 3000)

#### Categorical Only

In [83]:
# Fit the model to the training data
model.fit(X_train_cat, y_train)

# Evaluate on validation set (to minimize loss)
val_pred_proba = model.predict_proba(X_val_cat)[:, 1]
val_auc = roc_auc_score(y_val, val_pred_proba)
print(f"Validation AUC-ROC: {val_auc:.4f}")

# Final evaluation on test set
test_pred = model.predict(X_test_cat)
test_pred_proba = model.predict_proba(X_test_cat)[:, 1]

accuracy = accuracy_score(y_test, test_pred)
precision = precision_score(y_test, test_pred)
recall = recall_score(y_test, test_pred)
f1 = f1_score(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred_proba)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test AUC-ROC: {auc:.4f}")

ValueError: could not convert string to float: 'US, IN, Indianapolis'

#### Count Vect

In [60]:
# Fit the model to the training data
model.fit(X_train_count, y_train)

# Evaluate on validation set (to minimize loss)
val_pred_proba = model.predict_proba(X_val_count)[:, 1]
val_auc = roc_auc_score(y_val, val_pred_proba)
print(f"Validation AUC-ROC: {val_auc:.4f}")

# Final evaluation on test set
test_pred = model.predict(X_test_count)
test_pred_proba = model.predict_proba(X_test_count)[:, 1]

accuracy = accuracy_score(y_test, test_pred)
precision = precision_score(y_test, test_pred)
recall = recall_score(y_test, test_pred)
f1 = f1_score(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred_proba)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test AUC-ROC: {auc:.4f}")

NameError: name 'model' is not defined

#### TF-IDF

In [None]:
# Fit the model to the training data
model.fit(X_train_tfidf, y_train)

# Evaluate on validation set (to minimize loss)
val_pred_proba = model.predict_proba(X_val_tfidf)[:, 1]
val_auc = roc_auc_score(y_val, val_pred_proba)
print(f"Validation AUC-ROC: {val_auc:.4f}")

# Final evaluation on test set
test_pred = model.predict(X_test_tfidf)
test_pred_proba = model.predict_proba(X_test_tfidf)[:, 1]

accuracy = accuracy_score(y_test, test_pred)
precision = precision_score(y_test, test_pred)
recall = recall_score(y_test, test_pred)
f1 = f1_score(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred_proba)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test AUC-ROC: {auc:.4f}")

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, test_pred_proba)

# Calculate the AUC score
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2, label='Random Guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)', fontsize=12)
plt.ylabel('True Positive Rate (TPR)', fontsize=12)
plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=14)
plt.legend(loc='lower right', fontsize=10)
plt.grid(alpha=0.3)
plt.show()