In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional
# from tensorflow.keras.layers import Dropout
# from tensorflow.keras.preprocessing.text import one_hot
# from tensorflow.keras.layers import Input, Concatenate
# from tensorflow.keras.models import Model
# from sklearn.preprocessing import LabelEncoder

In [6]:
data = pd.read_csv('preprocessed_data_with_numerical.csv')

In [7]:
data

Unnamed: 0,title,location,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset,missing company profile,missing company information
0,marketing intern,usa ny new york,marketing,food52 created groundbreaking award winning co...,food52 fast growing james beard award winning ...,experience content management system major plu...,,0,1,0,2,4,9,,marketing,0,0,1,3
1,customer service cloud video production,nz auckland,success,90 second world cloud video production service...,organised focused vibrant awesome passion cust...,expect key responsibility communicate client 9...,get u part 90 second team gain experience work...,0,1,0,1,6,9,marketing advertising,customer service,0,0,1,3
2,commissioning machinery assistant cma,usa ia wever,,valor service provides workforce solution meet...,client located houston actively seeking experi...,implement pre commissioning commissioning proc...,,0,1,0,2,6,9,,,0,0,1,3
3,account executive washington dc,usa dc washington,sale,passion improving quality life geography heart...,company esri environmental system research ins...,education bachelor master gi business administ...,culture anything corporate collaborative creat...,0,1,0,1,5,1,computer software,sale,0,0,1,3
4,bill review manager,usa fl fort worth,,spotsource solution llc global human capital m...,job title itemization review manager location ...,qualification rn license state texas diploma b...,full benefit offered,0,1,1,1,5,1,hospital health care,health care provider,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,account director distribution,ca toronto,sale,vend looking awesome new talent come join u wo...,case first time visited website vend award win...,ace role eat comprehensive statement work brea...,expect u open culture openly share result inpu...,0,1,1,1,5,9,computer software,sale,0,0,1,3
17876,payroll accountant,usa pa philadelphia,accounting,weblinc e commerce platform service provider f...,payroll accountant focus primarily payroll fun...,b b accounting desire fun love genuine passion...,health wellness medical plan prescription drug...,0,1,1,1,5,1,internet,accounting auditing,0,0,1,3
17877,project cost control staff engineer cost contr...,usa tx houston,,provide full time permanent position many medi...,experienced project cost control staff enginee...,least 12 year professional experience ability ...,,0,0,0,1,6,9,,,0,0,1,2
17878,graphic designer,ng la lagos,,,nemsia studio looking experienced visual graph...,1 must fluent latest version corel adobe cc es...,competitive salary compensation based experien...,0,0,1,0,6,6,graphic design,design,0,0,0,0


In [8]:
# replace NaN with empty strings
data.fillna('', inplace=True)

# combine all text
data['full_text'] = data['title'] + " " + data['location']  + " " + data['department']  + " " + data['company_profile']  + " " + data['description']  + " " + data['requirements']  + " "  + data['benefits'] + data['industry']  + " " + data['function']

In [9]:
X_train_full, X_test_full,y_train , y_test = train_test_split(data.drop('fraudulent', axis=1), data["fraudulent"], test_size=0.3, random_state=0)
X_train = X_train_full[['full_text', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'missing company information']]
X_test = X_test_full[['full_text', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'missing company information']]

<h1>Text Only Models</h1>

TFIDF Vectorizer

In [10]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['full_text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['full_text'])

In [11]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1

logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

Logistic Regression F1 Score: 0.625
SVM F1 Score: 0.8153846153846154
Random Forest F1 Score: 0.7905759162303664
XGBoost F1 Score: 0.8226600985221675


Word2Vec

In [20]:
# Tokenize the text
tokenized_text = X_train['full_text'].apply(lambda x: x.split())

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Convert words to vectors
def get_vector(word_list, model):
    valid_words = [word for word in word_list if word in model.wv]
    if not valid_words:
        # If no valid words, return a vector of zeros or handle as needed
        return np.zeros(model.vector_size)
    return np.mean([model.wv[word] for word in valid_words], axis=0)

X_train_word2vec = tokenized_text.apply(lambda x: get_vector(x, word2vec_model))
X_test_word2vec = X_test['full_text'].apply(lambda x: get_vector(x.split(), word2vec_model))

In [21]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train.to_list(), y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test.to_list())
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1

logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, X_train_word2vec, y_train, X_test_word2vec, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, X_train_word2vec, y_train, X_test_word2vec, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, X_train_word2vec, y_train, X_test_word2vec, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, X_train_word2vec, y_train, X_test_word2vec, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression F1 Score: 0.39644970414201186
SVM F1 Score: 0.49844236760124616
Random Forest F1 Score: 0.5970149253731343
XGBoost F1 Score: 0.7068062827225131


Ngrams Analysis

In [22]:
# split train into fraud and non-fraud
X_train_nonfraud = X_train.loc[y_train==0]
X_train_fraud = X_train.loc[y_train==1]

In [23]:
# define function to return ngrams sorted by frequency
def get_ngrams(ngram, corpus):
    vec = CountVectorizer(ngram_range=(ngram, ngram)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = {}
    for word, idx in vec.vocabulary_.items():
        words_freq[word] = sum_words[0, idx]
    words_freq = dict(sorted(words_freq.items(), key=lambda item: item[1], reverse=True))
    return words_freq

In [24]:
nonfraud_unigram = get_ngrams(1, X_train_nonfraud['full_text'])
fraud_unigram = get_ngrams(1, X_train_fraud['full_text'])
nonfraud_unigram_top5 = [(k,v) for k,v in nonfraud_unigram.items() if k not in fraud_unigram.keys()][:5]
fraud_unigram_top5 = [(k,v) for k,v in fraud_unigram.items() if k not in nonfraud_unigram.keys()][:5]

nonfraud_num_unigram, fraud_num_unigram = sum(nonfraud_unigram.values()), sum(fraud_unigram.values())
diff_unigram = [(k, nonfraud_unigram[k]*100/nonfraud_num_unigram, fraud_unigram[k]*100/fraud_num_unigram, 
                 abs((nonfraud_unigram[k]*100/nonfraud_num_unigram)-(fraud_unigram[k]*100/fraud_num_unigram))) for k in nonfraud_unigram.keys() if k in fraud_unigram.keys()]
diff_unigram = sorted(diff_unigram, key=lambda x: x[3], reverse=True)

print(nonfraud_unigram_top5)
print(fraud_unigram_top5)
print(diff_unigram[:5])

[('php', 955), ('tidewater', 950), ('athens', 913), ('european', 808), ('1500', 727)]
[('aker', 177), ('0fa3f7c5e23a16de16a841e368006cae916884407d90b154dfef3976483a71ae', 60), ('accion', 53), ('novation', 38), ('ddb080358fa5eecf5a67c649cfb4ffc343c484389f1bbaf2a1cb071e3f2b6e7e', 36)]
[('team', 0.9025851637746181, 0.5926581845862268, 0.3099269791883913), ('engineering', 0.16009264936303563, 0.45206692454676795, 0.2919742751837323), ('position', 0.2662371577147788, 0.5444998524180144, 0.27826269470323556), ('skill', 0.5543066058087517, 0.7946124807755045, 0.24030587496675282), ('marketing', 0.3309554627764315, 0.10874462102499573, 0.2222108417514358)]


In [25]:
nonfraud_bigram = get_ngrams(2, X_train_nonfraud['full_text'])
fraud_bigram = get_ngrams(2,  X_train_fraud['full_text'])
nonfraud_bigram_top5 = [(k,v) for k,v in nonfraud_bigram.items() if k not in fraud_bigram.keys()][:5]
fraud_bigram_top5 = [(k,v) for k,v in fraud_bigram.items() if k not in nonfraud_bigram.keys()][:5]

nonfraud_num_bigram, fraud_num_bigram = sum(nonfraud_bigram.values()), sum(fraud_bigram.values())
diff_bigram = [(k, nonfraud_bigram[k]*100/nonfraud_num_bigram, fraud_bigram[k]*100/fraud_num_bigram, 
                abs((nonfraud_bigram[k]*100/nonfraud_num_bigram)-(fraud_bigram[k]*100/fraud_num_bigram))) for k in nonfraud_bigram.keys() if k in fraud_bigram.keys()]
diff_bigram = sorted(diff_bigram, key=lambda x: x[3], reverse=True)

print(nonfraud_bigram_top5)
print(fraud_bigram_top5)
print(diff_bigram[:5])

[('increase productivity', 800), ('university degree', 795), ('document communication', 788), ('degree required', 696), ('medium large', 612)]
[('aker solution', 172), ('aptitude staffing', 88), ('bring discovery', 60), ('production maximize', 60), ('maximize recovery', 60)]
[('data entry', 0.008304260280918472, 0.1904717297799427, 0.18216746949902424), ('oil gas', 0.012244713198530764, 0.13504757890135283, 0.12280286570282206), ('work home', 0.007457551389530705, 0.09523586488997135, 0.08777831350044064), ('gas industry', 0.005861830786530685, 0.09211337751652966, 0.08625154672999898), ('signing bonus', 9.769717977551142e-05, 0.07806218433604209, 0.07796448715626658)]


In [26]:
nonfraud_trigram = get_ngrams(3,  X_train_nonfraud['full_text'])
fraud_trigram = get_ngrams(3,  X_train_fraud['full_text'])
nonfraud_trigram_top5 = [(k,v) for k,v in nonfraud_trigram.items() if k not in fraud_trigram.keys()][:5]
fraud_trigram_top5 = [(k,v) for k,v in fraud_trigram.items() if k not in nonfraud_trigram.keys()][:5]

nonfraud_num_trigram, fraud_num_trigram = sum(nonfraud_trigram.values()), sum(fraud_trigram.values())
diff_trigram = [(k, nonfraud_trigram[k]*100/nonfraud_num_trigram, fraud_trigram[k]*100/fraud_num_trigram, 
                 abs((nonfraud_trigram[k]*100/nonfraud_num_trigram)-(fraud_trigram[k]*100/fraud_num_trigram))) for k in nonfraud_trigram.keys() if k in fraud_trigram.keys()]
diff_trigram = sorted(diff_trigram, key=lambda x: x[3], reverse=True)

print(nonfraud_trigram_top5)
print(fraud_trigram_top5)
print(diff_trigram[:5])

[('full time permanent', 590), ('time permanent position', 566), ('permanent position many', 553), ('position many medium', 553), ('many medium large', 553)]
[('gas industry engineering', 62), ('28 000 people', 61), ('aker solution global', 60), ('solution global provider', 60), ('global provider product', 60)]
[('oil gas industry', 0.005884591393588934, 0.09257515847611875, 0.08669056708252981), ('product system service', 6.538434881765482e-05, 0.047072114479382414, 0.04700673013056476), ('approximately 28 000', 6.538434881765482e-05, 0.047072114479382414, 0.04700673013056476), ('service oil gas', 0.00019615304645296446, 0.047072114479382414, 0.04687596143292945), ('usa tx houston', 0.004184598324329909, 0.05099479068599762, 0.04681019236166771)]


CountVectorizer - Unigram

In [27]:
count_vectorizer = CountVectorizer()
X_train_cv = count_vectorizer.fit_transform(X_train['full_text'])
X_test_cv = count_vectorizer.transform(X_test['full_text'])

In [28]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1

logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, X_train_cv, y_train, X_test_cv, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, X_train_cv, y_train, X_test_cv, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, X_train_cv, y_train, X_test_cv, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, X_train_cv, y_train, X_test_cv, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression F1 Score: 0.8009153318077803
SVM F1 Score: 0.7272727272727272
Random Forest F1 Score: 0.7905759162303664
XGBoost F1 Score: 0.8382352941176471


CountVectorizer - Bigram

In [13]:
count_vectorizer = CountVectorizer(ngram_range=(2,2))
X_train_bicv = count_vectorizer.fit_transform(X_train['full_text'])
X_test_bicv = count_vectorizer.transform(X_test['full_text'])

In [14]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1

logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, X_train_bicv, y_train, X_test_bicv, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, X_train_bicv, y_train, X_test_bicv, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, X_train_bicv, y_train, X_test_bicv, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, X_train_bicv, y_train, X_test_bicv, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

Logistic Regression F1 Score: 0.8320802005012532
SVM F1 Score: 0.7391304347826088
Random Forest F1 Score: 0.8112244897959183
XGBoost F1 Score: 0.7959183673469389


<h3>Using text features only</h3>

|  | Logistic Regression | SVM | Random Forest | XGBoost | Average |
|----------|----------|----------|----------|----------|----------|
| TFIDF | 0.625 | 0.815 | 0.790 | 0.823 | 0.763 |
| Word2Vec | 0.385 | 0.497 | 0.568 | 0.686 | 0.534 |
| CountVectorizer - Unigram | 0.801 | 0.727 | 0.791 | 0.838 | 0.789 |
| CountVectorizer - Bigram | 0.832 | 0.739 | 0.817 | 0.796 | 0.796 |

Based on the results above, the best word embedding method to use is CountVectorizer - Bigram that obtained the highest average F1 score of 0.796 across all models. Hence, we will be using CountVectorizer - Bigram moving forward.

 Long Short Term Memory (LSTM)

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional
# from tensorflow.keras.layers import Dropout
# from tensorflow.keras.preprocessing.text import one_hot

2023-11-23 11:26:54.813638: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
voc_size=10000
corpus = data["full_text"]
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
# onehot_repr[1]
sent_length=50
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
# print(embedded_docs)
embedding_vector_features=50
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Bidirectional(LSTM(100))) 
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(embedded_docs, data["fraudulent"], test_size=0.3, random_state=0)
model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=32, validation_data=(X_test_lstm, y_test_lstm))

[1891,
 2515,
 2552,
 4435,
 1068,
 9489,
 5111,
 1215,
 6958,
 8934,
 7709,
 2552,
 4435,
 1068,
 2515,
 6958,
 8934,
 7709,
 2552,
 4435,
 1068,
 2515,
 3866,
 6250,
 3559,
 8786,
 4364,
 5708,
 5569,
 4435,
 4993,
 4672,
 4437,
 8096,
 7709,
 6958,
 8934,
 2152,
 4435,
 1068,
 9063,
 4103,
 3359,
 7822,
 2552,
 4267,
 4360,
 7102,
 6958,
 9609,
 3947,
 6958,
 8934,
 9261,
 9255,
 1872,
 2307,
 9531,
 4751,
 9317,
 7664,
 4435,
 1068,
 6244,
 9318,
 8556,
 5286,
 4435,
 7225,
 9837,
 5569,
 405,
 5512,
 74,
 7864,
 4512,
 4341,
 7098,
 4435,
 7436,
 1693,
 6288,
 3359,
 64,
 1068,
 1215,
 1566,
 9620,
 6288,
 6958,
 8934,
 8918,
 6169,
 1215,
 3061,
 6958,
 8934,
 4437,
 3326,
 6389,
 4341,
 4435,
 8978,
 6288,
 4690,
 74,
 6250,
 3005,
 7709,
 4175,
 3005,
 5615,
 2579,
 9802,
 6418,
 1200,
 3225,
 5111,
 9147,
 1784,
 4529,
 9872,
 7102,
 6958,
 9609,
 3947,
 7102,
 6958,
 9609,
 5708,
 7102,
 6958,
 9609,
 6073,
 185,
 9592,
 3497,
 4318,
 4129,
 1891,
 2515,
 1093,
 3805,
 9101,


In [None]:
# Evaluate the model on the test set
# loss, accuracy = model.evaluate(X_test_lstm, y_test_lstm)

# print(f"Test Accuracy: {accuracy}")
# print(f"Test Loss: {loss}")

In [None]:
# from sklearn.metrics import classification_report
# Get classification report
# report = classification_report(y_test_lstm, y_pred.round(),target_names = ['0','1'])
# print("Classification Report:")
# print(report)

In [None]:
y_pred = model.predict(X_test_lstm)
y_pred_binary = (y_pred > 0.5).astype('int32')  
f1_test = f1_score(y_test_lstm, y_pred_binary )
print(f'LSTM F1 score: {f1_test}')

LSTM F1 score: 0.7350000000000001


<h1>Combined text and numeric</h1>

Bigram and numeric

In [15]:
# from scipy.sparse import hstack
numeric_features = X_train_full[['has_questions', 'employment_type', 'required_experience', 'required_education', 'missing company information']]
combined_features = hstack([
    StandardScaler().fit_transform(numeric_features),
    X_train_bicv])
X_test_numeric_features = X_test_full[['has_questions', 'employment_type', 'required_experience', 'required_education', 'missing company information']]
X_test_combined_features = hstack([
    StandardScaler().fit_transform(X_test_numeric_features),
    X_test_bicv])

In [17]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1
    
logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, combined_features, y_train, X_test_combined_features, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, combined_features, y_train, X_test_combined_features, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, combined_features, y_train, X_test_combined_features, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, combined_features, y_train, X_test_combined_features, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

Logistic Regression F1 Score: 0.8395061728395063
SVM F1 Score: 0.745945945945946
Random Forest F1 Score: 0.8214285714285714
XGBoost F1 Score: 0.7990074441687344


LSTM

In [None]:
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder

numerical_data = data[['has_questions', 'employment_type', 'required_experience', 'required_education', 'missing company information']].values

# Define text input
text_input = Input(shape=(sent_length,))
embedding_vector_features = 50
text_embedding = Embedding(voc_size, embedding_vector_features, input_length=sent_length)(text_input)
text_lstm = Bidirectional(LSTM(100))(text_embedding)
text_dropout = Dropout(0.3)(text_lstm)

# Define numerical input
numerical_input = Input(shape=(numerical_data.shape[1],))

# Concatenate text and numerical inputs
concatenated = Concatenate()([text_dropout, numerical_input])

# Dense layers for the merged inputs
dense_layer = Dense(64, activation='relu')(concatenated)
output_layer = Dense(1, activation='sigmoid')(dense_layer)

# Create and compile the model
model = Model(inputs=[text_input, numerical_input], outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#embedded_docs = np.array(embedded_docs)
#numerical_data = np.array(numerical_data)
labels = np.array(data["fraudulent"])

print(model.summary())

text_train, text_test, num_train, num_test, labels_train, labels_test = train_test_split(
    embedded_docs, numerical_data, labels, test_size=0.3, random_state=0
)

# Train the model using both text and numerical data
model.fit([text_train, num_train], labels_train, epochs=10, batch_size=32, validation_data=([text_test, num_test], labels_test))


Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_31 (InputLayer)          [(None, 50)]         0           []                               
                                                                                                  
 embedding_16 (Embedding)       (None, 50, 50)       500000      ['input_31[0][0]']               
                                                                                                  
 bidirectional_16 (Bidirectiona  (None, 200)         120800      ['embedding_16[0][0]']           
 l)                                                                                               
                                                                                                  
 dropout_16 (Dropout)           (None, 200)          0           ['bidirectional_16[0][0]']

<keras.callbacks.History at 0x7ffdab129690>

In [None]:
# Make predictions on the test data
test_predictions = model.predict([text_test, num_test])
y_pred_binary = (test_predictions > 0.5).astype('int32') 
f1_test = f1_score(y_test_lstm, y_pred_binary )
print(f'LSTM F1 score: {f1_test}')



<h3>Combined features</h3>

|  | Logistic Regression | SVM | Random Forest | XGBoost | LSTM |
|----------|----------|----------|----------|----------|----------|
| Text Only | 0.832 | 0.739 | 0.817 | 0.796 | 0.735 |
| Text and Numeric | 0.840 | 0.746 | 0.821 | 0.799 | 0.744 |