In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder

In [54]:
data = pd.read_csv('preprocessed_data_with_num.csv')

In [55]:
# replace NaN with empty strings
data.fillna('', inplace=True)

# combine all text
data['full_text'] = data['title'] + " " + data['location']  + " " + data['department']  + " " + data['company_profile']  + " " + data['description']  + " " + data['requirements']  + " "  + data['benefits'] + data['industry']  + " " + data['function']

<h1>Categorical Only Models</h1>

 Logistic Regression | SVM | Random Forest | XGBoost | Average |
|----------|----------|----------|----------|----------|
|  |  |  |  |  |



<h1>Text Only Models</h1>

In [56]:
X_train_full, X_test_full,y_train , y_test = train_test_split(data.drop('fraudulent', axis=1), data["fraudulent"], test_size=0.3, random_state=0)
X_train = X_train_full[['full_text', 'has_questions', 'has_company_logo', 'employment_type', 'required_experience', 'required_education']]
X_test = X_test_full[['full_text', 'has_questions', 'has_company_logo', 'employment_type', 'required_experience', 'required_education']]

TFIDF Vectorizer

In [57]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['full_text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['full_text'])

In [58]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1

logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

Logistic Regression F1 Score: 0.552278820375335
SVM F1 Score: 0.7129186602870813
Random Forest F1 Score: 0.7476635514018691
XGBoost F1 Score: 0.7849223946784921


Word2Vec

In [59]:
# Tokenize the text
tokenized_text = X_train['full_text'].apply(lambda x: x.split())

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Convert words to vectors
def get_vector(word_list, model):
    valid_words = [word for word in word_list if word in model.wv]
    if not valid_words:
        # If no valid words, return a vector of zeros or handle as needed
        return np.zeros(model.vector_size)
    return np.mean([model.wv[word] for word in valid_words], axis=0)

X_train_word2vec = tokenized_text.apply(lambda x: get_vector(x, word2vec_model))
X_test_word2vec = X_test['full_text'].apply(lambda x: get_vector(x.split(), word2vec_model))

In [66]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train.to_list(), y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test.to_list())
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1

logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, X_train_word2vec, y_train, X_test_word2vec, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, X_train_word2vec, y_train, X_test_word2vec, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, X_train_word2vec, y_train, X_test_word2vec, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, X_train_word2vec, y_train, X_test_word2vec, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression F1 Score: 0.3691460055096419
SVM F1 Score: 0.4550561797752809
Random Forest F1 Score: 0.526027397260274
XGBoost F1 Score: 0.615


Ngrams Analysis

In [67]:
# split train into fraud and non-fraud
X_train_nonfraud = X_train.loc[y_train==0]
X_train_fraud = X_train.loc[y_train==1]

In [68]:
# define function to return ngrams sorted by frequency
def get_ngrams(ngram, corpus):
    vec = CountVectorizer(ngram_range=(ngram, ngram)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = {}
    for word, idx in vec.vocabulary_.items():
        words_freq[word] = sum_words[0, idx]
    words_freq = dict(sorted(words_freq.items(), key=lambda item: item[1], reverse=True))
    return words_freq

In [69]:
nonfraud_unigram = get_ngrams(1, X_train_nonfraud['full_text'])
fraud_unigram = get_ngrams(1, X_train_fraud['full_text'])
nonfraud_unigram_top5 = [(k,v) for k,v in nonfraud_unigram.items() if k not in fraud_unigram.keys()][:5]
fraud_unigram_top5 = [(k,v) for k,v in fraud_unigram.items() if k not in nonfraud_unigram.keys()][:5]

nonfraud_num_unigram, fraud_num_unigram = sum(nonfraud_unigram.values()), sum(fraud_unigram.values())
diff_unigram = [(k, nonfraud_unigram[k]*100/nonfraud_num_unigram, fraud_unigram[k]*100/fraud_num_unigram, 
                 abs((nonfraud_unigram[k]*100/nonfraud_num_unigram)-(fraud_unigram[k]*100/fraud_num_unigram))) for k in nonfraud_unigram.keys() if k in fraud_unigram.keys()]
diff_unigram = sorted(diff_unigram, key=lambda x: x[3], reverse=True)

print(nonfraud_unigram_top5)
print(fraud_unigram_top5)
print(diff_unigram[:5])

[('awesome', 1362), ('abroad', 1289), ('athens', 921), ('european', 865), ('berlin', 785)]
[('aker', 152), ('accion', 58), ('0fa3f7c5e23a16de16a841e368006cae916884407d90b154dfef3976483a71ae', 53), ('anyperk', 40), ('novation', 40)]
[('engineering', 0.16199194664859687, 0.4842268371145133, 0.32223489046591647), ('team', 0.8978269988982973, 0.5835986054267092, 0.31422839347158815), ('position', 0.26084672032725065, 0.5297020531217894, 0.26885533279453877), ('marketing', 0.32690296259294693, 0.10189816920148889, 0.22500479339145804), ('work', 0.884937176053005, 1.0998265204722686, 0.21488934441926355)]


In [70]:
nonfraud_bigram = get_ngrams(2, X_train_nonfraud['full_text'])
fraud_bigram = get_ngrams(2,  X_train_fraud['full_text'])
nonfraud_bigram_top5 = [(k,v) for k,v in nonfraud_bigram.items() if k not in fraud_bigram.keys()][:5]
fraud_bigram_top5 = [(k,v) for k,v in fraud_bigram.items() if k not in nonfraud_bigram.keys()][:5]

nonfraud_num_bigram, fraud_num_bigram = sum(nonfraud_bigram.values()), sum(fraud_bigram.values())
diff_bigram = [(k, nonfraud_bigram[k]*100/nonfraud_num_bigram, fraud_bigram[k]*100/fraud_num_bigram, 
                abs((nonfraud_bigram[k]*100/nonfraud_num_bigram)-(fraud_bigram[k]*100/fraud_num_bigram))) for k in nonfraud_bigram.keys() if k in fraud_bigram.keys()]
diff_bigram = sorted(diff_bigram, key=lambda x: x[3], reverse=True)

print(nonfraud_bigram_top5)
print(fraud_bigram_top5)
print(diff_bigram[:5])

[('university degree', 820), ('increase productivity', 783), ('document communication', 773), ('relevant job', 709), ('digital marketing', 617)]
[('aker solution', 148), ('aptitude staffing', 76), ('bring discovery', 53), ('production maximize', 53), ('maximize recovery', 53)]
[('data entry', 0.008330150535368569, 0.1887335387115339, 0.18040338817616533), ('oil gas', 0.011754402138840232, 0.12610447205389486, 0.11435006991505463), ('customer service', 0.15939232704236853, 0.2657503639256576, 0.10635803688328907), ('work home', 0.006914354199317784, 0.09648261620230882, 0.08956826200299103), ('gas industry', 0.005498557863267, 0.08717289007752463, 0.08167433221425763)]


In [71]:
nonfraud_trigram = get_ngrams(3,  X_train_nonfraud['full_text'])
fraud_trigram = get_ngrams(3,  X_train_fraud['full_text'])
nonfraud_trigram_top5 = [(k,v) for k,v in nonfraud_trigram.items() if k not in fraud_trigram.keys()][:5]
fraud_trigram_top5 = [(k,v) for k,v in fraud_trigram.items() if k not in nonfraud_trigram.keys()][:5]

nonfraud_num_trigram, fraud_num_trigram = sum(nonfraud_trigram.values()), sum(fraud_trigram.values())
diff_trigram = [(k, nonfraud_trigram[k]*100/nonfraud_num_trigram, fraud_trigram[k]*100/fraud_num_trigram, 
                 abs((nonfraud_trigram[k]*100/nonfraud_num_trigram)-(fraud_trigram[k]*100/fraud_num_trigram))) for k in nonfraud_trigram.keys() if k in fraud_trigram.keys()]
diff_trigram = sorted(diff_trigram, key=lambda x: x[3], reverse=True)

print(nonfraud_trigram_top5)
print(fraud_trigram_top5)
print(diff_trigram[:5])

[('full time permanent', 587), ('time permanent position', 563), ('permanent position many', 550), ('position many medium', 550), ('many medium large', 550)]
[('gas industry engineering', 55), ('28 000 people', 55), ('aker solution global', 53), ('solution global provider', 53), ('global provider product', 53)]
[('oil gas industry', 0.00551992577517772, 0.0876103635404794, 0.08209043776530167), ('usa tx houston', 0.0038341999396444043, 0.05103516322746372, 0.047200963287819316), ('product system service', 3.305344775555521e-05, 0.04508106085092629, 0.045048007403170734), ('approximately 28 000', 6.610689551111042e-05, 0.04508106085092629, 0.045014953955415174), ('service oil gas', 0.0003635879253111073, 0.04508106085092629, 0.04471747292561518)]


CountVectorizer - Unigram

In [72]:
count_vectorizer = CountVectorizer()
X_train_cv = count_vectorizer.fit_transform(X_train['full_text'])
X_test_cv = count_vectorizer.transform(X_test['full_text'])

In [73]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1

logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, X_train_cv, y_train, X_test_cv, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, X_train_cv, y_train, X_test_cv, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, X_train_cv, y_train, X_test_cv, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, X_train_cv, y_train, X_test_cv, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression F1 Score: 0.7975206611570248
SVM F1 Score: 0.6157760814249365
Random Forest F1 Score: 0.7323943661971831
XGBoost F1 Score: 0.810344827586207


CountVectorizer - Bigram

In [74]:
count_vectorizer = CountVectorizer(ngram_range=(2,2))
X_train_bicv = count_vectorizer.fit_transform(X_train['full_text'])
X_test_bicv = count_vectorizer.transform(X_test['full_text'])

In [75]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1

logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, X_train_bicv, y_train, X_test_bicv, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, X_train_bicv, y_train, X_test_bicv, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, X_train_bicv, y_train, X_test_bicv, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, X_train_bicv, y_train, X_test_bicv, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

Logistic Regression F1 Score: 0.7937915742793792
SVM F1 Score: 0.671604938271605
Random Forest F1 Score: 0.7775280898876403
XGBoost F1 Score: 0.8008752735229759


<h3>Using text features only</h3>

|  | Logistic Regression | SVM | Random Forest | XGBoost | Average |
|----------|----------|----------|----------|----------|----------|
| TFIDF | 0.552 | 0.713 | 0.748 | 0.785 | 0.700 |
| Word2Vec | 0.369 | 0.455 | 0.526 | 0.615 | 0.491 |
| CountVectorizer - Unigram | 0.798 | 0.616 | 0.732 | 0.810 | 0.739 |
| CountVectorizer - Bigram | 0.794 | 0.672 | 0.778 | 0.801 | 0.761 |

Based on the results above, the best word embedding method to use is CountVectorizer - Bigram that obtained the highest average F1 score of 0.761 across all models. Hence, we will be using CountVectorizer - Bigram moving forward.

 Long Short Term Memory (LSTM)

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional
# from tensorflow.keras.layers import Dropout
# from tensorflow.keras.preprocessing.text import one_hot

2023-11-23 11:26:54.813638: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
voc_size=10000
corpus = data["full_text"]
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
# onehot_repr[1]
sent_length=50
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
# print(embedded_docs)
embedding_vector_features=50
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Bidirectional(LSTM(100))) 
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(embedded_docs, data["fraudulent"], test_size=0.3, random_state=0)
model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=32, validation_data=(X_test_lstm, y_test_lstm))

[1891,
 2515,
 2552,
 4435,
 1068,
 9489,
 5111,
 1215,
 6958,
 8934,
 7709,
 2552,
 4435,
 1068,
 2515,
 6958,
 8934,
 7709,
 2552,
 4435,
 1068,
 2515,
 3866,
 6250,
 3559,
 8786,
 4364,
 5708,
 5569,
 4435,
 4993,
 4672,
 4437,
 8096,
 7709,
 6958,
 8934,
 2152,
 4435,
 1068,
 9063,
 4103,
 3359,
 7822,
 2552,
 4267,
 4360,
 7102,
 6958,
 9609,
 3947,
 6958,
 8934,
 9261,
 9255,
 1872,
 2307,
 9531,
 4751,
 9317,
 7664,
 4435,
 1068,
 6244,
 9318,
 8556,
 5286,
 4435,
 7225,
 9837,
 5569,
 405,
 5512,
 74,
 7864,
 4512,
 4341,
 7098,
 4435,
 7436,
 1693,
 6288,
 3359,
 64,
 1068,
 1215,
 1566,
 9620,
 6288,
 6958,
 8934,
 8918,
 6169,
 1215,
 3061,
 6958,
 8934,
 4437,
 3326,
 6389,
 4341,
 4435,
 8978,
 6288,
 4690,
 74,
 6250,
 3005,
 7709,
 4175,
 3005,
 5615,
 2579,
 9802,
 6418,
 1200,
 3225,
 5111,
 9147,
 1784,
 4529,
 9872,
 7102,
 6958,
 9609,
 3947,
 7102,
 6958,
 9609,
 5708,
 7102,
 6958,
 9609,
 6073,
 185,
 9592,
 3497,
 4318,
 4129,
 1891,
 2515,
 1093,
 3805,
 9101,


In [None]:
# Evaluate the model on the test set
# loss, accuracy = model.evaluate(X_test_lstm, y_test_lstm)

# print(f"Test Accuracy: {accuracy}")
# print(f"Test Loss: {loss}")

In [None]:
# from sklearn.metrics import classification_report
# Get classification report
# report = classification_report(y_test_lstm, y_pred.round(),target_names = ['0','1'])
# print("Classification Report:")
# print(report)

In [None]:
y_pred = model.predict(X_test_lstm)
y_pred_binary = (y_pred > 0.5).astype('int32')  
f1_test = f1_score(y_test_lstm, y_pred_binary )
print(f'LSTM F1 score: {f1_test}')

LSTM F1 score: 0.7350000000000001


<h1>Combined text and numeric</h1>

Bigram and numeric

In [76]:
numeric_features = X_train_full[['has_questions', 'has_company_logo', 'employment_type', 'required_experience', 'required_education']]
combined_features = hstack([
    StandardScaler().fit_transform(numeric_features),
    X_train_bicv])
X_test_numeric_features = X_test_full[['has_questions', 'has_company_logo', 'employment_type', 'required_experience', 'required_education']]
X_test_combined_features = hstack([
    StandardScaler().fit_transform(X_test_numeric_features),
    X_test_bicv])

In [79]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate F1 score
    f1 = f1_score(y_test, y_pred)
    
    return f1
    
logreg_model = LogisticRegression()
logreg_f1 = train_and_evaluate_model(logreg_model, combined_features, y_train, X_test_combined_features, y_test)
print(f"Logistic Regression F1 Score: {logreg_f1}")

svm_model = SVC()
svm_f1 = train_and_evaluate_model(svm_model, combined_features, y_train, X_test_combined_features, y_test)
print(f"SVM F1 Score: {svm_f1}")

rf_model = RandomForestClassifier()
rf_f1 = train_and_evaluate_model(rf_model, combined_features, y_train, X_test_combined_features, y_test)
print(f"Random Forest F1 Score: {rf_f1}")

xgb_model = XGBClassifier()
xgb_f1 = train_and_evaluate_model(xgb_model, combined_features, y_train, X_test_combined_features, y_test)
print(f"XGBoost F1 Score: {xgb_f1}")

Logistic Regression F1 Score: 0.8026315789473685
SVM F1 Score: 0.6941747572815535
Random Forest F1 Score: 0.7681818181818182
XGBoost F1 Score: 0.8197424892703862


LSTM

In [None]:
# from tensorflow.keras.layers import Input, Concatenate
# from tensorflow.keras.models import Model
# from sklearn.preprocessing import LabelEncoder

numerical_data = data[['has_questions', 'has_company_logo', 'employment_type', 'required_experience', 'required_education']].values

# Define text input
text_input = Input(shape=(sent_length,))
embedding_vector_features = 50
text_embedding = Embedding(voc_size, embedding_vector_features, input_length=sent_length)(text_input)
text_lstm = Bidirectional(LSTM(100))(text_embedding)
text_dropout = Dropout(0.3)(text_lstm)

# Define numerical input
numerical_input = Input(shape=(numerical_data.shape[1],))

# Concatenate text and numerical inputs
concatenated = Concatenate()([text_dropout, numerical_input])

# Dense layers for the merged inputs
dense_layer = Dense(64, activation='relu')(concatenated)
output_layer = Dense(1, activation='sigmoid')(dense_layer)

# Create and compile the model
model = Model(inputs=[text_input, numerical_input], outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#embedded_docs = np.array(embedded_docs)
#numerical_data = np.array(numerical_data)
labels = np.array(data["fraudulent"])

print(model.summary())

text_train, text_test, num_train, num_test, labels_train, labels_test = train_test_split(
    embedded_docs, numerical_data, labels, test_size=0.3, random_state=0
)

# Train the model using both text and numerical data
model.fit([text_train, num_train], labels_train, epochs=10, batch_size=32, validation_data=([text_test, num_test], labels_test))


Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_31 (InputLayer)          [(None, 50)]         0           []                               
                                                                                                  
 embedding_16 (Embedding)       (None, 50, 50)       500000      ['input_31[0][0]']               
                                                                                                  
 bidirectional_16 (Bidirectiona  (None, 200)         120800      ['embedding_16[0][0]']           
 l)                                                                                               
                                                                                                  
 dropout_16 (Dropout)           (None, 200)          0           ['bidirectional_16[0][0]']

<keras.callbacks.History at 0x7ffdab129690>

In [None]:
# Make predictions on the test data
test_predictions = model.predict([text_test, num_test])
y_pred_binary = (test_predictions > 0.5).astype('int32') 
f1_test = f1_score(y_test_lstm, y_pred_binary )
print(f'LSTM F1 score: {f1_test}')



<h3>Combined features</h3>

|  | Logistic Regression | SVM | Random Forest | XGBoost | LSTM |
|----------|----------|----------|----------|----------|----------|
| Text Only | 0.794 | 0.672 | 0.778 | 0.801 | 0.735 |
| Text and Numeric | 0.803 | 0.694 | 0.768 | 0.820 | 0.744 |