# RNN

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input, Concatenate, Reshape, BatchNormalization, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange





In [4]:
with open("C:/Yee Ann/NUS/DSA4266/TruthSeeker2023/Features_For_Traditional_ML_Techniques.csv") as file:
    df = pd.read_csv(file)

In [5]:
#remove columns with s single unique value
df = df.drop(columns = ['Unnamed: 0'])
df = df.loc[:, df.nunique() > 1]
df = df.drop_duplicates()

# Convert boolean columns to 0 and 1
df['majority_target'] = df['majority_target'].astype(int)

Basic Tokenizer

In [None]:
# Assuming df is your DataFrame
# Combine the text fields (statement, tweet) into one
df['text_combined'] = df['statement'] + ' ' + df['tweet']

# Tokenize the combined text field
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text_combined'])

# Convert text to sequences
X_text = tokenizer.texts_to_sequences(df['text_combined'])
word_index = tokenizer.word_index

In [None]:
# Pad sequences to ensure they are the same length
max_length = 100  # Set max length of sequences
X_text = pad_sequences(X_text, maxlen=max_length, padding='post', truncating='post')

# Separate the numerical features
numerical_features = df.select_dtypes(['int64', 'int32', 'float64']).columns.tolist()
numerical_features.remove('majority_target')
X_num = df[numerical_features].values

# Scale the numerical features
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

In [None]:
# Combine text sequences and numerical features into one dataset
X_combined = np.hstack((X_text, X_num_scaled))

# Define the target variable
y = df['majority_target'].values

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.3, random_state=42)

# Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=128, input_length=max_length))  # Embedding layer for text
model.add(LSTM(128, return_sequences=True))  # LSTM layer
model.add(Dropout(0.5))  # Dropout for regularization
model.add(LSTM(64, return_sequences=False))  # Another LSTM layer
model.add(Dropout(0.5))  # More regularization
model.add(Dense(32, activation='relu'))  # Dense layer for additional complexity
model.add(Dropout(0.5))  # Additional Dropout
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

In [None]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Predict on test data
y_pred = model.predict(X_test)

In [None]:
y_pred_use = (y_pred > 0.5).astype(int)
# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred_use)
print(f"RNN Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_use, target_names=['Fake', 'Real']))

# Confusion Matrix to understand predictions
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_use))

Sentence Embeddings

In [6]:
# Load pre-trained sentence transformer model
model_sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Generate sentence embeddings for the 'statement' column
X_statement_embeddings = model_sbert.encode(df['statement'].tolist(), convert_to_numpy=True)

# Generate sentence embeddings for the 'tweet' column
X_tweet_embeddings = model_sbert.encode(df['tweet'].tolist(), convert_to_numpy=True)

numerical_features = df.select_dtypes(['int64', 'int32', 'float64']).columns.tolist()
numerical_features.remove('majority_target')
X_num = df[numerical_features].values

# Scale the numerical features
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

In [None]:
# Define inputs for statement, tweet, and numerical features
input_statement = Input(shape=(X_statement_embeddings.shape[1],))
input_tweet = Input(shape=(X_tweet_embeddings.shape[1],))
input_numerical = Input(shape=(X_num_scaled.shape[1],))

# Reshape to 3D for LSTM (adding a dimension for timesteps)
reshaped_statement = Reshape((1, X_statement_embeddings.shape[1]))(input_statement)  # Shape: (batch_size, 1, 384)
reshaped_tweet = Reshape((1, X_tweet_embeddings.shape[1]))(input_tweet)  # Shape: (batch_size, 1, 384)

# LSTM for statement embeddings
lstm_statement = LSTM(128, return_sequences=False)(reshaped_statement)

# LSTM for tweet embeddings
lstm_tweet = LSTM(128, return_sequences=False)(reshaped_tweet)

# Combine LSTM outputs and numerical features
combined = Concatenate()([lstm_statement, lstm_tweet, input_numerical])

Bidirection RNN (run this instead of the box on top)

In [None]:
# Define inputs for statement, tweet, and numerical features
input_statement = Input(shape=(X_statement_embeddings.shape[1],))
input_tweet = Input(shape=(X_tweet_embeddings.shape[1],))
input_numerical = Input(shape=(X_num_scaled.shape[1],))

# Reshape to 3D for LSTM (adding a dimension for timesteps)
reshaped_statement = Reshape((1, X_statement_embeddings.shape[1]))(input_statement)  # Shape: (batch_size, 1, 384)
reshaped_tweet = Reshape((1, X_tweet_embeddings.shape[1]))(input_tweet)  # Shape: (batch_size, 1, 384)

# Bidirectional LSTM for statement embeddings
bidirectional_lstm_statement = Bidirectional(LSTM(128, return_sequences=False))(reshaped_statement)

# Bidirectional LSTM for tweet embeddings
bidirectional_lstm_tweet = Bidirectional(LSTM(128, return_sequences=False))(reshaped_tweet)

# Combine LSTM outputs and numerical features
combined = Concatenate()([bidirectional_lstm_statement, bidirectional_lstm_tweet, input_numerical])

Continue with model building

In [None]:
# Add dense layers
# x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(combined)
x = Dense(64, activation='relu')(combined)
x = Dropout(0.5)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.5)(x)
# x = BatchNormalization()(x)
output = Dense(1, activation='sigmoid')(x)

# Define the model
model = Model(inputs=[input_statement, input_tweet, input_numerical], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Split data into training and test sets
X_train_statement, X_test_statement, X_train_tweet, X_test_tweet, X_train_num, X_test_num, y_train, y_test = train_test_split(
    X_statement_embeddings, X_tweet_embeddings, X_num_scaled, df['majority_target'], test_size=0.3, random_state=42)

# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)
# early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Train the model
# history = model.fit([X_train_statement, X_train_tweet, X_train_num], y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping, reduce_lr])
history = model.fit([X_train_statement, X_train_tweet, X_train_num], y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate([X_test_statement, X_test_tweet, X_test_num], y_test)
print(f'Test Accuracy: {accuracy}')

# Predict on test data
y_pred = model.predict([X_test_statement, X_test_tweet, X_test_num])

In [None]:
y_pred_use = (y_pred > 0.5).astype(int)
# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred_use)
print(f"RNN Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_use, target_names=['Fake', 'Real']))

# Confusion Matrix to understand predictions
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_use))

Summary of results

In [None]:
# basic tokenizer gives 0.9560
# the one with regularizer gives 0.9571 accuracy
# with only dropout is 0.9576
# with BatchNormalization is 0.9576
# with reduce learning rate and early stopping is also 0.9576
# Bi-directional RNN is 0.9576

In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
import numpy as np

# Define number of folds for cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize lists to store metrics for each fold
accuracy_list, precision_list, recall_list, f1_list, auc_list = [], [], [], [], []

for train_index, val_index in kf.split(X_statement_embeddings):
    # Split data into training and validation sets for this fold
    X_train_statement, X_val_statement = X_statement_embeddings[train_index], X_statement_embeddings[val_index]
    X_train_tweet, X_val_tweet = X_tweet_embeddings[train_index], X_tweet_embeddings[val_index]
    X_train_num, X_val_num = X_num_scaled[train_index], X_num_scaled[val_index]
    y_train, y_val = df['majority_target'].iloc[train_index], df['majority_target'].iloc[val_index]
    
    # Define inputs and model
    input_statement = Input(shape=(X_statement_embeddings.shape[1],))
    input_tweet = Input(shape=(X_tweet_embeddings.shape[1],))
    input_numerical = Input(shape=(X_num_scaled.shape[1],))
    
    # Reshape to 3D for LSTM (adding a dimension for timesteps)
    reshaped_statement = Reshape((1, X_statement_embeddings.shape[1]))(input_statement)
    reshaped_tweet = Reshape((1, X_tweet_embeddings.shape[1]))(input_tweet)
    
    # LSTM for statement and tweet embeddings
    lstm_statement = LSTM(128, return_sequences=False)(reshaped_statement)
    lstm_tweet = LSTM(128, return_sequences=False)(reshaped_tweet)
    
    # Combine LSTM outputs and numerical features
    combined = Concatenate()([lstm_statement, lstm_tweet, input_numerical])
    
    # Dense layers
    x = Dense(64, activation='relu')(combined)
    x = Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)
    
    # Define and compile model
    model = Model(inputs=[input_statement, input_tweet, input_numerical], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model
    model.fit([X_train_statement, X_train_tweet, X_train_num], y_train, epochs=10, batch_size=32, verbose=0)
    
    # Predict on validation data
    y_pred_prob = model.predict([X_val_statement, X_val_tweet, X_val_num])
    y_pred = (y_pred_prob > 0.5).astype(int)
    
    # Calculate and store metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_pred_prob)
    
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    auc_list.append(auc)

# Display the results for each metric
print(f"Cross-Validation Results ({n_folds} folds):")
print(f"Accuracy: {np.mean(accuracy_list):.4f} ± {np.std(accuracy_list):.4f}")
print(f"Precision: {np.mean(precision_list):.4f} ± {np.std(precision_list):.4f}")
print(f"Recall: {np.mean(recall_list):.4f} ± {np.std(recall_list):.4f}")
print(f"F1 Score: {np.mean(f1_list):.4f} ± {np.std(f1_list):.4f}")
print(f"AUC: {np.mean(auc_list):.4f} ± {np.std(auc_list):.4f}")




[1m839/839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step




[1m839/839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step




[1m839/839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step




[1m839/839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step




[1m839/839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
Cross-Validation Results (5 folds):
Accuracy: 0.9573 ± 0.0006
Precision: 0.9588 ± 0.0021
Recall: 0.9580 ± 0.0014
F1 Score: 0.9584 ± 0.0006
AUC: 0.9588 ± 0.0002


Cross-Validation Results (5 folds):
Accuracy: 0.9573 ± 0.0006
Precision: 0.9588 ± 0.0021
Recall: 0.9580 ± 0.0014
F1 Score: 0.9584 ± 0.0006
AUC: 0.9588 ± 0.0002

Model Robustness: The model is robust and performs consistently well across different data splits, showing it has generalized well to the dataset.

Performance Ceiling: Since the accuracy, F1 score, and AUC are all very close to 0.96, it’s likely you’ve reached a performance ceiling with this dataset and feature set. Additional tuning may not yield significant improvements because the model seems to have effectively captured the key patterns for distinguishing fake from real news.

Balanced Metrics: The high scores in precision and recall, along with a closely matching F1 score, indicate that the model maintains a good balance between identifying both real and fake news accurately, with a strong AUC showing good separation between classes.