In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Dense,Dropout,Input,Embedding,Flatten,TextVectorization,Conv1D,GlobalMaxPooling1D,MaxPooling1D,GlobalAveragePooling1D
from keras.initializers import Constant
from keras.layers import Dense,LSTM,Bidirectional,Attention,Concatenate,GRU,BatchNormalization
import nltk
from nltk.corpus import stopwords
import re
# Download the NLTK 'stopwords' corpus; clean() later reads it via stopwords.words('english').
nltk.download('stopwords')
import pickle
import numpy as np
import pandas as pd
import jieba
import random

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive

# Mount Google Drive so the pair CSVs below can be read through the local path.
drive.mount('/content/drive')

# Load the train / validation / test pair tables, in that order.
train_pairs, valid_pairs, test_pairs = [
    pd.read_csv(csv_path)
    for csv_path in (
        'drive/MyDrive/train_pairsS2S.csv',
        'drive/MyDrive/valid_pairsS2S.csv',
        'drive/MyDrive/test_pairsS2S.csv',
    )
]

Mounted at /content/drive


BiLSTM with random choice of biased or unbiased sentence per pair

In [3]:
# Collect one (sentence, label) example per training pair.
labeled_train = []

for raw_pair in train_pairs.text_pairs_dict:
    # NOTE(review): eval() executes whatever expression the CSV cell contains;
    # only safe while the CSV is a trusted, self-generated file.
    pair = eval(raw_pair)

    # Draw one element of the pair at random. The label is 0 when the drawn
    # sentence equals the pair's first element, otherwise 1.
    chosen = random.choice(pair)
    if chosen == pair[0]:
        labeled_train.append((chosen, 0))
    else:
        labeled_train.append((chosen, 1))

# Randomize the example order in place.
random.shuffle(labeled_train)

# Split the shuffled examples by label.
label_0_train = [example for example in labeled_train if example[1] == 0]
label_1_train = [example for example in labeled_train if example[1] == 1]

# Concatenating the groups places every label-0 example before every label-1
# example, so the shuffle above only determines the order inside each group.
data_train = label_0_train + label_1_train
df_train = pd.DataFrame(data_train, columns=['text', 'label'])
y_train = df_train['label']

# Preview the resulting frame.
print("Train Data:")
print(df_train.head())

Train Data:
                                                text  label
0  city hall of buffalo, new york, an art-deco ma...      0
1  the hospital stands on the site of the former ...      0
2  at judgment day 2004, jbl defeated the late gr...      0
3  several new academic and residence buildings h...      0
4  the president of niger was overthrown in febru...      0


In [4]:
# Collect one (sentence, label) example per validation pair.
labeled_val = []

# Each entry of text_pairs_dict is a string that evaluates to an indexable pair of sentences.
for row in valid_pairs.text_pairs_dict:
    # NOTE(review): eval() executes whatever expression the CSV cell contains;
    # only safe while the CSV is a trusted, self-generated file.
    row = eval(row)

    # Randomly select one of the two sentences of the pair.
    text = random.choice(row)

    # Label 0 when the selected sentence equals the pair's first element, otherwise 1.
    label = 0 if text == row[0] else 1

    # Append the selected sentence and its label.
    labeled_val.append((text, label))

# Shuffle the labelled examples in place.
random.shuffle(labeled_val)

# Separate the data into two lists based on labels.
label_0_val = [(text, label) for text, label in labeled_val if label == 0]
label_1_val = [(text, label) for text, label in labeled_val if label == 1]


# Convert to DataFrame. Concatenating the groups puts all label-0 rows first,
# so the shuffle above only determines the order inside each label group.
data_val = label_0_val + label_1_val
df_val = pd.DataFrame(data_val, columns=['text', 'label'])
y_val = df_val['label']

# Display the first few rows. The header names the validation split
# (it previously read "Train Data:", copied from the training cell).
print("Validation Data:")
print(df_val.head())

Train Data:
                                                text  label
0  the film was simultaneously screened and webca...      0
1  the internationale (l'internationale in french...      0
2  ironically, ferguson was created to relieve ov...      0
3  southern mindanao has been terrorized by the r...      0
4  nephilim (or giants) are offspring of supernat...      0


In [5]:
# Collect one (sentence, label) example per test pair.
labeled_test = []

for raw_pair in test_pairs.text_pairs_dict:
    # NOTE(review): eval() executes whatever expression the CSV cell contains;
    # only safe while the CSV is a trusted, self-generated file.
    pair = eval(raw_pair)

    # Draw one element of the pair at random. The label is 0 when the drawn
    # sentence equals the pair's first element, otherwise 1.
    chosen = random.choice(pair)
    if chosen == pair[0]:
        labeled_test.append((chosen, 0))
    else:
        labeled_test.append((chosen, 1))

# Randomize the example order in place.
random.shuffle(labeled_test)

# Split the shuffled examples by label.
label_0_test = [example for example in labeled_test if example[1] == 0]
label_1_test = [example for example in labeled_test if example[1] == 1]

# Concatenating the groups places every label-0 example before every label-1
# example, so the shuffle above only determines the order inside each group.
data_test = label_0_test + label_1_test
df_test = pd.DataFrame(data_test, columns=['text', 'label'])
y_test = df_test['label']

# Preview the resulting frame.
print("\nTest Data:")
print(df_test.head())


Test Data:
                                                text  label
0  by its terms the proposed withdrawal agreement...      0
1  a football coach with years of successful expe...      0
2  the novel's scope takes in aspects of establis...      0
3  mr. rice's illustrious scouting career spanned...      0
4  hisham selim (arabic: ) is a famous egyptian a...      0


In [6]:
# Report the number of examples in each split.
# The second line describes the validation split (it previously said "training").
print(f"{len(data_train)} training sentences")
print(f"{len(data_val)} validation sentences")
print(f"{len(data_test)} test sentences")

127033 training sentences
27220 training sentences
27220 test sentences


In [6]:
# Remove common English stop words
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK list is read only once


def clean(text, stop_words=None):
  """Return `text` with stop words removed.

  The text is split on whitespace, tokens found in the stop-word collection
  are dropped, and the survivors are re-joined with single spaces. Matching
  is exact (case-sensitive); no lower-casing or punctuation stripping is done.

  Args:
    text: The string to filter.
    stop_words: Optional collection of tokens to drop. When omitted, the NLTK
      English stop-word list is used; it is built once and cached, instead of
      being re-read on every call (this function runs once per row under
      DataFrame.apply).

  Returns:
    The filtered string; empty if every token was a stop word or `text` had
    no tokens.
  """
  global _ENGLISH_STOP_WORDS
  if stop_words is None:
    if _ENGLISH_STOP_WORDS is None:
      _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    stop_words = _ENGLISH_STOP_WORDS
  return ' '.join(word for word in text.split() if word not in stop_words)

In [7]:
# Strip stop words from the text column of every split (train, validation, test).
x_train_cleaned, x_val_cleaned, x_test_cleaned = [
    frame['text'].apply(clean)
    for frame in (df_train, df_val, df_test)
]

In [8]:
# Fit the vocabulary on the cleaned training texts only, then encode every
# split as lists of integer word ids using that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_cleaned)
X_train_sequences, X_val_sequences, X_test_sequences = [
    tokenizer.texts_to_sequences(cleaned_texts)
    for cleaned_texts in (x_train_cleaned, x_val_cleaned, x_test_cleaned)
]

In [9]:
# Word -> integer id mapping learned by the tokenizer from the cleaned training texts
word_index = tokenizer.word_index
total_words = len(word_index)
print(total_words) # number of distinct words, i.e. the vocabulary size

134081


In [10]:
# Size of the embedding table: vocabulary size plus one
# (presumably the extra slot is for id 0 used by padding - confirm)
emb_len=len(tokenizer.index_word)+1

In [11]:
# Token count of the longest encoded training sequence
max_length = max(map(len, X_train_sequences))
print(max_length)

163


In [12]:
# Bring every split to a fixed width of 200 ids: shorter sequences are padded
# at the end, longer ones are cut at the end.
X_train_padded, X_val_padded, X_test_padded = [
    pad_sequences(token_ids, maxlen=200, padding='post', truncating='post')
    for token_ids in (X_train_sequences, X_val_sequences, X_test_sequences)
]

In [13]:
# Input layer: one sequence of 200 integer token ids per example
input_layer = Input(shape=(200,))  # 200 must match the maxlen used when padding the sequences

# Embedding layer: looks up a 128-dimensional vector for each of the emb_len possible ids
# NOTE(review): mask_zero is not set, so padded positions are presumably treated
# like real tokens by the layers below - confirm this is intended.
embedding_layer = Embedding(emb_len, 128)(input_layer)

# Bidirectional LSTM followed by an attention layer.
# return_sequences=True keeps the per-timestep outputs that the attention layer consumes.
lstm_layer = Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.3))(embedding_layer)
attention = Attention()([lstm_layer, lstm_layer])  # query and value are both the BiLSTM output

# 1D Convolutional layer: 64 filters of width 3 over the attended sequence
conv1d_layer = Conv1D(64, kernel_size=3, activation='relu')(attention)

# GlobalMaxPooling1D layer: collapses the time axis, leaving one value per filter
global_max_pooling_layer = GlobalMaxPooling1D()(conv1d_layer)

# Dense layers: a 128-unit hidden layer, then a single sigmoid unit for the binary label
dense_layer_1 = Dense(128, activation='relu')(global_max_pooling_layer)
output_layer = Dense(1, activation='sigmoid')(dense_layer_1)

# Model creation
model = Model(inputs=input_layer, outputs=output_layer)



In [14]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 200)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 200, 128)             1716249   ['input_1[0][0]']             
                                                          6                                       
                                                                                                  
 bidirectional (Bidirection  (None, 200, 256)             263168    ['embedding[0][0]']           
 al)                                                                                              
                                                                                              

In [15]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [17]:
# Train for 10 epochs in mini-batches of 128, passing the validation split
# so Keras can report validation metrics alongside the training ones.
history = model.fit(
    X_train_padded,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_val_padded, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Sigmoid outputs for the test split, turned into hard labels:
# 1 when the output is strictly above 0.5, otherwise 0.
predictions = model.predict(X_test_padded)
y_pred_labels = (predictions > 0.5).astype(int)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the thresholded test predictions with each scalar metric in turn.
accuracy, precision, recall, f1 = [
    score_fn(y_test, y_pred_labels)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

# Confusion matrix of true labels (rows) against predicted labels (columns).
conf_matrix = confusion_matrix(y_test, y_pred_labels)

# Report the scalar metrics first, then the matrix.
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(caption, score)

print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.5896767083027186
Precision: 0.5865914000975678
Recall: 0.6164493921195254
F1-score: 0.6011498767989143
Confusion Matrix:
[[7634 5932]
 [5237 8417]]


In [27]:
# Save the full trained model under the mounted Drive folder in .keras format
model.save("drive/MyDrive/bilstm_bias.keras")

In [28]:
# Save only the model weights to an .h5 file in the same Drive folder
model.save_weights("drive/MyDrive/bilstm_bias_weights.h5")


BiLSTM with both unbiased and biased sentences included, training limited to 130,000 sentences.

In [3]:
# Build two labelled examples from every training pair:
# the first element gets label 0 and the second element gets label 1.
labeled_train = []

for raw_pair in train_pairs.text_pairs_dict:
    # NOTE(review): eval() executes whatever expression the CSV cell contains;
    # only safe while the CSV is a trusted, self-generated file.
    pair = eval(raw_pair)
    labeled_train.extend([(pair[0], 0), (pair[1], 1)])

# Randomize the example order in place.
random.shuffle(labeled_train)

# Wrap the shuffled examples in a DataFrame and pull out the label column.
data_train = labeled_train
df_train = pd.DataFrame(data_train, columns=['text', 'label'])
y_train = df_train['label']

In [4]:
# Keep only the first 130,000 rows of the training frame, then re-derive the
# label column so it matches the truncated frame.
df_train = df_train[:130000]
y_train = df_train['label']

In [5]:
# Build two labelled examples from every validation pair:
# the first element gets label 0 and the second element gets label 1.
labeled_val = []

for raw_pair in valid_pairs.text_pairs_dict:
    # NOTE(review): eval() executes whatever expression the CSV cell contains;
    # only safe while the CSV is a trusted, self-generated file.
    pair = eval(raw_pair)
    labeled_val.extend([(pair[0], 0), (pair[1], 1)])

# Randomize the example order in place.
random.shuffle(labeled_val)

# Wrap the shuffled examples in a DataFrame and pull out the label column.
data_val = labeled_val
df_val = pd.DataFrame(data_val, columns=['text', 'label'])
y_val = df_val['label']

In [6]:
# Build two labelled examples from every test pair:
# the first element gets label 0 and the second element gets label 1.
labeled_test = []

for raw_pair in test_pairs.text_pairs_dict:
    # NOTE(review): eval() executes whatever expression the CSV cell contains;
    # only safe while the CSV is a trusted, self-generated file.
    pair = eval(raw_pair)
    labeled_test.extend([(pair[0], 0), (pair[1], 1)])

# Randomize the example order in place.
random.shuffle(labeled_test)

# Wrap the shuffled examples in a DataFrame and pull out the label column.
data_test = labeled_test
df_test = pd.DataFrame(data_test, columns=['text', 'label'])
y_test = df_test['label']

In [7]:
# Write each labelled split to its own CSV in the Drive folder
# (default to_csv settings, so the row index is written as well).
for frame, csv_path in (
    (df_train, "drive/MyDrive/df_train.csv"),
    (df_val, "drive/MyDrive/df_val.csv"),
    (df_test, "drive/MyDrive/df_test.csv"),
):
    frame.to_csv(csv_path)

In [8]:
# Strip stop words from the text column of every split (train, validation, test).
x_train_cleaned, x_val_cleaned, x_test_cleaned = [
    frame['text'].apply(clean)
    for frame in (df_train, df_val, df_test)
]

In [9]:
# Fit the vocabulary on the cleaned training texts only, then encode every
# split as lists of integer word ids using that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train_cleaned)
X_train_sequences, X_val_sequences, X_test_sequences = [
    tokenizer.texts_to_sequences(cleaned_texts)
    for cleaned_texts in (x_train_cleaned, x_val_cleaned, x_test_cleaned)
]

In [10]:
# Word -> integer id mapping learned by the tokenizer from the cleaned training texts
word_index = tokenizer.word_index
total_words = len(word_index)
print(total_words) # number of distinct words, i.e. the vocabulary size

117415


In [11]:
# Size of the embedding table: vocabulary size plus one
# (presumably the extra slot is for id 0 used by padding - confirm)
emb_len=len(tokenizer.index_word)+1

In [12]:
# Token count of the longest encoded training sequence
max_length = max(map(len, X_train_sequences))
print(max_length)

164


In [13]:
# Bring every split to a fixed width of 200 ids: shorter sequences are padded
# at the end, longer ones are cut at the end.
X_train_padded, X_val_padded, X_test_padded = [
    pad_sequences(token_ids, maxlen=200, padding='post', truncating='post')
    for token_ids in (X_train_sequences, X_val_sequences, X_test_sequences)
]

In [14]:
# Input layer: one sequence of 200 integer token ids per example
input_layer = Input(shape=(200,))  # 200 must match the maxlen used when padding the sequences

# Embedding layer: looks up a 128-dimensional vector for each of the emb_len possible ids
# NOTE(review): mask_zero is not set, so padded positions are presumably treated
# like real tokens by the layers below - confirm this is intended.
embedding_layer = Embedding(emb_len, 128)(input_layer)

# Bidirectional LSTM followed by an attention layer.
# return_sequences=True keeps the per-timestep outputs that the attention layer consumes.
lstm_layer = Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.3))(embedding_layer)
attention = Attention()([lstm_layer, lstm_layer])  # query and value are both the BiLSTM output

# 1D Convolutional layer: 64 filters of width 3 over the attended sequence
conv1d_layer = Conv1D(64, kernel_size=3, activation='relu')(attention)

# GlobalMaxPooling1D layer: collapses the time axis, leaving one value per filter
global_max_pooling_layer = GlobalMaxPooling1D()(conv1d_layer)

# Dense layers: a 128-unit hidden layer, then a single sigmoid unit for the binary label
dense_layer_1 = Dense(128, activation='relu')(global_max_pooling_layer)
output_layer = Dense(1, activation='sigmoid')(dense_layer_1)

# Model creation
model = Model(inputs=input_layer, outputs=output_layer)



In [14]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 200)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 200, 128)             1782195   ['input_1[0][0]']             
                                                          2                                       
                                                                                                  
 bidirectional (Bidirection  (None, 200, 256)             263168    ['embedding[0][0]']           
 al)                                                                                              
                                                                                              

In [15]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [16]:
# Train for 10 epochs in mini-batches of 128, passing the validation split
# so Keras can report validation metrics alongside the training ones.
history = model.fit(
    X_train_padded,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_val_padded, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
# Evaluate on the test split; the two returned values are unpacked as the
# loss and the accuracy metric configured at compile time.
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 1.0432424545288086, Test Accuracy: 0.6074944734573364


In [18]:
# Predictions: one sigmoid output per test example (values between 0 and 1, not yet thresholded)
predictions = model.predict(X_test_padded)
print(predictions)

[[0.34324637]
 [0.4670872 ]
 [0.46466786]
 ...
 [0.7599771 ]
 [0.8916219 ]
 [0.6614792 ]]


In [22]:
from sklearn.metrics import confusion_matrix

# Turn the sigmoid outputs into hard labels (1 when the output is at or above
# the threshold, otherwise 0) and display the resulting confusion matrix.
threshold = 0.5
binary_predictions = (predictions >= threshold).astype(int)
confusion_matrix(y_test, binary_predictions)

array([[16044, 11176],
       [10192, 17028]])

In [27]:
# Count how many test examples were assigned to each predicted class
num_zeros = np.count_nonzero(binary_predictions == 0)
print(num_zeros)
num_ones = np.count_nonzero(binary_predictions == 1)
print(num_ones)

26236
28204


In [19]:
# Persist the trained model three ways in the Drive folder:
# the full model as .keras, the weights alone as .h5, and the full model again as .h5
model.save("drive/MyDrive/bilstm_bias2.keras")
model.save_weights("drive/MyDrive/bilstm_bias_weights2.h5")
model.save("drive/MyDrive/bilstm_bias2.h5")

  saving_api.save_model(
