In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split, learning_curve, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix


In [3]:
# Data preparation and preprocessing

# Load the dataset (JSON Lines: one record per line).
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('data_json/SubtaskA/subtaskA_train_monolingual.jsonl', 'r', encoding='utf-8') as f:
    df = pd.read_json(f, lines=True, orient='records')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)

In [4]:
# Tokenize training and testing sets

# The vocabulary is learned from the training texts only. At most the 10000
# most frequent indices are kept; any other word maps to the '<OOV>' token.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
# Every sequence is forced to length 100; longer texts lose their tail.
X_train = pad_sequences(sequences_train, maxlen=100, truncating='post')

# Encode the test texts with the tokenizer exactly as fitted above.
# It must NOT be re-fitted on X_test: a second fit_on_texts call updates the
# word counts and rebuilds word_index, so the test set (and everything encoded
# later) would use different indices than the sequences the model is trained
# on, and test-set statistics would leak into the vocabulary.
sequences_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(sequences_test, maxlen=100, truncating='post')

In [5]:
# Define the model: token embedding -> simple recurrent layer -> sigmoid unit
model = Sequential([
    # 10000 possible token indices, each mapped to a 32-dimensional vector
    Embedding(10000, 32),
    # Recurrent layer with 32 units
    SimpleRNN(32),
    # Single sigmoid output for the binary label
    Dense(1, activation='sigmoid'),
])

In [6]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train the model on the padded training sequences for 7 epochs,
# printing a progress bar per epoch
model.fit(
    X_train,
    y_train,
    epochs=7,
    verbose=1,
)


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x1cd66a9faf0>

In [7]:
# Score the held-out test sequences (one sigmoid output per row)
probabilities = model.predict(X_test)

# Hard 0/1 labels: 1 only when the probability is strictly above 0.5
y_pred = (probabilities > 0.5).astype(int).ravel().tolist()

# Pair each true label with its predicted probability and save the pairs
# (file name suggests they feed a ROC curve)
probabilities = list(zip(y_test, probabilities.flatten()))
roc_columns = ['actual', 'predicted']
probabilities_df = pd.DataFrame(probabilities, columns=roc_columns)
probabilities_df.to_csv('RNN_outputs/ROC.csv', index=False)




In [8]:
# Per-class precision/recall/F1 on the test split. After the transpose each
# row is one class (or the accuracy / macro avg / weighted avg summary).
classification_report_df = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)).transpose()
# Keep the index: it holds the row names, and without it the rows of the
# saved CSV cannot be told apart.
classification_report_df.to_csv(
    'RNN_outputs/classification_report.csv', index=True, index_label='class')

# Confusion matrix: rows are the true labels, columns the predicted labels.
confusion_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
confusion_df.to_csv('RNN_outputs/confusion_matrix.csv', index=False)


In [9]:
# Get the weights of the embedding layer (the first weight array of the
# model); it is indexed below by token index, one row per index.
embedding_weights = model.get_weights()[0]

# Get the word -> index mapping from the tokenizer
word_index = tokenizer.word_index

# Create a list of (word, weight) pairs, where the weight is the L2 norm of
# the word's embedding row. Words whose index has no row in the embedding
# matrix are skipped.
n_rows = embedding_weights.shape[0]
word_weights = [
    (word, np.linalg.norm(embedding_weights[index]))
    for word, index in word_index.items()
    if index < n_rows
]

# Sort ascending by norm: smallest first, largest last
sorted_features = sorted(word_weights, key=lambda pair: pair[1])

# Write the 30 largest-norm words followed by the 30 smallest-norm words.
# UTF-8 is requested explicitly so words with non-ASCII characters can be
# written regardless of the platform's default encoding.
with open("RNN_outputs/top_bottom_words.csv", "w", encoding="utf-8") as f:
    f.write("word,weight\n")
    for word, weight in sorted_features[-30:]:
        f.write(f"{word},{weight}\n")

    # Bottom 30 words go into the same file, after the top 30
    for word, weight in sorted_features[:30]:
        f.write(f"{word},{weight}\n")

# Store all of the weights in a separate csv file
weights_df = pd.DataFrame(sorted_features, columns=['word', 'weight'])
weights_df.to_csv('RNN_outputs/weights.csv', index=False)


In [10]:
# Load the dev set: one JSON object per line. The file is streamed line by
# line, decoded as UTF-8 regardless of the platform default, and blank lines
# (e.g. a trailing newline at the end of the file) are skipped instead of
# crashing json.loads.
with open("data_json/SubtaskA/subtaskA_dev_monolingual.jsonl", "r", encoding="utf-8") as f:
    dev_records = [json.loads(line) for line in f if line.strip()]

# Convert the list of JSON objects to a pandas DataFrame
dev_df = pd.DataFrame(dev_records)

# Tokenize and pad the dev texts with the same tokenizer and sequence
# length (100) used for the training data
dev_sequences = tokenizer.texts_to_sequences(dev_df['text'])
dev_padded_sequences = pad_sequences(dev_sequences, maxlen=100, truncating='post')

# Predict the probabilities for the dev set
predictions = model.predict(dev_padded_sequences)

# Convert the predicted probabilities to binary labels (1 when >= 0.5)
pred_labels = (predictions.ravel() >= 0.5).astype(int).tolist()

# Store the (id, label) pairs in a separate jsonl file. `predictions` is left
# holding the raw model output instead of being rebound to the pair list.
predictions_df = pd.DataFrame(
    list(zip(dev_df['id'], pred_labels)), columns=['id', 'label'])
predictions_df.to_json('RNN_outputs/dev_predictions.jsonl', lines=True, orient='records')




In [11]:
# Per-class precision/recall/F1 on the dev set. After the transpose each row
# is one class (or the accuracy / macro avg / weighted avg summary).
classification_report2_df = pd.DataFrame(
    classification_report(dev_df['label'], pred_labels, output_dict=True)).transpose()
# Keep the index: it holds the row names, and without it the rows of the
# saved CSV cannot be told apart.
classification_report2_df.to_csv(
    'RNN_outputs/classification_report2.csv', index=True, index_label='class')


In [12]:
# # Define the train sizes
# train_sizes = np.linspace(0.1, 1.0, 10)

# # Define lists to store the train and validation scores for each size
# train_scores = []
# val_scores = []

# # Loop over the train sizes
# for size in train_sizes:
#     # Split the training set into a smaller training set and a validation set
#     X_train_small, X_val, y_train_small, y_val = train_test_split(
#         X_train, y_train, train_size=size, random_state=42)
    
#     # Train the model on the smaller training set
#     model.fit(X_train_small, y_train_small, epochs=7, verbose=0)
    
#     # Evaluate the model on the smaller training set and the validation set
#     train_loss, train_acc = model.evaluate(X_train_small, y_train_small, verbose=0)
#     val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
    
#     # Append the scores to the lists
#     train_scores.append(train_acc)
#     val_scores.append(val_acc)

# # Convert the lists to arrays
# train_scores = np.array(train_scores)
# val_scores = np.array(val_scores)

# # Calculate the mean and standard deviation of the train and validation scores
# df_learning_curve = pd.DataFrame({
#     'train_sizes': train_sizes,
#     'train_scores_mean': train_scores.mean(axis=1),
#     'test_scores_mean': val_scores.mean(axis=1),
#     'train_scores_std': train_scores.std(axis=1),
#     'test_scores_std': val_scores.std(axis=1)
# })

# # Save the learning curve to a csv file
# df_learning_curve.to_csv("RNN_outputs/learning_curve.csv", index=False)

In [13]:
# Hyperparameter tuning / grid search

# from keras.models import Sequential
# from keras.layers import SimpleRNN, Dense
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import GridSearchCV

# # Function to create model, required for KerasClassifier
# def create_model(units=50, optimizer='adam'):
#     model = Sequential()
#     model.add(SimpleRNN(units, input_shape=(100, 1)))  # Assume input sequences of length 100
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
#     return model

# # Create the KerasClassifier wrapper
# model = KerasClassifier(build_fn=create_model, verbose=0)

# # Define the grid search parameters
# param_grid = {
#     'units': [50, 100, 150],
#     'optimizer': ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam'],
#     'batch_size': [10, 20, 40, 60, 80, 100],
#     'epochs': [10, 50, 100]
# }

# # Create Grid Search
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
# grid_result = grid.fit(X_train, y_train)

# # Report Results
# print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")