In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split, learning_curve, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [85]:
# Data preparation and preprocessing

# Load the dataset
with open('data_json/SubtaskB/subtaskB_train.jsonl', 'r') as f:
    df = pd.read_json(f, lines=True, orient='records')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['model'], test_size=0.2, random_state=42)

In [108]:
# Load the dataset
with open('data_json/SubtaskB/subtaskB_dev.jsonl', 'r') as f:
    dev_df = pd.read_json(f, lines=True, orient='records')

dev_df = dev_df.sample(frac=1, random_state=42)
dev_df['text'], dev_df['model'] = X_dev, y_dev

In [109]:
# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Fit the label encoder on the string labels
label_encoder.fit(y_train)

# Transform the string labels to integer values
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
y_dev = label_encoder.transform(y_dev)


In [111]:
# Tokenize training and testing sets
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(sequences_train, maxlen=100, truncating='post')

tokenizer.fit_on_texts(X_test)
sequences_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(sequences_test, maxlen=100, truncating='post')

tokenizer.fit_on_texts(X_dev)
sequences_dev = tokenizer.texts_to_sequences(X_dev)
X_dev = pad_sequences(sequences_dev, maxlen=100, truncating='post')

In [12]:
# Define the model
model = Sequential()
model.add(Embedding(10000, 32))
model.add(SimpleRNN(32))
model.add(Dense(num_classes, activation='softmax'))


In [15]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert the target labels to one-hot encoded format
y_train_one_hot = tf.keras.utils.to_categorical(y_train, num_classes)

# Train the model
model.fit(X_train, y_train_one_hot, epochs=7, verbose=1)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x26cf8cfc460>

In [50]:
y_test_one_hot = tf.keras.utils.to_categorical(y_test, num_classes)

# Predict the probabilities for the test set
probabilities = model.predict(X_test)

# Convert the predicted probabilities to a DataFrame
probabilities_df = pd.DataFrame(probabilities, columns=['0', '1', '2', '3', '4', '5'])

probabilities = list(zip(y_test_one_hot, probabilities.flatten()))
probabilities_df['actual'] = label_encoder.inverse_transform(y_test)
probabilities_df.to_csv('statistics/RNN_B_outputs/ROC.csv', index=False)

probabilities_df['predicted'] = probabilities_df[['0', '1', '2', '3', '4', '5']].idxmax(axis=1)

y_pred = probabilities_df['predicted'].astype(int)
y_pred = y_pred.to_numpy()


# Save the DataFrame to a CSV file
probabilities_df.to_csv('statistics/RNN_B_outputs/probabilities.csv', index=False)

 



In [53]:
classification_report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
classification_report_df.to_csv('statistics/RNN_B_outputs/classification_report.csv', index=False)
print(classification_report(y_test, y_pred))

confusion_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
confusion_df.to_csv('statistics/RNN_B_outputs/confusion_matrix.csv', index=False)


              precision    recall  f1-score   support

           0       0.78      0.64      0.70      2404
           1       0.25      0.22      0.23      2441
           2       0.30      0.35      0.32      2204
           3       0.21      0.23      0.22      2407
           4       0.20      0.21      0.20      2360
           5       0.23      0.24      0.24      2390

    accuracy                           0.31     14206
   macro avg       0.33      0.31      0.32     14206
weighted avg       0.33      0.31      0.32     14206



In [54]:
# Get the weights of the embedding layer
embedding_weights = model.get_weights()[0]

# Get the word index from the tokenizer
word_index = tokenizer.word_index

# Create a list of (word, weight) pairs
word_weights = []
for word, index in word_index.items():
    if index < embedding_weights.shape[0]:
        weight = np.linalg.norm(embedding_weights[index])
        word_weights.append((word, weight))

# Store the weights in a csv file
sorted_features = sorted(word_weights, key=lambda x: x[1])

# Print the weights of the top 30 words to a file
with open("statistics/RNN_B_outputs/top_bottom_words.csv", "w") as f:
    f.write(f"word,weight\n")
    for word, weight in sorted_features[-30:]:
        f.write(f"{word},{weight}\n")

    # Print the weights of the bottom 30 words to the same file
    for word, weight in sorted_features[:30]:
        f.write(f"{word},{weight}\n")

# Store all of the weights in a separate csv file
weights_df = pd.DataFrame(sorted_features, columns=['word', 'weight'])
weights_df.to_csv('statistics/RNN_B_outputs/weights.csv', index=False)


In [112]:
y_dev_one_hot = tf.keras.utils.to_categorical(y_dev, num_classes)

# Predict the labels for the dev set
predictions = model.predict(X_dev)

predictions_df = pd.DataFrame(predictions, columns=['0', '1', '2', '3', '4', '5'])

predictions = list(zip(y_dev_one_hot, predictions.flatten()))
predictions_df['actual'] = label_encoder.inverse_transform(y_dev)
predictions_df['predicted'] = predictions_df[['0', '1', '2', '3', '4', '5']].idxmax(axis=1).astype(int)

predictions_df['id'] = dev_df['id']
predictions_df['predicted'] = predictions_df['predicted'].astype(int)
predictions_df['model'] = label_encoder.inverse_transform(predictions_df['predicted'])

# Save the predictions along with 'id' to a JSONL file
predictions_df[['id', 'model']].to_json('statistics/RNN_B_outputs/dev_predictions.jsonl', lines=True, orient='records')

predicted_labels = predictions_df['predicted'].values

# Calculate accuracy
accuracy = sum(y_dev == predicted_labels) / len(y_dev)

# Print accuracy
print("Accuracy:", accuracy)

Accuracy: 0.2723333333333333


In [115]:
pred_model = probabilities_df['predicted'].astype(int)
pred_model = pred_model.to_numpy()

classification_report2_df = pd.DataFrame(
    classification_report(predicted_labels, y_dev, output_dict=True)).transpose()
classification_report2_df.to_csv(
    'statistics/RNN_B_outputs/classification_report2.csv', index=False)

print(classification_report(predicted_labels, y_dev))


              precision    recall  f1-score   support

           0       0.61      0.75      0.67       403
           1       0.14      0.20      0.16       340
           2       0.26      0.24      0.25       555
           3       0.21      0.21      0.21       505
           4       0.20      0.17      0.19       582
           5       0.22      0.18      0.20       615

    accuracy                           0.27      3000
   macro avg       0.27      0.29      0.28      3000
weighted avg       0.26      0.27      0.27      3000



In [None]:


# # Evaluate the model
# model.evaluate(X_test, y_test_one_hot)

# # Save the model
# model.save('models/RNNmodelB.h5')

# # Load the model
# model = tf.keras.models.load_model('models/RNNmodelB.h5')

# # Predict on the test set
# y_pred = model.predict(X_test)

# # Convert the predictions to integer format
# y_pred = np.argmax(y_pred, axis=1)

# # Convert the integer predictions to string labels
# y_pred = label_encoder.inverse_transform(y_pred)

# # Convert the test set labels to string format
# y_test = label_encoder.inverse_transform(y_test)

# # Print the classification report
# print(classification_report(y_test, y_pred))

# # Print the confusion matrix
# print(confusion_matrix(y_test, y_pred))

# # Save the label encoder
# with open('models/label_encoderB.json', 'w') as f:
#     json.dump(label_encoder.classes_.tolist(), f)

# # Plot the learning curve
# train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train, cv=5)
# train_scores_mean = np.mean(train_scores, axis=1)
# test_scores_mean = np.mean(test_scores, axis=1)

# # Plot the learning curve
# plt.style.use('seaborn')
# plt.plot(train_sizes, train_scores_mean, label='Training accuracy')
# plt.plot(train_sizes, test_scores_mean, label='Validation accuracy')
# plt.ylabel('Accuracy', fontsize=14)
# plt.xlabel('Training set size', fontsize=14)
# plt.title('Learning curves for RNN model', fontsize=18, y=1.03)
# plt.legend()
# plt.ylim(0.5, 1)
# plt.show()

# # Hyperparameter tuning

# # Define the hyperparameter grid
# param_grid = {'batch_size': [32, 64, 128],
#               'epochs': [5, 10, 15]}

# # Define the random search
# random_search = RandomizedSearchCV(model, param_grid, cv=5)

# # Train the random search
# random_search.fit(X_train, y_train_one_hot)

# # Print the best set of parameters
# print("Best parameters found: ", random_search.best_params_)

# # Print the best score
# print("Best score found: ", random_search.best_score_)

# # Save the best model
# random_search.best_estimator_.model.save('models/RNNmodelB_best.h5')

# # Load the best model
# model = tf.keras.models.load_model('models/RNNmodelB_best.h5')


In [None]:
# Hyperparameter tuning / grid search

# from keras.models import Sequential
# from keras.layers import SimpleRNN, Dense
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import GridSearchCV

# # Function to create model, required for KerasClassifier
# def create_model(units=50, optimizer='adam'):
#     model = Sequential()
#     model.add(SimpleRNN(units, input_shape=(100, 1)))  # Assume input sequences of length 100
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
#     return model

# # Create the KerasClassifier wrapper
# model = KerasClassifier(build_fn=create_model, verbose=0)

# # Define the grid search parameters
# param_grid = {
#     'units': [50, 100, 150],
#     'optimizer': ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam'],
#     'batch_size': [10, 20, 40, 60, 80, 100],
#     'epochs': [10, 50, 100]
# }

# # Create Grid Search
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
# grid_result = grid.fit(X_train, y_train)

# # Report Results
# print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")