In [None]:
# Mount Google Drive at /content/drive; every CSV path and saved artifact in
# this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
import pandas as pd
import numpy as np
import random
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, BatchNormalization, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import nltk

# Fetch the NLTK resources this notebook relies on: the stop-word list and the
# Punkt tokenizer data are used by clean_text(); WordNet backs the `wordnet`
# corpus import above.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize look up 'punkt_tab' instead of
# the pickled 'punkt' models, so fetch it as well to keep clean_text() working.
nltk.download('punkt_tab')
# Kept last so the cell still displays the result of the final download.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# --- Training split: features, level-1 labels, level-2 labels ---------------
file_path = '/content/drive/MyDrive/X_train.csv'
file_path_y_train = '/content/drive/MyDrive/y_train.csv'
file_path_y2_train = '/content/drive/MyDrive/y_train_level_2.csv'
df_x_train = pd.read_csv(file_path)
df_y_train = pd.read_csv(file_path_y_train)
df_y2_train = pd.read_csv(file_path_y2_train)

# --- Validation split --------------------------------------------------------
file_path_x_val = '/content/drive/MyDrive/X_val.csv'
file_path_y_val = '/content/drive/MyDrive/y_val.csv'
file_path_y2_val = '/content/drive/MyDrive/y_val_level_2.csv'
df_x_val = pd.read_csv(file_path_x_val)
df_y_val = pd.read_csv(file_path_y_val)
df_y2_val = pd.read_csv(file_path_y2_val)

# --- Test split --------------------------------------------------------------
file_path_x_test = '/content/drive/MyDrive/X_test.csv'
file_path_y_test = '/content/drive/MyDrive/y_test.csv'
file_path_y2_test = '/content/drive/MyDrive/y_test_level_2.csv'
df_x_test = pd.read_csv(file_path_x_test)
df_y_test = pd.read_csv(file_path_y_test)
df_y2_test = pd.read_csv(file_path_y2_test)

In [None]:
# Write one features file and one level-2 label file per (split, category)
# pair, selecting rows by the value in the first column of the level-1 labels.
save_dir = '/content/drive/MyDrive/'
categories = range(17)

for category in categories:
    # Row selectors for this category, one per split.
    train_mask = df_y_train.iloc[:, 0].eq(category)
    val_mask = df_y_val.iloc[:, 0].eq(category)
    test_mask = df_y_test.iloc[:, 0].eq(category)

    # Features and level-2 labels share the same selector within a split.
    df_x_train_filtered = df_x_train.loc[train_mask]
    df_y2_train_filtered = df_y2_train.loc[train_mask]
    df_x_val_filtered = df_x_val.loc[val_mask]
    df_y2_val_filtered = df_y2_val.loc[val_mask]
    df_x_test_filtered = df_x_test.loc[test_mask]
    df_y2_test_filtered = df_y2_test.loc[test_mask]

    # Persist without the index so the files reload with a fresh RangeIndex.
    df_x_train_filtered.to_csv(f'{save_dir}X_train_{category}.csv', index=False)
    df_y2_train_filtered.to_csv(f'{save_dir}y2_train_{category}.csv', index=False)
    df_x_val_filtered.to_csv(f'{save_dir}X_val_{category}.csv', index=False)
    df_y2_val_filtered.to_csv(f'{save_dir}y2_val_{category}.csv', index=False)
    df_x_test_filtered.to_csv(f'{save_dir}X_test_{category}.csv', index=False)
    df_y2_test_filtered.to_csv(f'{save_dir}y2_test_{category}.csv', index=False)

print("Files have been filtered and saved successfully.")

Files have been filtered and saved successfully.


In [None]:
# Lazily populated by _english_stop_words(); None until the first lookup.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading it from disk only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Normalize a piece of text for downstream tokenization.

    Steps: lowercase, drop every character that is not a-z or whitespace,
    collapse whitespace runs, tokenize with NLTK, and remove English stop words.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        # Nothing survived the character filter, so there is nothing to tokenize.
        return ''

    stop_words = _english_stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Bidirectional, LSTM, BatchNormalization, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Attention, GlobalMaxPooling1D # Import GlobalMaxPooling1D


def create_and_train_model(X_train, y_train, X_val, y_val, vocab_size, embedding_dim, max_sentence_length, embedding_matrix, num_classes, learning_rate=0.001, epochs=100, batch_size=64, patience=20):
    """Build, compile and fit a bidirectional-LSTM text classifier.

    Architecture: Embedding (initialised from ``embedding_matrix``, trainable)
    -> SpatialDropout1D -> Bidirectional LSTM(200) returning sequences ->
    BatchNormalization -> SpatialDropout1D -> GlobalMaxPooling1D -> softmax.

    Args:
        X_train, y_train: Padded training sequences and integer class labels.
        X_val, y_val: Validation sequences and labels, used for early stopping
            and checkpointing.
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_sentence_length: Accepted for interface compatibility; the model
            does not fix the input length, so it is not used.
        embedding_matrix: Initial embedding weights, passed to the Embedding layer.
        num_classes: Number of target classes; sets the softmax width.
        learning_rate: Adam learning rate.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.
        patience: Epochs without ``val_loss`` improvement before stopping.

    Returns:
        ``(model, history)``: the fitted model (best weights restored by early
        stopping) and the Keras History object from ``fit``.
    """
    # Sparse categorical cross-entropy needs every label to be a valid output
    # index, so widen the softmax when the label ids are not the contiguous
    # range 0..num_classes-1.
    label_ceiling = 0
    for labels in (y_train, y_val):
        labels = np.asarray(labels)
        if labels.size:
            label_ceiling = max(label_ceiling, int(labels.max()) + 1)
    output_units = max(int(num_classes), label_ceiling)

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True))
    model.add(SpatialDropout1D(0.2))
    model.add(Bidirectional(LSTM(200, return_sequences=True)))
    model.add(BatchNormalization())
    model.add(SpatialDropout1D(0.2))

    # Collapse the time axis by taking the per-feature maximum.
    model.add(GlobalMaxPooling1D())

    model.add(Dense(output_units, activation='softmax'))
    # Use an explicit optimizer instance so the learning_rate argument takes effect.
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )

    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    # val_loss improves by decreasing, so the checkpoint must track the minimum.
    model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True, mode='min')

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping, model_checkpoint]
    )

    return model, history

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Score a classifier on the training and test sets and print a report.

    Predictions are the argmax over axis 1 of ``model.predict``. For each of
    the two datasets the function prints accuracy (as a percentage), the
    number of correct and incorrect predictions, the confusion matrix, and the
    classification report.

    Args:
        model: Object exposing ``predict`` that returns per-class scores.
        X_train, y_train: Training inputs and true labels.
        X_test, y_test: Test inputs and true labels.

    Returns:
        dict with the keys ``train_accuracy``, ``test_accuracy``,
        ``correct_train``, ``correct_test``, ``incorrect_train``,
        ``incorrect_test``, ``train_conf_matrix``, ``test_conf_matrix``,
        ``train_class_report`` and ``test_class_report``.
    """
    # Hard class predictions: index of the highest score in each row.
    train_pred = np.argmax(model.predict(X_train), axis=1)
    test_pred = np.argmax(model.predict(X_test), axis=1)

    # Accuracy, reported as a percentage.
    train_accuracy = accuracy_score(y_train, train_pred) * 100
    test_accuracy = accuracy_score(y_test, test_pred) * 100
    print('Accuracy on training set:', train_accuracy)
    print('Accuracy on test set:', test_accuracy)

    # Element-wise hit/miss tallies.
    correct_train = sum(y_train == train_pred)
    correct_test = sum(y_test == test_pred)
    incorrect_train = sum(y_train != train_pred)
    incorrect_test = sum(y_test != test_pred)
    print("Correct predictions on training data:", correct_train)
    print("Correct predictions on test data:", correct_test)
    print("Incorrect predictions on training data:", incorrect_train)
    print("Incorrect predictions on test data:", incorrect_test)

    # Confusion matrices.
    train_conf_matrix = confusion_matrix(y_train, train_pred)
    test_conf_matrix = confusion_matrix(y_test, test_pred)
    print('Confusion matrix on training data:\n', train_conf_matrix)
    print('Confusion matrix on test data:\n', test_conf_matrix)

    # Per-class precision/recall/F1 text reports with four decimals.
    train_class_report = classification_report(y_train, train_pred, digits=4)
    test_class_report = classification_report(y_test, test_pred, digits=4)
    print("Classification report of training data:\n", train_class_report)
    print("Classification report of test data:\n", test_class_report)

    results = dict(
        train_accuracy=train_accuracy,
        test_accuracy=test_accuracy,
        correct_train=correct_train,
        correct_test=correct_test,
        incorrect_train=incorrect_train,
        incorrect_test=incorrect_test,
        train_conf_matrix=train_conf_matrix,
        test_conf_matrix=test_conf_matrix,
        train_class_report=train_class_report,
        test_class_report=test_class_report,
    )
    return results

In [None]:
def train_word2vec_model(corpus, vector_size=100, window=5, min_count=1, sg=0):
    """Train a gensim Word2Vec model on a tokenized corpus.

    Args:
        corpus: Iterable of sentences, each a list of string tokens.
        vector_size: Dimensionality of the learned word vectors.
        window: Context window size passed to Word2Vec.
        min_count: Minimum frequency passed to Word2Vec; with the default of 1
            no word is dropped for being rare.
        sg: Training algorithm selector passed to Word2Vec (per the gensim
            docs, 1 selects skip-gram and any other value CBOW).

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.
    """
    model = gensim.models.Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
    )
    return model

# Function to create the embedding matrix
def create_embedding_matrix(word2vec_model, tokenizer, vocab_size, embedding_dim):
    """Build an embedding table aligned with the tokenizer's word indices.

    Row ``i`` holds the Word2Vec vector of the word whose tokenizer index is
    ``i``. Rows stay all-zero for words missing from the Word2Vec vocabulary
    and for indices that no word maps to.

    Args:
        word2vec_model: Object whose ``wv`` attribute supports ``word in wv``
            and ``wv[word]``.
        tokenizer: Object exposing a ``word_index`` mapping of word -> int.
        vocab_size: Number of rows in the returned matrix.
        embedding_dim: Number of columns in the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(vocab_size, embedding_dim)``.
    """
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    vectors = word2vec_model.wv
    for word, i in tokenizer.word_index.items():
        # word_index may contain indices beyond vocab_size (presumably when the
        # caller caps the vocabulary below the tokenizer's full size); those
        # words have no row, so skip them instead of raising IndexError.
        if i < vocab_size and word in vectors:
            embedding_matrix[i] = vectors[word]

    # Echo the matrix dimensions as a quick sanity check.
    print(vocab_size)
    print(embedding_dim)
    return embedding_matrix

In [None]:
import numpy as np
import pandas as pd
import gensim
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import pickle

# Ensure the NLTK resources are available before the pipeline below runs.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Specific category to process
specific_category = "9"

# Load the per-category splits written by the filtering cell above.
X_train = pd.read_csv(f'/content/drive/MyDrive/X_train_{specific_category}.csv')
y_train = pd.read_csv(f'/content/drive/MyDrive/y2_train_{specific_category}.csv')
X_val = pd.read_csv(f'/content/drive/MyDrive/X_val_{specific_category}.csv')
y_val = pd.read_csv(f'/content/drive/MyDrive/y2_val_{specific_category}.csv')
X_test = pd.read_csv(f'/content/drive/MyDrive/X_test_{specific_category}.csv')
y_test = pd.read_csv(f'/content/drive/MyDrive/y2_test_{specific_category}.csv')

# The text is taken from the first column. read_csv returns empty cells as
# float NaN, which neither the Keras Tokenizer nor gensim's simple_preprocess
# can process, so missing cells become empty strings and everything is str.
corpus_train = X_train.iloc[:, 0].fillna('').astype(str).tolist()
corpus_val = X_val.iloc[:, 0].fillna('').astype(str).tolist()
corpus_test = X_test.iloc[:, 0].fillna('').astype(str).tolist()

# Fit the vocabulary on the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus_train)

# Convert text to sequences of word indices.
sequences_train = tokenizer.texts_to_sequences(corpus_train)
sequences_val = tokenizer.texts_to_sequences(corpus_val)
sequences_test = tokenizer.texts_to_sequences(corpus_test)

# Longest sequence across all three splits; used as the common padded length.
max_sentence_length = max(
    max([len(seq) for seq in sequences_train]),
    max([len(seq) for seq in sequences_val]),
    max([len(seq) for seq in sequences_test])
)

# Pad at the end of each sequence so every row has max_sentence_length entries.
padded_train = pad_sequences(sequences_train, maxlen=max_sentence_length, padding='post')
padded_val = pad_sequences(sequences_val, maxlen=max_sentence_length, padding='post')
padded_test = pad_sequences(sequences_test, maxlen=max_sentence_length, padding='post')

# Flatten the label frames into 1-D arrays.
labels_train = y_train.values.flatten()
labels_val = y_val.values.flatten()
labels_test = y_test.values.flatten()

# Train Word2Vec on the training text only, tokenized with gensim's
# simple_preprocess.
useful_info = [gensim.utils.simple_preprocess(text) for text in corpus_train]
word2vec_model = train_word2vec_model(useful_info)

# Create the embedding matrix; index 0 is reserved for padding, hence the +1.
embedding_dim = word2vec_model.vector_size
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = create_embedding_matrix(word2vec_model, tokenizer, vocab_size, embedding_dim)

# Fit the classifier; the class count is the number of distinct training labels.
num_classes = np.unique(labels_train).size
model, history = create_and_train_model(
    padded_train,
    labels_train,
    padded_val,
    labels_val,
    vocab_size,
    embedding_dim,
    max_sentence_length,
    embedding_matrix,
    num_classes,
)

# Score on the training and test splits, then dump the full metrics dict.
metrics = evaluate_model(model, padded_train, labels_train, padded_test, labels_test)
print(metrics)

# Collect the test accuracy (a one-element list for this single category).
test_accuracies = [metrics['test_accuracy']]

print(f"\nTest accuracy for {specific_category}: {metrics['test_accuracy']:.2f}%")

# Mean over the collected accuracies.
average_test_accuracy = np.mean(test_accuracies)
print(f"\nAverage test accuracy: {average_test_accuracy:.2f}%")

# Pickle the fitted tokenizer and the padded length so the same preprocessing
# can be reproduced later.
tokenizer_filename = f'/content/drive/MyDrive/Tokenizer_model_{specific_category}.pkl'
max_len_filename = f'/content/drive/MyDrive/Max_len_{specific_category}.pkl'

for artifact_path, artifact in (
    (tokenizer_filename, tokenizer),
    (max_len_filename, max_sentence_length),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print(f"Tokenizer and max sentence length saved successfully for category {specific_category}.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


17144
100
Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 454ms/step - accuracy: 0.1338 - loss: 4.5874 - val_accuracy: 0.2407 - val_loss: 2.1366
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 393ms/step - accuracy: 0.3033 - loss: 2.3228 - val_accuracy: 0.2407 - val_loss: 1.9992
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 365ms/step - accuracy: 0.3911 - loss: 1.7641 - val_accuracy: 0.1296 - val_loss: 1.9403
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 371ms/step - accuracy: 0.4140 - loss: 1.7576 - val_accuracy: 0.1667 - val_loss: 1.8955
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 367ms/step - accuracy: 0.4611 - loss: 1.5647 - val_accuracy: 0.1481 - val_loss: 1.8650
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 366ms/step - accuracy: 0.4697 - loss: 1.5634 - val_accuracy: 0.1852 - val_loss: 1.8466
Epoch 7/100
[1m8/8[0m 

In [None]:
# Save the trained model alongside the other per-category artifacts. The file
# name is derived from specific_category so it always matches the category that
# was actually trained, and the directory uses the same MyDrive path as every
# other read/write in this notebook.
model_save_path = f"/content/drive/MyDrive/model_{specific_category}.keras"
model.save(model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to /content/drive/My Drive/model_10.keras
