# SVM

In [None]:
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
import time

# Fetch the NLTK data packages (stop-word lists, WordNet, Punkt) that the
# stop-word filtering, lemmatization and tokenization in this cell rely on.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

def load_glove_model(glove_file):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components. Blank or whitespace-only lines (for example a trailing empty
    line) are skipped instead of raising ``IndexError``.

    Args:
        glove_file: Path to the UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.
    """
    model = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Nothing on this line: there is no token to index.
                continue
            word, *components = parts
            model[word] = np.array(components, dtype=np.float32)
    return model

def preprocess_text(text):
    """Tokenize ``text`` and return its cleaned tokens joined by single spaces.

    A token is kept only when it is purely alphanumeric and its lowercase
    form is not an English stop word; kept tokens are lowercased and then
    lemmatized.
    """
    english_stop_words = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    cleaned = []
    for raw_token in nltk.word_tokenize(text):
        if not raw_token.isalnum():
            continue
        lowered = raw_token.lower()
        if lowered in english_stop_words:
            continue
        cleaned.append(lemmatize(lowered))

    return ' '.join(cleaned)

def document_vector(doc, model, num_features):
    """Return the mean embedding of the words in ``doc`` that ``model`` knows.

    ``doc`` is split on whitespace; words missing from ``model`` are ignored.
    When no word is found, the all-zero ``float32`` vector of length
    ``num_features`` is returned.
    """
    accumulated = np.zeros((num_features,), dtype="float32")
    known_vectors = [model[token] for token in doc.split() if token in model]

    # Sum in document order, one vector at a time.
    for vector in known_vectors:
        accumulated = accumulated + vector

    if not known_vectors:
        return accumulated
    return accumulated / len(known_vectors)

def _vectorize_documents(data, glove_model, num_features):
    """Add a ``processed_text`` column to ``data`` and return its document vectors."""
    # A missing title or synopsis would turn the concatenation into NaN (a
    # float), which the tokenizer cannot process; treat missing text as empty.
    raw_text = data['title'].fillna('') + " " + data['plot_synopsis'].fillna('')
    data['processed_text'] = raw_text.apply(preprocess_text)
    return np.array([document_vector(doc, glove_model, num_features) for doc in data['processed_text']])


def train_and_predict(train_file, test_file, output_file, glove_file, num_features=300):
    """Train a one-vs-rest linear SVM on averaged GloVe vectors and write predictions.

    Training and testing wall-clock durations are printed to stdout.

    Args:
        train_file: CSV with ``title``, ``plot_synopsis`` and the nine label
            columns listed in ``label_columns`` below.
        test_file: CSV with ``ID``, ``title`` and ``plot_synopsis`` columns.
        output_file: Destination CSV; written without header or index, one
            row per test document holding the ID followed by the predicted
            label values.
        glove_file: Path to the GloVe text file.
        num_features: Length of the vectors stored in ``glove_file``.
    """
    label_columns = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']

    glove_model = load_glove_model(glove_file)

    train_data = pd.read_csv(train_file)
    train_document_vectors = _vectorize_documents(train_data, glove_model, num_features)
    train_label = train_data[label_columns].values

    test_data = pd.read_csv(test_file)
    test_document_vectors = _vectorize_documents(test_data, glove_model, num_features)

    svm_classifier = OneVsRestClassifier(LinearSVC(C=3), n_jobs=-1)
    # perf_counter is monotonic, so the measured durations cannot be skewed
    # by system clock adjustments.
    start_training_time = time.perf_counter()
    svm_classifier.fit(train_document_vectors, train_label)
    end_training_time = time.perf_counter()
    print("Training time: " + str(end_training_time - start_training_time))

    start_testing_time = time.perf_counter()
    test_label = svm_classifier.predict(test_document_vectors)
    end_testing_time = time.perf_counter()
    print("Testing time: " + str(end_testing_time - start_testing_time))

    result_df = pd.DataFrame({'doc_id': test_data['ID']})
    # Give the predictions the same index as the IDs so the column-wise
    # concat pairs row i with row i.
    predictions_df = pd.DataFrame(test_label, index=test_data.index)
    result_df = pd.concat([result_df, predictions_df], axis=1)
    result_df.to_csv(output_file, index=False, header=False)

# Run the SVM pipeline end to end: load the GloVe file named below, train on
# the training CSV, and write predictions for the Task 2 test CSV to the
# method-a output file. num_features is left at its default (300).
glove_file = 'glove.6B.300d.txt'
train_and_predict('./data/Training-dataset.csv', './data/Task-2-test-dataset1.csv', '10728942-Task2-method-a.csv', glove_file)


# LSTM

In [2]:
!pip install nltk



In [8]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Dense, SpatialDropout1D
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import string
from tqdm import tqdm
import time

# Fetch the NLTK data packages (stop-word lists, Punkt, WordNet) that the
# tokenization, stop-word filtering and lemmatization in this cell rely on.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def generate_output(model, X_test, output_file, ids=None):
    """Predict labels for ``X_test`` and write them to a header-less CSV.

    The prediction wall-clock duration is printed to stdout. Each output row
    holds the document ID followed by one 0/1 value per model output.

    Args:
        model: Object exposing ``predict(X_test)``; the returned scores are
            thresholded at 0.5 (scores >= 0.5 become 1).
        X_test: Model input, passed straight to ``model.predict``.
        output_file: Path of the CSV to write.
        ids: Optional document IDs, one per row of ``X_test``. When omitted,
            the module-level ``test_ids`` is used so existing three-argument
            calls keep working.
    """
    if ids is None:
        # Backward compatibility: the original implementation read this global.
        ids = test_ids

    start_testing_time = time.perf_counter()
    predictions = model.predict(X_test)
    end_testing_time = time.perf_counter()
    print("Testing time: " + str(end_testing_time - start_testing_time))

    binary_predictions = (predictions >= 0.5).astype(int)

    output_df = pd.DataFrame(binary_predictions)
    # Insert the IDs positionally. Inserting a Series would align on its
    # index and silently yield NaN IDs whenever that index is not 0..n-1.
    output_df.insert(0, 'ID', np.asarray(ids))
    output_df.to_csv(output_file, index=False, header=False)

def load_glove_embeddings(embeddings_path):
    """Load GloVe word vectors from a text file, showing a progress bar.

    Each non-blank line is expected to hold a token followed by its vector
    components. Blank or whitespace-only lines (for example a trailing empty
    line) are skipped instead of raising ``IndexError``.

    Args:
        embeddings_path: Path to the UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.
    """
    embeddings_index = {}
    with open(embeddings_path, encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading GloVe", unit=" vectors"):
            values = line.split()
            if not values:
                # Nothing on this line: there is no token to index.
                continue
            word, *components = values
            embeddings_index[word] = np.asarray(components, dtype='float32')
    return embeddings_index

# Load the pre-trained GloVe vectors once at module level; the resulting
# dict is passed to create_embedding_matrix further down.
glove_path = 'glove.6B.300d.txt'
glove_embeddings_index = load_glove_embeddings(glove_path)

# Shared lemmatizer instance, read as a global by
# remove_stopwords_and_punctuation.
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_punctuation(text):
    """Return ``text`` lowercased, filtered and lemmatized, joined by spaces.

    Tokens whose lowercase form is an English stop word or a single
    punctuation character are dropped; the rest are lowercased and passed
    through the module-level ``lemmatizer``.
    """
    # A token is rejected when it is in either set, so one merged set gives
    # the same membership answer as testing both.
    excluded = set(stopwords.words('english')) | set(string.punctuation)

    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in excluded:
            continue
        kept.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept)

def create_embedding_matrix(tokenizer, embeddings_index, embedding_dim):
    """Build the embedding weight matrix for the tokenizer's vocabulary.

    Row ``i`` holds the pre-trained vector of the word whose tokenizer index
    is ``i``. Rows for indices with no pre-trained vector stay all-zero.

    Args:
        tokenizer: Object with a ``word_index`` mapping (word -> int index)
            and, optionally, a ``num_words`` cap on the indices it emits.
        embeddings_index: Mapping from word to its pre-trained vector.
        embedding_dim: Length of each pre-trained vector.

    Returns:
        NumPy array of shape ``(num_rows, embedding_dim)`` where ``num_rows``
        is ``len(word_index) + 1``, reduced to ``tokenizer.num_words`` when
        that cap is set and smaller.
    """
    word_index = tokenizer.word_index

    # Size the matrix by the vocabulary's index range, not by how many
    # pre-trained vectors exist: the two are unrelated, and a small embedding
    # table would otherwise truncate the matrix and drop valid indices.
    num_words = len(word_index) + 1
    vocab_cap = getattr(tokenizer, 'num_words', None)
    if vocab_cap is not None:
        num_words = min(num_words, vocab_cap)

    embedding_matrix = np.zeros((num_words, embedding_dim))

    for word, i in word_index.items():
        if i >= num_words:
            # Index lies beyond the cap, so it has no row in the matrix.
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

def preprocess_data(file_path, tokenizer, max_len):
    """Load a dataset CSV and convert its text into padded index sequences.

    Args:
        file_path: CSV with ``ID``, ``title`` and ``plot_synopsis`` columns.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_len: Sequence length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        Tuple ``(ids, padded)``: the ``ID`` column of the CSV and the padded
        sequences produced by ``pad_sequences``.
    """
    df = pd.read_csv(file_path)
    # A missing title or synopsis would turn the concatenation into NaN (a
    # float), which the tokenizer cannot process; treat missing text as empty.
    raw_text = df['title'].fillna('') + ' ' + df['plot_synopsis'].fillna('')
    documents = raw_text.apply(remove_stopwords_and_punctuation)
    sequences = tokenizer.texts_to_sequences(documents)
    padded = pad_sequences(sequences, maxlen=max_len)

    return df['ID'], padded

# Load the training set and build the cleaned "title + synopsis" text per row.
train_file_path = './data/Training-dataset.csv'
df_train = pd.read_csv(train_file_path)
df_train['processed_text'] = (df_train['title'] + ' ' + df_train['plot_synopsis']).apply(remove_stopwords_and_punctuation)

# Tokenizer vocabulary cap and the fixed length every sequence is padded to.
# NOTE(review): 88151 looks dataset-specific (presumably vocabulary size + 1)
# — confirm against len(tokenizer.word_index) after fitting.
max_words = 88151
max_len = 500
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_train['processed_text'])
sequences = tokenizer.texts_to_sequences(df_train['processed_text'])
X_train = pad_sequences(sequences, maxlen=max_len)

# Weight matrix for the embedding layer, built from the GloVe vectors loaded
# earlier; embedding_dim must match the length of those vectors.
embedding_dim = 300
embedding_matrix = create_embedding_matrix(tokenizer, glove_embeddings_index, embedding_dim)

# Frozen pre-trained embeddings -> spatial dropout -> LSTM -> 9 sigmoid
# outputs, trained with binary cross-entropy.
model = Sequential()
# Take input_dim from the matrix itself so the layer always matches the
# weights it is initialised with, instead of relying on max_words happening
# to equal the number of rows in embedding_matrix.
model.add(Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(9, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the padded sequences against the nine label columns of df_train,
# with validation_split=0.2 passed to fit, and report the wall-clock
# training time.
batch_size = 32
epochs = 10
start_training_time = time.time()
model.fit(X_train, df_train[['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']], epochs=epochs, batch_size=batch_size, validation_split=0.2)
end_training_time = time.time()
print("Training time: " + str(end_training_time - start_training_time))

# Preprocess the validation set with the tokenizer fitted on the training
# data, then write thresholded predictions to the method-b output file.
# NOTE: test_ids must stay a module-level name; the three-argument
# generate_output call below depends on it.
test_file_path = './data/Task-2-validation-dataset.csv'
test_ids, X_test = preprocess_data(test_file_path, tokenizer, max_len)

output_file = '10728942-Task2-method-b-validation.csv'
generate_output(model, X_test, output_file)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Loading GloVe: 400001 vectors [00:33, 11998.49 vectors/s]


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training time: 5126.841902971268


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Testing time: 5.07581639289856


# Evaluation

In [9]:
%cd /content/drive/MyDrive/Colab Notebooks/NLP CW/Task 2

!python task2_eval_script_student_version.py '10728942-Task2-method-b-validation.csv' 'Task-2-validation-dataset.csv'

/content/drive/MyDrive/Colab Notebooks/NLP CW/Task 2
Class level: 
Class  1 F1 score: 0.2601
Class  2 F1 score: 0.3869
Class  3 F1 score: 0.1873
Class  4 F1 score: 0.1379
Class  5 F1 score: 0.7472
Class  6 F1 score: 0.1463
Class  7 F1 score: 0.5992
Class  8 F1 score: 0.0000
Class  9 F1 score: 0.6501
----------------------------
Movie (document) level: 
Precision: 0.5810
Recall: 0.5123
