**Importing Necessary Libraries**

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

**Loading the Training Data and converting into a dataframe**

In [13]:
# Load and parse the data file
def load_data(file_path):
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(" ::: ")
            if len(parts) == 4:
                record_id, title, genre, plot_summary = parts
                records.append({'title': title, 'genre': genre, 'plot_summary': plot_summary})
    return pd.DataFrame(records)

train_data=load_data("train_data.txt")

In [15]:
train_data

Unnamed: 0,title,genre,plot_summary
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...
...,...,...,...
54209,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on B...
54210,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The siste...
54211,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about gr..."
54212,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and have...


**Preprocessing the Data**

In [16]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Remove extra spaces"
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [17]:
# Apply text preprocessing
train_data['plot_summary'] = train_data['plot_summary'].apply(preprocess_text)

**Removing the Stopwords**

In [18]:
from nltk.corpus import stopwords as sw
#nltk.download('stopwords') 
sw.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
# Remove stopwords
def remove_stopwords(text):
    stop_words = set(sw.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

In [20]:
 # Remove stopwords
train_data['plot_summary'] = train_data['plot_summary'].apply(remove_stopwords)

In [23]:
train_data['plot_summary'][1:5]

1    brother sister past incestuous relationship cu...
2    bus empties students field trip museum natural...
3    help unemployed father make ends meet edith tw...
4    films title refers unrecovered bodies ground z...
Name: plot_summary, dtype: object

**Loading the Test data and converting it into a dataframe**

In [22]:
test_file_path = 'test_data_solution.txt'  # Replace with your actual testing file path
test_data = load_data(test_file_path)

# Apply text preprocessing
test_data['plot_summary'] = test_data['plot_summary'].apply(preprocess_text)

# Remove stopwords
test_data['plot_summary'] = test_data['plot_summary'].apply(remove_stopwords)

**Encode the labels**

In [24]:
label_encoder = LabelEncoder()
train_data['genre'] = label_encoder.fit_transform(train_data['genre'])
test_data['genre'] = label_encoder.transform(test_data['genre'])

**Tokenize the text data**

In [25]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['plot_summary'])
X_train_seq = tokenizer.texts_to_sequences(train_data['plot_summary'])
X_test_seq = tokenizer.texts_to_sequences(test_data['plot_summary'])

**Pad the sequences**

In [26]:
max_seq_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length)

**Convert labels to categorical**

In [27]:
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(train_data['genre'], num_classes)
y_test_cat = to_categorical(test_data['genre'], num_classes)

**Define the LSTM model**

In [28]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))



In [29]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

**Train the model**

In [31]:
model.fit(X_train_pad, y_train_cat, epochs=50, batch_size=64, validation_data=(X_test_pad, y_test_cat), verbose=2)

Epoch 1/50
848/848 - 111s - 131ms/step - accuracy: 0.6639 - loss: 1.1097 - val_accuracy: 0.5708 - val_loss: 1.4616
Epoch 2/50
848/848 - 111s - 130ms/step - accuracy: 0.6816 - loss: 1.0446 - val_accuracy: 0.5652 - val_loss: 1.4868
Epoch 3/50
848/848 - 111s - 130ms/step - accuracy: 0.6979 - loss: 0.9846 - val_accuracy: 0.5657 - val_loss: 1.5323
Epoch 4/50
848/848 - 110s - 130ms/step - accuracy: 0.7096 - loss: 0.9356 - val_accuracy: 0.5600 - val_loss: 1.5600
Epoch 5/50
848/848 - 110s - 129ms/step - accuracy: 0.7227 - loss: 0.8835 - val_accuracy: 0.5559 - val_loss: 1.5973
Epoch 6/50
848/848 - 111s - 130ms/step - accuracy: 0.7381 - loss: 0.8345 - val_accuracy: 0.5553 - val_loss: 1.6688
Epoch 7/50
848/848 - 110s - 130ms/step - accuracy: 0.7515 - loss: 0.7910 - val_accuracy: 0.5440 - val_loss: 1.6922
Epoch 8/50
848/848 - 110s - 130ms/step - accuracy: 0.7615 - loss: 0.7519 - val_accuracy: 0.5489 - val_loss: 1.7371
Epoch 9/50
848/848 - 110s - 130ms/step - accuracy: 0.7706 - loss: 0.7158 - val_a

<keras.src.callbacks.history.History at 0x1d2be43cd90>

**Save the model**

In [33]:
model.save('movie_genre_model.keras')

**Evaluate the model**

In [34]:
y_pred = model.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=-1)

[1m1694/1694[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step


In [None]:
y_pred

**Print the classification report**

In [35]:
print("LSTM Classification Report:")
print(classification_report(test_data['genre'], y_pred_classes, target_names=label_encoder.classes_))

LSTM Classification Report:
              precision    recall  f1-score   support

      action       0.32      0.32      0.32      1314
       adult       0.41      0.35      0.38       590
   adventure       0.25      0.21      0.23       775
   animation       0.19      0.19      0.19       498
   biography       0.02      0.01      0.02       264
      comedy       0.48      0.51      0.50      7446
       crime       0.14      0.10      0.12       505
 documentary       0.70      0.73      0.71     13096
       drama       0.55      0.60      0.58     13612
      family       0.20      0.13      0.16       783
     fantasy       0.12      0.07      0.09       322
   game-show       0.60      0.63      0.62       193
     history       0.05      0.04      0.05       243
      horror       0.55      0.51      0.53      2204
       music       0.51      0.49      0.50       731
     musical       0.17      0.13      0.15       276
     mystery       0.10      0.06      0.08       318