In [6]:
import pandas as pd


df = pd.read_csv('Data/netfix_cleaned.csv')

# Display the top 5 rows of the dataframe
print(df.head())

# Basic information about the dataset
print("\nDataset Info:")
df.info()

# Descriptive statistics for numeric columns
print("\nDescriptive Statistics:")
print(df.describe())

# Checking for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Check the distribution of a categorical variable (if applicable)
if 'category_column_name' in df.columns:
    print("\nCategory Distribution:")
    print(df['category_column_name'].value_counts())

# Feel free to replace 'category_column_name' with an actual column name from your dataset
# that you're interested to explore.

# Another useful exploration is to see the number of unique values in each column
print("\nUnique Values per Column:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()}")

# Displaying the distribution of numeric data
# Importing necessary libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set the visualisation style
sns.set(style="whitegrid")

# Plotting the distribution of a numeric variable (if applicable)
if 'numeric_column_name' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(df['numeric_column_name'], kde=True, bins=30)
    plt.title('Distribution of Numeric Column')
    plt.xlabel('Numeric Column Name')
    plt.ylabel('Frequency')
    plt.show()

# Remember to replace 'numeric_column_name' with an actual numeric column name from your dataset.
df["mood"] = df["mood"].fillna("Unlabeled")
# Extract hours and minutes from the duration column
df['hours'] = df['duration'].str.extract('(\d+)h').fillna(0)
df['minutes'] = df['duration'].str.extract('(\d+)m').fillna(0)

# Convert the hours and minutes to integers
df['hours'] = df['hours'].astype(int)
df['minutes'] = df['minutes'].astype(int)

# Calculate the total minutes
df['total_minutes'] = df['hours'] * 60 + df['minutes']

# Now you can drop the 'hours' and 'minutes' columns if they are not needed
df = df.drop(['hours', 'minutes'], axis=1)


                 names  release_year maturity_rating duration  \
0        Mission Majnu          2023        U/A 16+     2h 9m   
1               Cirkus          2022         U/A 7+    2h 14m   
2  Gangubai Kathiawadi          2022        U/A 16+    2h 33m   
3              Thunivu          2023        U/A 16+    2h 22m   
4    Bhool Bhulaiyaa 2          2022        U/A 13+    2h 21m   

                                         description  \
0  In the 1970s, an undercover Indian spy takes o...   
1  Chaos and comedy take the spotlight when a rin...   
2  Duped and sold to a brothel, a young woman fea...   
3  A major bank heist takes an unnerving turn whe...   
4  When strangers Reet and Ruhan cross paths, the...   

                                               genre         mood  \
0  ['Spy Movies', 'Hindi-Language Movies', 'Bolly...  Suspenseful   
1  ['Hindi-Language Movies', 'Bollywood Movies', ...        Goofy   
2  ['Hindi-Language Movies', 'Movies Based on Boo...  Provocative

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        dense_outputs = self.fc(hidden[-1])
        return dense_outputs


#Determing Vocab size and setting up features

In [13]:
from collections import Counter

# Tokenize descriptions and build a vocabulary
all_tokens = [token for description in df["description"] for token in description.split()]
vocab = Counter(all_tokens)
vocab_size = len(vocab) + 1  # Adding 1 for 'unknown' token

# Determine output dimensions based on the task
output_dim = df['mood'].nunique()  # Adjust 'label' to your actual column name

# Initialize the model, loss function, and optimizer
model = LSTMModel(vocab_size, 100, 256, output_dim)
criterion = nn.CrossEntropyLoss()  # Assuming a multi-class classification problem
optimizer = optim.Adam(model.parameters())


In [16]:

from transformers import BertModel, BertTokenizer
import torch

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # Set the model to evaluation mode


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [17]:
# Tokenize and encode sentences in the DataFrame
encoded_inputs = tokenizer(df["description"].tolist(), padding=True, truncation=True, return_tensors="pt", max_length=512)


In [18]:
# Function to generate embeddings in batches
def generate_embeddings(model, encoded_inputs, batch_size=10):
    # Ensure inputs are on the same device as the model
    device = next(model.parameters()).device
    input_ids = encoded_inputs['input_ids'].to(device)
    attention_mask = encoded_inputs['attention_mask'].to(device)
    
    model.to(device)
    
    embeddings = []
    for i in range(0, len(input_ids), batch_size):
        batch_input_ids = input_ids[i:i+batch_size]
        batch_attention_mask = attention_mask[i:i+batch_size]
        
        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        
        # Use mean pooling to get a single vector for the sentence
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)
        embeddings.append(batch_embeddings.cpu())
    
    # Concatenate all batch embeddings
    embeddings = torch.cat(embeddings, dim=0)
    return embeddings

# Generate embeddings
embeddings = generate_embeddings(model, encoded_inputs)


In [23]:
embeddings.shape

torch.Size([560, 768])

In [24]:
from sklearn.preprocessing import LabelEncoder
import torch

# Encode mood labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(df["mood"])

# Convert labels to a tensor
labels_tensor = torch.tensor(labels_encoded, dtype=torch.long)


In [25]:
from sklearn.model_selection import train_test_split

# Split embeddings and labels into training and test sets
embeddings_train, embeddings_test, labels_train, labels_test = train_test_split(
    embeddings, labels_tensor, test_size=0.2, random_state=42
)

# Convert splits to tensors
embeddings_train_tensor = embeddings_train
embeddings_test_tensor = embeddings_test

In [26]:
class MoodPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MoodPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out

# Determine the number of unique moods to set the output size
output_size = len(torch.unique(labels_tensor))

# Initialize the model
model = MoodPredictor(input_size=embeddings.size(1), hidden_size=128, output_size=output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [28]:
num_epochs = 15

for epoch in range(num_epochs):
    model.train()
    
    # Convert embeddings and labels to Variables
    inputs = torch.autograd.Variable(embeddings_train_tensor)
    targets = torch.autograd.Variable(labels_train)

    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, targets)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/15], Loss: 3.7644
Epoch [2/15], Loss: 3.7512
Epoch [3/15], Loss: 3.7416
Epoch [4/15], Loss: 3.7358
Epoch [5/15], Loss: 3.7327
Epoch [6/15], Loss: 3.7311
Epoch [7/15], Loss: 3.7303
Epoch [8/15], Loss: 3.7300
Epoch [9/15], Loss: 3.7298
Epoch [10/15], Loss: 3.7297
Epoch [11/15], Loss: 3.7297
Epoch [12/15], Loss: 3.7297
Epoch [13/15], Loss: 3.7297
Epoch [14/15], Loss: 3.7297
Epoch [15/15], Loss: 3.7297


In [29]:
model.eval()  # Set the model to evaluation mode

# Convert test embeddings and labels to Variables
inputs_test = torch.autograd.Variable(embeddings_test_tensor)
labels_test = torch.autograd.Variable(labels_test)

with torch.no_grad():  # Inference mode, no gradients needed
    outputs_test = model(inputs_test)
    
    # Get predictions from the maximum value
    _, predicted_test = torch.max(outputs_test.data, 1)
    
    # Calculate the number of correctly predicted labels
    correct_predictions = (predicted_test == labels_test).sum().item()
    
    # Calculate the accuracy
    accuracy = correct_predictions / labels_test.size(0)
    print(f'Accuracy of the model on the test set: {accuracy * 100:.2f}%')


Accuracy of the model on the test set: 17.86%
