In [1]:
import pandas as pd


df = pd.read_csv('Data/netfix_cleaned.csv')

# Display the top 5 rows of the dataframe
print(df.head())

# Basic information about the dataset
print("\nDataset Info:")
df.info()

# Descriptive statistics for numeric columns
print("\nDescriptive Statistics:")
print(df.describe())

# Checking for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Check the distribution of a categorical variable (if applicable)
if 'category_column_name' in df.columns:
    print("\nCategory Distribution:")
    print(df['category_column_name'].value_counts())

# Feel free to replace 'category_column_name' with an actual column name from your dataset
# that you're interested to explore.

# Another useful exploration is to see the number of unique values in each column
print("\nUnique Values per Column:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()}")

# Displaying the distribution of numeric data
# Importing necessary libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set the visualisation style
sns.set(style="whitegrid")

# Plotting the distribution of a numeric variable (if applicable)
if 'numeric_column_name' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(df['numeric_column_name'], kde=True, bins=30)
    plt.title('Distribution of Numeric Column')
    plt.xlabel('Numeric Column Name')
    plt.ylabel('Frequency')
    plt.show()

# Remember to replace 'numeric_column_name' with an actual numeric column name from your dataset.
df["mood"] = df["mood"].fillna("Unlabeled")


                 names  release_year maturity_rating duration  \
0        Mission Majnu          2023        U/A 16+     2h 9m   
1               Cirkus          2022         U/A 7+    2h 14m   
2  Gangubai Kathiawadi          2022        U/A 16+    2h 33m   
3              Thunivu          2023        U/A 16+    2h 22m   
4    Bhool Bhulaiyaa 2          2022        U/A 13+    2h 21m   

                                         description  \
0  In the 1970s, an undercover Indian spy takes o...   
1  Chaos and comedy take the spotlight when a rin...   
2  Duped and sold to a brothel, a young woman fea...   
3  A major bank heist takes an unnerving turn whe...   
4  When strangers Reet and Ruhan cross paths, the...   

                                               genre         mood  \
0  ['Spy Movies', 'Hindi-Language Movies', 'Bolly...  Suspenseful   
1  ['Hindi-Language Movies', 'Bollywood Movies', ...        Goofy   
2  ['Hindi-Language Movies', 'Movies Based on Boo...  Provocative

In [2]:
df.duration.isna().sum()

0

In [3]:
# Extract hours and minutes from the duration column
df['hours'] = df['duration'].str.extract('(\d+)h').fillna(0)
df['minutes'] = df['duration'].str.extract('(\d+)m').fillna(0)

# Convert the hours and minutes to integers
df['hours'] = df['hours'].astype(int)
df['minutes'] = df['minutes'].astype(int)

# Calculate the total minutes
df['total_minutes'] = df['hours'] * 60 + df['minutes']

# Now you can drop the 'hours' and 'minutes' columns if they are not needed
df = df.drop(['hours', 'minutes'], axis=1)



W2v works less well thatn IF-IVT

In [4]:
from gensim.models import KeyedVectors
import gensim.downloader as api
import numpy as np

# Load GloVe model
w2v = api.load('glove-wiki-gigaword-50')

# Adjust the function to work with the loaded model
def sentence_to_vec(sentence, model, num_features):
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    index2word_set = set(model.key_to_index)  # Updated for gensim 4.0.0 and later
    
    for word in sentence.split():
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model[word])  # Corrected access to word vector
    
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

# Since you're using glove-wiki-gigaword-50, each word vector has 50 dimensions
num_features = 50

# Apply the function to each description
# Assuming 'df' is your DataFrame and 'description' is the column with text data
vec_descriptions = np.array([sentence_to_vec(sentence, w2v, num_features) for sentence in df['description']])

# `vec_descriptions` is a 2D numpy array where each row represents a sentence


In [5]:
vec_descriptions

array([[ 0.610378  ,  0.21967086,  0.0120224 , ...,  0.30045193,
        -0.19191389, -0.09676496],
       [ 0.13837972,  0.14362648, -0.41620106, ..., -0.43784407,
        -0.20476098, -0.2689774 ],
       [ 0.21785122,  0.17795451, -0.11302451, ..., -0.50542885,
        -0.00165171, -0.18363637],
       ...,
       [ 0.29497018,  0.3061172 , -0.3795322 , ..., -0.26171908,
        -0.093453  , -0.107467  ],
       [ 0.28957808, -0.05298619,  0.04495952, ..., -0.1645367 ,
         0.15370815, -0.15710595],
       [ 0.21053235,  0.3161477 , -0.30063367, ..., -0.4940572 ,
         0.0823618 , -0.12313429]], dtype=float32)

In [6]:
import numpy as np
import torch
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

one_hot_encoder = OneHotEncoder(sparse=False)
encoded_release_year = one_hot_encoder.fit_transform(df[['release_year']])
encoded_maturity_rating = one_hot_encoder.fit_transform(df[['maturity_rating']])

# Normalize 'duration' (convert to total minutes)


vec = TfidfVectorizer()
encoded_description = vec.fit_transform(df.description).toarray() 


# Multi-hot encoding for 'genre' and 'cast'
mlb_genre = MultiLabelBinarizer()
mlb_cast = MultiLabelBinarizer()
encoded_genre = mlb_genre.fit_transform(df['genre'])
encoded_cast = mlb_cast.fit_transform(df['cast'])

# One-hot encoding for 'mood'
encoded_mood = one_hot_encoder.fit_transform(df[['mood']])

# Combine all features into a features tensor
features = np.hstack([encoded_release_year, encoded_maturity_rating, df[['total_minutes']].values, encoded_genre, encoded_cast, encoded_description])

# Convert to PyTorch tensor

features_tensor = torch.tensor(features, dtype=torch.float32)
mood_tensor = torch.tensor([encoded_mood])
print(encoded_description[0])


[0. 0. 0. ... 0. 0. 0.]


  mood_tensor = torch.tensor([encoded_mood])


In [19]:
features_tensor.shape
input_shape = features_tensor.shape[1]
input_shape

2348

In [23]:
features_tensor.shape

torch.Size([560, 2348])

In [22]:
mood_tensor.shape

torch.Size([560, 47])

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split

# Assuming features_tensor contains your features and mood_tensor is your one-hot encoded mood column
mood_classes = mood_tensor.shape[1]  # Number of unique mood classes
features_tensor = features_tensor 
 # Your existing feature tensor
mood_tensor = mood_tensor.squeeze(0)  # Your existing target tensor for mood

# Create Dataset
dataset = TensorDataset(features_tensor, mood_tensor)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)




In [21]:
import torch
import torch.nn as nn
import torch.optim as optim

class MoodPredictor(nn.Module):
    def __init__(self, input_size, mood_classes):
        super(MoodPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, mood_classes)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Adjust `input_size` and `mood_classes` according to your dataset
model = MoodPredictor(input_shape, mood_classes)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    # Validation step
    model.eval()  # Set model to evaluation mode
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_loss /= len(val_loader)
    val_accuracy = 100 * correct / total
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.2f}%')


ValueError: Target size (torch.Size([32, 47])) must be the same as input size (torch.Size([32, 560]))

In [None]:
model.eval()  # Set the model to evaluation mode
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        _, labels = torch.max(labels.data, 1)  # assuming your labels are also one-hot encoded
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 14.29%


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create a logistic regression model
# Using a pipeline to include standard scaling of the data
# Solver 'lbfgs' is a good default choice; you might need to increase `max_iter` for convergence
log = make_pipeline(StandardScaler(), LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000))

# Train the model


In [None]:
features_np = features_tensor.numpy()

# Assuming 'mood_tensor' needs to be a 1D numpy array for scikit-learn
# First, ensure 'mood_tensor' is correctly shaped. It looks like you might have an extra dimension.
# If 'encoded_mood' is your target variable in the correct format, you might not need to wrap it in an additional tensor.
# Here's how to reshape and convert it assuming 'encoded_mood' is a 2D numpy array where each row is a one-hot encoded target.
mood_np = np.argmax(encoded_mood, axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(features_np, mood_np, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
  # Increase max_iter if convergence issues occur

# Fit the model on the training data
log.fit(X_train, y_train)

# Make predictions on the test set
predictions = log.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8035714285714286
Confusion Matrix:
[[2 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 2 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 2]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       0.67      0.33      0.44         6
           3       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         7
           9       0.51      0.95      0.67        20
          11       1.00      0.80      0.89         5
          12       1.00      0.50      0.67         2
          13       0.83      0.83      0.83         6
          14       1.00      1.00      1.00         3
          16       0.00      0.00      0.00         0
          17       1.00      1.00      1.00         3
          18       1.00      1.00      1.00         5
          19       1.00      1.00      1.00         1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


So far logistic regression has performed better than the Neural Net