## Experiments

Here you can organize all the experiments and exploration as you figure out how to collect and analyze your data and build your NLP tool. The experiments you conduct here will contribute to the report/presentation of your project.

Once you've finalized everything, you should then transfer the parts that are necessary for your demo to the code in the `nlp` folder.

In [1]:
import pandas as pd


df = pd.read_csv('Data/netfix_cleaned.csv')

# Display the top 5 rows of the dataframe
print(df.head())

# Basic information about the dataset
print("\nDataset Info:")
df.info()

# Descriptive statistics for numeric columns
print("\nDescriptive Statistics:")
print(df.describe())

# Checking for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Check the distribution of a categorical variable (if applicable)
if 'category_column_name' in df.columns:
    print("\nCategory Distribution:")
    print(df['category_column_name'].value_counts())

# Feel free to replace 'category_column_name' with an actual column name from your dataset
# that you're interested to explore.

# Another useful exploration is to see the number of unique values in each column
print("\nUnique Values per Column:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()}")

# Displaying the distribution of numeric data
# Importing necessary libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set the visualisation style
sns.set(style="whitegrid")

# Plotting the distribution of a numeric variable (if applicable)
if 'numeric_column_name' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(df['numeric_column_name'], kde=True, bins=30)
    plt.title('Distribution of Numeric Column')
    plt.xlabel('Numeric Column Name')
    plt.ylabel('Frequency')
    plt.show()

# Remember to replace 'numeric_column_name' with an actual numeric column name from your dataset.


                 names  release_year maturity_rating duration  \
0        Mission Majnu          2023        U/A 16+     2h 9m   
1               Cirkus          2022         U/A 7+    2h 14m   
2  Gangubai Kathiawadi          2022        U/A 16+    2h 33m   
3              Thunivu          2023        U/A 16+    2h 22m   
4    Bhool Bhulaiyaa 2          2022        U/A 13+    2h 21m   

                                         description  \
0  In the 1970s, an undercover Indian spy takes o...   
1  Chaos and comedy take the spotlight when a rin...   
2  Duped and sold to a brothel, a young woman fea...   
3  A major bank heist takes an unnerving turn whe...   
4  When strangers Reet and Ruhan cross paths, the...   

                                               genre         mood  \
0  ['Spy Movies', 'Hindi-Language Movies', 'Bolly...  Suspenseful   
1  ['Hindi-Language Movies', 'Bollywood Movies', ...        Goofy   
2  ['Hindi-Language Movies', 'Movies Based on Boo...  Provocative

In [None]:
df["mood"].unique()

In [2]:
df["mood"] = df["mood"].fillna("Unlabeled")


In [3]:
df["mood"]

0          Suspenseful
1                Goofy
2          Provocative
3             Exciting
4             Offbeat,
            ...       
555          Feel-Good
556    Adrenaline Rush
557       Bittersweet,
558         Emotional,
559       Sentimental,
Name: mood, Length: 560, dtype: object

In [4]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Example sentences
sentences = ["This is an example sentence.", "Another sentence here."]

# Prepare data for Doc2Vec
tagged_data = [TaggedDocument(words=_d.lower().split(), tags=[str(i)]) for i, _d in enumerate(sentences)]

# Train a Doc2Vec model
model = Doc2Vec(tagged_data, vector_size=100, window=2, min_count=1, workers=4)

# Infer vector for a new sentence
vector = model.infer_vector("This is an example sentence.".split())

In [5]:
df["description"]

0      In the 1970s, an undercover Indian spy takes o...
1      Chaos and comedy take the spotlight when a rin...
2      Duped and sold to a brothel, a young woman fea...
3      A major bank heist takes an unnerving turn whe...
4      When strangers Reet and Ruhan cross paths, the...
                             ...                        
555    A revenge-seeking diamond thief gathers a ragt...
556    A fearless warrior on a perilous mission comes...
557    This biopic follows the life of Indian Army of...
558    Reincarnated 30 years after being killed in a ...
559    Years after his father disowns his adopted bro...
Name: description, Length: 560, dtype: object

In [6]:
from gensim.models.doc2vec import TaggedDocument

documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(df["description"])]


In [7]:
from gensim.models import Doc2Vec

#model = Doc2Vec(vector_size=50, window=2, min_count=1, workers=4, epochs=40)
#model.build_vocab(documents)
#model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)


In [8]:
model

<gensim.models.doc2vec.Doc2Vec at 0x7fdda1e79ae0>

In [9]:
model_path = "../notebooks/Model.model"
#model.save(model_path)

model = Doc2Vec.load(model_path)

In [10]:
# Vectorize each description
df['descript_vectors'] = df['description'].apply(lambda x: model.infer_vector(x.split()))


In [11]:
unique_moods = df['mood'].unique()
label_to_index = {label: index for index, label in enumerate(unique_moods)}
label_to_index = {label: index for index, label in enumerate(unique_moods)}



In [12]:
import numpy as np

# Function to generate a one-hot encoded vector
def encode_mood(mood):
    # Initialize a vector of zeros
    vector = np.zeros(len(unique_moods))
    # Set the position corresponding to the label to 1
    vector[label_to_index[mood]] = 1
    return vector

# Apply the function to the "mood" column to create the new "mood_encoded" column
df['mood_encoded'] = df['mood'].apply(encode_mood)


In [13]:
df

Unnamed: 0,names,release_year,maturity_rating,duration,description,genre,mood,cast,subtitles,audio,descript_vectors,mood_encoded
0,Mission Majnu,2023,U/A 16+,2h 9m,"In the 1970s, an undercover Indian spy takes o...","['Spy Movies', 'Hindi-Language Movies', 'Bolly...",Suspenseful,"['Sidharth Malhotra', 'Rashmika Mandanna', 'Pa...","['English,', 'English,', 'Hindi']","['English,', 'Hindi - Audio Description,', 'Hi...","[-0.1500155, 0.08632618, -0.0152871935, -0.083...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Cirkus,2022,U/A 7+,2h 14m,Chaos and comedy take the spotlight when a rin...,"['Hindi-Language Movies', 'Bollywood Movies', ...",Goofy,"['Ranveer Singh', 'Varun Sharma', 'Pooja Hegde...","['English,', 'Hindi']",['Hindi [Original]'],"[-0.25588515, 0.2742026, 0.1460689, -0.0527879...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Gangubai Kathiawadi,2022,U/A 16+,2h 33m,"Duped and sold to a brothel, a young woman fea...","['Hindi-Language Movies', 'Movies Based on Boo...",Provocative,"['Alia Bhatt', 'Vijay Raaz', 'Seema Pahwa', 'A...","['English,', 'Hindi']","['Hindi [Original],', 'Telugu']","[-0.1339833, 0.2899793, 0.17676055, 0.1969521,...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Thunivu,2023,U/A 16+,2h 22m,A major bank heist takes an unnerving turn whe...,"['Crime Movies', 'Action & Adventure']",Exciting,"['Ajith Kumar', 'Manju Warrier', 'Samuthirakan...","['English,', 'Hindi']",['Tamil [Original]'],"[-0.2917229, 0.14196143, 0.16030501, 0.1659849...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Bhool Bhulaiyaa 2,2022,U/A 13+,2h 21m,"When strangers Reet and Ruhan cross paths, the...","['Hindi-Language Movies', 'Bollywood Movies', ...","Offbeat,","['Tabu', 'Kartik Aaryan', 'Kiara Advani', 'Raj...","['English,', 'Hindi']",['Hindi [Original]'],"[0.051226757, 0.036891937, 0.18064724, -0.0406...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
555,Happy New Year,2014,U/A 13+,2h 58m,A revenge-seeking diamond thief gathers a ragt...,"['Hindi-Language Movies', 'Bollywood Movies', ...",Feel-Good,"['Shah Rukh Khan', 'Deepika Padukone', 'Abhish...",['English'],['Hindi [Original]'],"[-0.18083912, 0.08293516, 0.054100316, 0.03080...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
556,RRR (Hindi),2022,A,3h 5m,A fearless warrior on a perilous mission comes...,"['Action & Adventure', 'Dramas']",Adrenaline Rush,"['NTR Jr.', 'Ram Charan', 'Ajay Devgn', 'Alia ...","['English,', 'Hindi']","['English,', 'Hindi']","[-0.04770275, 0.16565038, -0.065784074, -0.366...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
557,Major (Telugu),2022,U/A 16+,2h 25m,This biopic follows the life of Indian Army of...,"['Movies Based on Real Life', 'Action & Advent...","Bittersweet,","['Adivi Sesh', 'Prakash Raj', 'Sobhita Dhulipa...",['English'],['Telugu [Original]'],"[-0.14330018, 0.15610804, 0.10171779, -0.14134...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
558,Om Shanti Om,2007,U/A 16+,2h 48m,Reincarnated 30 years after being killed in a ...,"['Romantic Comedies', 'Hindi-Language Movies',...","Emotional,","['Shah Rukh Khan', 'Deepika Padukone', 'Shreya...",['English'],['Hindi [Original]'],"[-0.21662304, 0.35099527, 0.2030976, 0.0746853...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."


In [14]:
import numpy as np

# Assuming `descript_vectors` is a list of lists and `mood_encoded` is a list of one-hot encoded lists
X = np.array(df['descript_vectors'].tolist())
y = np.array(df['mood_encoded'].tolist())

# LSTM expects 3D input: [samples, time_steps, features]
# If `descript_vectors` are not in sequence form (e.g., single vector per description), you may need to reshape or add a dimension
# For example, if your vectors are already in the desired sequence form, you might not need to reshape
# If you need to add a time_step dimension (assuming each vector represents a single time step):
#X = np.expand_dims(X, 1)  # Adds a time_step dimension


Trying Logistic Regression First

In [15]:
# If y is one-hot encoded, convert it to integer labels
y_int_labels = np.argmax(y, axis=1)

In [16]:
from sklearn.model_selection import train_test_split

# Split the data while retaining indices
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y_int_labels, df.index, test_size=0.2, random_state=42)



In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create a logistic regression model
# Using a pipeline to include standard scaling of the data
# Solver 'lbfgs' is a good default choice; you might need to increase `max_iter` for convergence
model = make_pipeline(StandardScaler(), LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000))

# Train the model
model.fit(X_train, y_train)


In [18]:
from sklearn.metrics import accuracy_score

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.6428571428571429


In [19]:

# Convert integer labels back to mood strings if necessary
# Assuming you have a mapping from step 2 of the logistic regression part
mood_mapping = {index: mood for mood, index in label_to_index.items()}
predicted_moods = [mood_mapping[label] for label in y_pred]

# Retrieve corresponding rows from the original DataFrame
test_descriptions = df.loc[indices_test, 'description']
actual_moods = df.loc[indices_test, 'mood']

# Display a few examples
for i, (desc, pred_mood, actual_mood) in enumerate(zip(test_descriptions, predicted_moods, actual_moods)):
    print(f"Description: {desc}\nPredicted Mood: {pred_mood}\nActual Mood: {actual_mood}\n")
    if i == 10:  # Change this number to display more or fewer examples
        break

Description: An ancient rivalry pits two legends against each other while a mission into uncharted terrain unearths clues to the Titans' very origins.
Predicted Mood: Exciting
Actual Mood: Exciting

Description: When an earthquake obliterates their cave, an unworldly prehistoric family is forced to journey through unfamiliar terrain in search of a new home.
Predicted Mood: Exciting
Actual Mood: Exciting

Description: After a mother's sudden death, chaos and grief collide when four adult siblings return to their traditional father's home for the funeral.
Predicted Mood: Quirky,
Actual Mood: Quirky,

Description: Josh and his parents are shocked to find a singing crocodile in their attic but quickly learn to love him. If only their grumpy neighbor felt the same!
Predicted Mood: Feel-Good
Actual Mood: Feel-Good

Description: Four animal friends get a taste of the wild life when they break out of captivity at the Central Park Zoo and wash ashore on the island of Madagascar.
Predicted Mood:

Now Trying LSTM Model

In [45]:
X = np.array(df['descript_vectors'].tolist())
y = np.array(df['mood_encoded'].tolist())

Unnamed: 0,names,release_year,maturity_rating,duration,description,genre,mood,cast,subtitles,audio,descript_vectors,mood_encoded
0,Mission Majnu,2023,U/A 16+,2h 9m,"In the 1970s, an undercover Indian spy takes o...","['Spy Movies', 'Hindi-Language Movies', 'Bolly...",Suspenseful,"['Sidharth Malhotra', 'Rashmika Mandanna', 'Pa...","['English,', 'English,', 'Hindi']","['English,', 'Hindi - Audio Description,', 'Hi...","[-0.1500155, 0.08632618, -0.0152871935, -0.083...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Cirkus,2022,U/A 7+,2h 14m,Chaos and comedy take the spotlight when a rin...,"['Hindi-Language Movies', 'Bollywood Movies', ...",Goofy,"['Ranveer Singh', 'Varun Sharma', 'Pooja Hegde...","['English,', 'Hindi']",['Hindi [Original]'],"[-0.25588515, 0.2742026, 0.1460689, -0.0527879...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Gangubai Kathiawadi,2022,U/A 16+,2h 33m,"Duped and sold to a brothel, a young woman fea...","['Hindi-Language Movies', 'Movies Based on Boo...",Provocative,"['Alia Bhatt', 'Vijay Raaz', 'Seema Pahwa', 'A...","['English,', 'Hindi']","['Hindi [Original],', 'Telugu']","[-0.1339833, 0.2899793, 0.17676055, 0.1969521,...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Thunivu,2023,U/A 16+,2h 22m,A major bank heist takes an unnerving turn whe...,"['Crime Movies', 'Action & Adventure']",Exciting,"['Ajith Kumar', 'Manju Warrier', 'Samuthirakan...","['English,', 'Hindi']",['Tamil [Original]'],"[-0.2917229, 0.14196143, 0.16030501, 0.1659849...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Bhool Bhulaiyaa 2,2022,U/A 13+,2h 21m,"When strangers Reet and Ruhan cross paths, the...","['Hindi-Language Movies', 'Bollywood Movies', ...","Offbeat,","['Tabu', 'Kartik Aaryan', 'Kiara Advani', 'Raj...","['English,', 'Hindi']",['Hindi [Original]'],"[0.051226757, 0.036891937, 0.18064724, -0.0406...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
555,Happy New Year,2014,U/A 13+,2h 58m,A revenge-seeking diamond thief gathers a ragt...,"['Hindi-Language Movies', 'Bollywood Movies', ...",Feel-Good,"['Shah Rukh Khan', 'Deepika Padukone', 'Abhish...",['English'],['Hindi [Original]'],"[-0.18083912, 0.08293516, 0.054100316, 0.03080...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
556,RRR (Hindi),2022,A,3h 5m,A fearless warrior on a perilous mission comes...,"['Action & Adventure', 'Dramas']",Adrenaline Rush,"['NTR Jr.', 'Ram Charan', 'Ajay Devgn', 'Alia ...","['English,', 'Hindi']","['English,', 'Hindi']","[-0.04770275, 0.16565038, -0.065784074, -0.366...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
557,Major (Telugu),2022,U/A 16+,2h 25m,This biopic follows the life of Indian Army of...,"['Movies Based on Real Life', 'Action & Advent...","Bittersweet,","['Adivi Sesh', 'Prakash Raj', 'Sobhita Dhulipa...",['English'],['Telugu [Original]'],"[-0.14330018, 0.15610804, 0.10171779, -0.14134...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
558,Om Shanti Om,2007,U/A 16+,2h 48m,Reincarnated 30 years after being killed in a ...,"['Romantic Comedies', 'Hindi-Language Movies',...","Emotional,","['Shah Rukh Khan', 'Deepika Padukone', 'Shreya...",['English'],['Hindi [Original]'],"[-0.21662304, 0.35099527, 0.2030976, 0.0746853...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the LSTM model class
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initialize hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate the LSTM
        out, _ = self.lstm(x, (h0, c0))
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

In [21]:
# Assuming X and y are already numpy arrays loaded as described previously
# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# If your y values are one-hot encoded and you want to use CrossEntropyLoss,
# you should convert them to a long tensor with class indices instead:
_, y_indices = y_tensor.max(dim=1)
y_indices = y_indices.type(torch.long)

In [22]:
print(X.shape)
X = X.reshape((X.shape[0], 1, X.shape[1]))
print(X.shape)

(560, 50)
(560, 1, 50)


In [23]:
# Set the input size, hidden size, output size, and number of layers
input_size = X.shape[2]  # Features
hidden_size = 50  # Can vary
output_size = y.shape[1]  # Number of classes
num_layers = 1  # Can vary

# Initialize the model
model = LSTMClassifier(input_size, hidden_size, output_size, num_layers)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [47]:
# Set the number of epochs
num_epochs = 10

# Convert to PyTorch data types
train_data = torch.utils.data.TensorDataset(X_tensor, y_indices)
train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=64, shuffle=True)

# Training process
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')



RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors

In [50]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
vec1 = CountVectorizer(min_df=5, ngram_range=(1,3), binary=True, stop_words='english')

In [51]:
vec = TfidfVectorizer()

In [52]:
X_count = vec.fit_transform(df.description)
y_count = df['mood_encoded']

In [53]:
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X_count, y_int_labels, df.index, test_size=0.2, random_state=42)


In [54]:
clf = LogisticRegression(max_iter=1000, C=2, class_weight='balanced')         
    # (4) Finally, train on ALL data one final time and
    # train. Save the classifier to disk.
clf.fit(X_train, y_train)

In [55]:
# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.7946428571428571


Countvectorizer worked better