First step: loading, merging and cleaning the datasets

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shivamb/netflix-shows")

print("Path to dataset files:", path)

# Download latest version
path = kagglehub.dataset_download("ashpalsingh1525/imdb-movies-dataset")

print("Path to dataset files:", path)

import pandas as pd
import os

netflix_path = '/kaggle/input/netflix-shows/netflix_titles.csv'
imdb_path = '/kaggle/input/imdb-movies-dataset/imdb_movies.csv'

# Load datasets using pandas
try:
    netflix_df = pd.read_csv(netflix_path)
    imdb_df = pd.read_csv(imdb_path)
except FileNotFoundError:
    print(f"Error: One or both of the CSV files were not found. Please ensure the datasets have been downloaded correctly.")
    exit()


## --- Data Cleaning and Preprocessing for Netflix Data ---

# Select relevant columns
netflix_df = netflix_df[['title', 'type', 'description']]

# Remove rows with missing titles
netflix_df.dropna(subset=['title'], inplace=True)

# Convert titles to lowercase for case-insensitive matching
netflix_df['title'] = netflix_df['title'].str.lower()

# Remove duplicate entries based on title (keep the first occurrence)
netflix_df.drop_duplicates(subset='title', keep='first', inplace=True)


## --- Data Cleaning and Preprocessing for IMDB Data ---

# Select relevant columns
imdb_df = imdb_df[['names','score','orig_lang','genre']]

# Convert titles to lowercase for case-insensitive matching
imdb_df['names'] = imdb_df['names'].str.lower()

# Rename 'names' column to 'title' for easier merging
imdb_df['title'] = imdb_df['names']

# Remove rows with missing reviews
imdb_df.dropna(subset=['score'], inplace=True)

# Remove duplicate entries based on title (keep the first occurrence)
imdb_df.drop_duplicates(subset='names', keep='first', inplace=True)


# Merging datasets based on title
classified_data_set_netflix_and_imdb = pd.merge(netflix_df, imdb_df, on = 'title')

# Save the new dataset to a CSV file
classified_data_set_netflix_and_imdb.to_csv('drive/MyDrive/ds_movie/classified_data_set_netflix_and_imdb.csv', index=False)

print("Merged dataset saved as classified_data_set_netflix_and_imdb.csv")

Path to dataset files: /kaggle/input/netflix-shows
Path to dataset files: /kaggle/input/imdb-movies-dataset
Merged dataset saved as classified_data_set_netflix_and_imdb.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflix_df.dropna(subset=['title'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflix_df['title'] = netflix_df['title'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflix_df.drop_duplicates(subset='title', keep='first', inplace=True)


Second step: training a genre classifying model based on our data **(no need to run everytime)**

In [1]:
# Install dependencies
!pip install -q transformers datasets scikit-learn

# Imports
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import random

# Set seed for reproducibility
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

# Load dataset
df = pd.read_csv("drive/MyDrive/ds_movie/classified_data_set_netflix_and_imdb.csv")

# Basic preprocessing: keep description and genre
df = df[['description', 'genre']].dropna()

# Convert genre column to list of genres
df['genre'] = df['genre'].apply(lambda x: [g.strip() for g in x.split(',')])

# Binarize labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df['genre'])

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['description'], labels, test_size=0.2, random_state=42
)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize
train_encodings = tokenizer(
    list(train_texts), truncation=True, padding=True, max_length=128, return_tensors="pt"
)
val_encodings = tokenizer(
    list(val_texts), truncation=True, padding=True, max_length=128, return_tensors="pt"
)

# Dataset class
class GenreDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Create DataLoaders
train_dataset = GenreDataset(train_encodings, train_labels)
val_dataset = GenreDataset(val_encodings, val_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Load model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(mlb.classes_), problem_type="multi_label_classification"
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()
for epoch in range(3):
    print(f"Epoch {epoch+1}")
    total_train_loss = 0
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Training Loss: {total_train_loss / len(train_loader):.4f}")


# Evaluation loop (optional)
model.eval()
val_loss = 0
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        val_loss += outputs.loss.item()
print(f"Validation Loss: {val_loss / len(val_loader):.4f}")


# Save the trained model
# Define a path to save the model
model_save_path = "drive/MyDrive/ds_movie/bert_genre_classifier"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to {model_save_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1


  0%|          | 0/152 [00:00<?, ?it/s]

Training Loss: 0.3710
Epoch 2


  0%|          | 0/152 [00:00<?, ?it/s]

Training Loss: 0.2818
Epoch 3


  0%|          | 0/152 [00:00<?, ?it/s]

Training Loss: 0.2245
Validation Loss: 0.2719
Model saved to drive/MyDrive/ds_movie/bert_genre_classifier


Train logistic regression

In [None]:
# Logistic regression to predict HIT independent from the BERT model

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import re
import nltk
try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    print(f"Error downloading NLTK stopwords: {e}")

from nltk.corpus import stopwords

# Import joblib for saving and loading models and preprocessors
import joblib
import os # Import os module to check for file existence

# Define paths to save the model and preprocessors
model_save_path_lr = "drive/MyDrive/ds_movie/logistic_regression_hit_predictor.joblib"
vectorizer_save_path_lr = "drive/MyDrive/ds_movie/tfidf_vectorizer_lr.joblib"
mlb_save_path_lr = "drive/MyDrive/ds_movie/mlb_lr.joblib"
stopwords_save_path_lr = "drive/MyDrive/ds_movie/stopwords_lr.joblib" # Optionally save stopwords if needed later

# Check if saved model and preprocessors exist
if os.path.exists(model_save_path_lr) and os.path.exists(vectorizer_save_path_lr) and os.path.exists(mlb_save_path_lr):
    print("Loading saved model and preprocessors...")
    lr_model = joblib.load(model_save_path_lr)
    vectorizer_lr = joblib.load(vectorizer_save_path_lr)
    mlb_lr = joblib.load(mlb_save_path_lr)
    stopwords_english = joblib.load(stopwords_save_path_lr)
    print("Loaded saved stopwords.")
    print("Model and preprocessors loaded successfully.")

else:
    print("Saved model and preprocessors not found. Training new model...")
    # 1. Prepare data
    df_lr = pd.read_csv('drive/MyDrive/ds_movie/classified_data_set_netflix_and_imdb.csv')
    df_lr = df_lr[['description', 'genre', 'score']].dropna()
    df_lr['genre'] = df_lr['genre'].apply(lambda x: [g.strip() for g in x.split(',')])

    # 2. Preprocessing
    def preprocess(text):
        if isinstance(text, str):
            text = text.lower()
            text = re.sub(r'[^\w\s]', '', text)
            tokens = text.split()
            # Use the stopwords from nltk.corpus directly
            tokens = [w for w in tokens if w not in stopwords.words('english')]
            return " ".join(tokens)
        return ""

    df_lr['description_clean'] = df_lr['description'].apply(preprocess)
    df_lr['hit'] = df_lr['score'].apply(lambda x: 1 if x > 65 else 0)

    # 3. Vectorizing
    vectorizer_lr = TfidfVectorizer(max_features=3000)
    X_desc_lr = vectorizer_lr.fit_transform(df_lr['description_clean']).toarray()

    mlb_lr = MultiLabelBinarizer()
    X_genre_lr = mlb_lr.fit_transform(df_lr['genre'])

    X_lr = np.hstack([X_desc_lr, X_genre_lr])
    y_lr = df_lr['hit'].values

    # 4. Train model
    X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_lr, y_lr, test_size=0.2, random_state=42)
    lr_model = LogisticRegression(max_iter=200)
    lr_model.fit(X_train_lr, y_train_lr)

    # 5. Evaluation after training)
    y_pred_lr = lr_model.predict(X_test_lr)
    print("Accuracy:", accuracy_score(y_test_lr, y_pred_lr))
    print(classification_report(y_test_lr, y_pred_lr))

    # Save the trained model and preprocessors
    joblib.dump(lr_model, model_save_path_lr)
    joblib.dump(vectorizer_lr, vectorizer_save_path_lr)
    joblib.dump(mlb_lr, mlb_save_path_lr)
    joblib.dump(stopwords.words('english'), stopwords_save_path_lr)
    print("Model and preprocessors trained and saved.")


Saved model and preprocessors not found. Training new model...
Accuracy: 0.7302631578947368
              precision    recall  f1-score   support

           0       0.70      0.75      0.72       143
           1       0.76      0.71      0.74       161

    accuracy                           0.73       304
   macro avg       0.73      0.73      0.73       304
weighted avg       0.73      0.73      0.73       304

Model and preprocessors trained and saved.


Third step: ask for user input

In [None]:
# Ask for user input
text_to_predict = input("Write your movie idea: ")

Write your movie idea: After a global blackout wipes all digital records, a former hacker is hired by a rogue archivist to retrieve the last remaining physical backups of humanity’s knowledge. But powerful forces want to keep the world in ignorance. As he journeys across a lawless post-collapse America, he finds that some truths were buried for a reason. He’s forced to choose between freedom and forgetting. Knowledge is power—but also a curse.


Fourth step: predict genre and audience

In [None]:
# --- Genre Prediction ---
# Get saved model
model_save_path = "drive/MyDrive/ds_movie/bert_genre_classifier"

import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Load the dataset
df_for_mlb = pd.read_csv("drive/MyDrive/ds_movie/classified_data_set_netflix_and_imdb.csv")

try:
    # Try loading the dataset again to fit mlb if it's not already available (just to be safe)
    df_for_mlb = pd.read_csv("drive/MyDrive/ds_movie/classified_data_set_netflix_and_imdb.csv")
    df_for_mlb = df_for_mlb[['description', 'genre']].dropna()
    df_for_mlb['genre'] = df_for_mlb['genre'].apply(lambda x: [g.strip() for g in x.split(',')])
    mlb = MultiLabelBinarizer()
    mlb.fit(df_for_mlb['genre']) # Fit mlb with the genre labels from the dataset
    print("MultiLabelBinarizer fitted from data.")
except FileNotFoundError:
    print("Warning: classified_data_set_netflix_and_imdb.csv not found. mlb might not be initialized correctly.")
    # Handle error or ensure mlb is available from a previous cell

tokenizer = BertTokenizer.from_pretrained(model_save_path)

# Load the trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model structure
if 'mlb' in locals(): # Check if mlb is defined
    num_labels = len(mlb.classes_)
else:
    print("Error: MultiLabelBinarizer not initialized. Cannot load model with correct num_labels.")
    exit()

model = BertForSequenceClassification.from_pretrained(model_save_path, num_labels=num_labels, problem_type="multi_label_classification")
model.to(device)

print(f"Model and tokenizer loaded from {model_save_path}")

inputs = tokenizer(text_to_predict, return_tensors="pt", truncation=True, padding=True).to(device)

# Predict
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.sigmoid(logits).cpu().numpy().squeeze()

# Thresholding
predicted_indices = np.where(probs > 0.4)[0]
predicted_labels = mlb.classes_[predicted_indices]

# Print results
print(f"\nInput: {text_to_predict}")
print(f"Predicted Genres: {list(predicted_labels)}")
print(f"All Genre Probabilities:\n{dict(zip(mlb.classes_, probs))}")

MultiLabelBinarizer fitted from data.
Model and tokenizer loaded from drive/MyDrive/ds_movie/bert_genre_classifier

Input: After a global blackout wipes all digital records, a former hacker is hired by a rogue archivist to retrieve the last remaining physical backups of humanity’s knowledge. But powerful forces want to keep the world in ignorance. As he journeys across a lawless post-collapse America, he finds that some truths were buried for a reason. He’s forced to choose between freedom and forgetting. Knowledge is power—but also a curse.
Predicted Genres: ['Action', 'Science Fiction', 'Thriller']
All Genre Probabilities:
{'Action': np.float32(0.8484135), 'Adventure': np.float32(0.20147634), 'Animation': np.float32(0.03524376), 'Comedy': np.float32(0.04005445), 'Crime': np.float32(0.11394642), 'Documentary': np.float32(0.0135390945), 'Drama': np.float32(0.2759958), 'Family': np.float32(0.020217663), 'Fantasy': np.float32(0.04935081), 'History': np.float32(0.026049802), 'Horror': np.

In [None]:
# --- Audience Prediction ---

from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define the possible audience labels
audience_labels = [
    "This movie is for preschool children.",
    "This movie is for teenagers.",
    "This movie is for young adults.",
    "This movie is for adults.",
    "This movie is for seniors."
]

# Perform classification
result = classifier(text_to_predict, candidate_labels=audience_labels)

# Print results
print("Input:", text_to_predict)
print("\nPredicted audience:", result["labels"][0])
print("\nAll scores:")
for label, score in zip(result["labels"], result["scores"]):
    print(f"{label}: {score:.4f}")

Device set to use cpu


Input: After a global blackout wipes all digital records, a former hacker is hired by a rogue archivist to retrieve the last remaining physical backups of humanity’s knowledge. But powerful forces want to keep the world in ignorance. As he journeys across a lawless post-collapse America, he finds that some truths were buried for a reason. He’s forced to choose between freedom and forgetting. Knowledge is power—but also a curse.

Predicted audience: This movie is for adults.

All scores:
This movie is for adults.: 0.4270
This movie is for young adults.: 0.2483
This movie is for seniors.: 0.1664
This movie is for teenagers.: 0.0897
This movie is for preschool children.: 0.0686


Fifth step: predicting if the idea will be a hit or not through a logistic regression using our dataset

In [None]:
# Prediction function
import re
import nltk
from nltk.corpus import stopwords
import joblib

nltk.download('stopwords')


def preprocess(text):
        if isinstance(text, str):
            text = text.lower()
            text = re.sub(r'[^\w\s]', '', text)
            tokens = text.split()
            # Use the stopwords from nltk.corpus directly
            tokens = [w for w in tokens if w not in stopwords.words('english')]
            return " ".join(tokens)
        return ""

model_save_path_lr = "drive/MyDrive/ds_movie/logistic_regression_hit_predictor.joblib"
vectorizer_save_path_lr = "drive/MyDrive/ds_movie/tfidf_vectorizer_lr.joblib"
mlb_save_path_lr = "drive/MyDrive/ds_movie/mlb_lr.joblib"
stopwords_save_path_lr = "drive/MyDrive/ds_movie/stopwords_lr.joblib"

vectorizer_lr = joblib.load(vectorizer_save_path_lr)
mlb_lr = joblib.load(mlb_save_path_lr)
lr_model = joblib.load(model_save_path_lr)

def predict_hit_lr(text, genre_labels):
    desc_proc = preprocess(text)
    desc_vec = vectorizer_lr.transform([desc_proc]).toarray()

    if not isinstance(genre_labels, list):
        genre_labels = [str(genre_labels)]
    # Filter out genres not seen during training
    known_genres = [g for g in genre_labels if g in mlb_lr.classes_]
    if not known_genres:
         # If no known genres are present, create a zero vector of the correct size
        genre_vec = np.zeros((1, len(mlb_lr.classes_)))
    else:
        # Create a temporary list of lists for mlb_lr.transform
        genre_vec = mlb_lr.transform([known_genres])


    combined_vec = np.hstack([desc_vec, genre_vec])
    pred = lr_model.predict(combined_vec)[0]
    prob = lr_model.predict_proba(combined_vec)[0][1]
    return "Hit!" if pred == 1 else "Flop!", round(prob * 100, 2)

# Predicting and printing results
result, confidence = predict_hit_lr(text_to_predict, list(predicted_labels))
print(f"Prediction: {result} (Confidence: {confidence}%)")

Prediction: Flop! (Confidence: 23.79%)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Descriptive statistics

In [None]:
import pandas as pd

df = pd.read_csv('drive/MyDrive/ds_movie/classified_data_set_netflix_and_imdb.csv')

print(df.describe())
print(df.columns)
print(df["type"].value_counts())
print(df["orig_lang"].value_counts())
print(df["genre"].value_counts().head(20))

             score
count  1519.000000
mean     65.685319
std       8.713171
min       0.000000
25%      60.000000
50%      66.000000
75%      72.000000
max      86.000000
Index(['title', 'type', 'description', 'names', 'score', 'orig_lang', 'genre'], dtype='object')
type
Movie      1422
TV Show      97
Name: count, dtype: int64
orig_lang
English               1278
Japanese                43
Spanish, Castilian      38
Korean                  32
French                  28
Chinese                 21
Cantonese               11
Hindi                    8
German                   8
Thai                     8
Italian                  7
Norwegian                7
Portuguese               6
Swedish                  4
Russian                  4
Dutch, Flemish           4
Polish                   3
Turkish                  2
Indonesian               2
Danish                   2
Malay                    1
Basque                   1
Icelandic                1
Name: count, dtype: int64
genre
Drama  


# **TEST RUN WITH MULTIPLE PROMPTS**

In [None]:
# Testing with prompts to be evaluated

import pandas as pd

test = pd.read_excel('drive/MyDrive/ds_movie/test_run.xlsx')


In [None]:
# --- Genre Prediction ---
# Get saved model
model_save_path = "drive/MyDrive/ds_movie/bert_genre_classifier"

import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Load the dataset
df_for_mlb = pd.read_csv("drive/MyDrive/ds_movie/classified_data_set_netflix_and_imdb.csv")

try:
    # Try loading the dataset again to fit mlb if it's not already available (just to be safe)
    df_for_mlb = pd.read_csv("drive/MyDrive/ds_movie/classified_data_set_netflix_and_imdb.csv")
    df_for_mlb = df_for_mlb[['description', 'genre']].dropna()
    df_for_mlb['genre'] = df_for_mlb['genre'].apply(lambda x: [g.strip() for g in x.split(',')])
    mlb = MultiLabelBinarizer()
    mlb.fit(df_for_mlb['genre']) # Fit mlb with the genre labels from the dataset
    print("MultiLabelBinarizer fitted from data.")
except FileNotFoundError:
    print("Warning: classified_data_set_netflix_and_imdb.csv not found. mlb might not be initialized correctly.")
    # Handle error or ensure mlb is available from a previous cell

tokenizer = BertTokenizer.from_pretrained(model_save_path)

# Load the trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model structure
if 'mlb' in locals(): # Check if mlb is defined
    num_labels = len(mlb.classes_)
else:
    print("Error: MultiLabelBinarizer not initialized. Cannot load model with correct num_labels.")
    exit()

model = BertForSequenceClassification.from_pretrained(model_save_path, num_labels=num_labels, problem_type="multi_label_classification")
model.to(device)

print(f"Model and tokenizer loaded from {model_save_path}")

# Convert the 'Prompt' column to a list
prompts_list = test["Prompt"].tolist()

inputs = tokenizer(prompts_list, return_tensors="pt", truncation=True, padding=True).to(device)

# Predict
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.sigmoid(logits).cpu().numpy() # probs will be shape (batch_size, num_labels)

# Thresholding and assigning predicted genres to the DataFrame
predicted_genres = []
for i in range(probs.shape[0]):
    predicted_indices = np.where(probs[i] > 0.4)[0]
    predicted_labels = mlb.classes_[predicted_indices]
    predicted_genres.append(list(predicted_labels))

test["Genre"] = predicted_genres

MultiLabelBinarizer fitted from data.
Model and tokenizer loaded from drive/MyDrive/ds_movie/bert_genre_classifier


In [None]:
# --- Audience Prediction ---

from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define the possible audience labels
audience_labels = [
    "This movie is for preschool children.",
    "This movie is for teenagers.",
    "This movie is for young adults.",
    "This movie is for adults.",
    "This movie is for seniors."
]

# Perform classification
result = classifier(prompts_list, candidate_labels=audience_labels)

test["Audience"] = result

Device set to use cpu


In [None]:
# Prediction function
import re
import nltk
from nltk.corpus import stopwords
import joblib

# Download stopwords if not already downloaded
try:
    nltk.data.find('corpora/stopwords')
except nltk.downloader.DownloadError:
    nltk.download('stopwords')
except LookupError:
    nltk.download('stopwords')


def preprocess(text):
        if isinstance(text, str):
            text = text.lower()
            text = re.sub(r'[^\w\s]', '', text)
            tokens = text.split()
            # Use the stopwords from nltk.corpus directly
            tokens = [w for w in tokens if w not in stopwords.words('english')]
            return " ".join(tokens)
        return ""

model_save_path_lr = "drive/MyDrive/ds_movie/logistic_regression_hit_predictor.joblib"
vectorizer_save_path_lr = "drive/MyDrive/ds_movie/tfidf_vectorizer_lr.joblib"
mlb_save_path_lr = "drive/MyDrive/ds_movie/mlb_lr.joblib"
stopwords_save_path_lr = "drive/MyDrive/ds_movie/stopwords_lr.joblib"

# Load model and preprocessors
vectorizer_lr = joblib.load(vectorizer_save_path_lr)
mlb_lr = joblib.load(mlb_save_path_lr)
lr_model = joblib.load(model_save_path_lr)

def predict_hit_lr(text, genre_labels):
    desc_proc = preprocess(text)
    desc_vec = vectorizer_lr.transform([desc_proc]).toarray()

    if not isinstance(genre_labels, list):
        genre_labels = [str(genre_labels)]
    # Filter out genres not seen during training
    known_genres = [g for g in genre_labels if g in mlb_lr.classes_]
    if not known_genres:
         # If no known genres are present, create a zero vector of the correct size
        genre_vec = np.zeros((1, len(mlb_lr.classes_)))
    else:
        # Create a temporary list of lists for mlb_lr.transform
        genre_vec = mlb_lr.transform([known_genres])


    combined_vec = np.hstack([desc_vec, genre_vec])
    pred = lr_model.predict(combined_vec)[0]
    prob = lr_model.predict_proba(combined_vec)[0][1]
    return "Hit!" if pred == 1 else "Flop!", round(prob * 100, 2)

# Predicting and storing results in the DataFrame
test["Hit or not"] = None
test["Confidence score"] = None

for index, row in test.iterrows():
    prompt = row["Prompt"]
    # Ensure 'Genre' column has been populated from the previous step
    if "Genre" in test.columns:
        predicted_genres = row["Genre"]
        result, confidence = predict_hit_lr(prompt, predicted_genres)
        test.loc[index, "Hit or not"] = result
        test.loc[index, "Confidence score"] = confidence
    else:
        print(f"Skipping row {index}: 'Genre' column not found. Please run the genre prediction cell first.")
        break # Stop if Genre column is not available

# Display the updated DataFrame with predictions
display(test)

test.to_excel("drive/MyDrive/ds_movie/test_run_results.xlsx", index=False)

Unnamed: 0,Prompt,Genre,Evaluation,Audience,Evaluation.1,Hit or not,Confidence score
0,A young woman discovers a letter hidden in a l...,"[Drama, Thriller]",,{'sequence': 'A young woman discovers a letter...,,Hit!,67.34
1,After a global blackout wipes all digital reco...,"[Action, Science Fiction, Thriller]",,{'sequence': 'After a global blackout wipes al...,,Flop!,23.79
2,A single father working as a janitor at a univ...,"[Drama, Romance]",,{'sequence': 'A single father working as a jan...,,Hit!,70.41
3,A crew of space miners responds to a distress ...,"[Horror, Science Fiction, Thriller]",,{'sequence': 'A crew of space miners responds ...,,Flop!,14.21
4,"In a sleepy coastal town, an aging lighthouse ...","[Action, Horror, Thriller]",,"{'sequence': 'In a sleepy coastal town, an agi...",,Flop!,15.0
...,...,...,...,...,...,...,...
95,After her fiancé disappears just days before t...,"[Horror, Thriller]",,{'sequence': 'After her fiancé disappears just...,,Flop!,26.11
96,A data analyst working for a dating app discov...,"[Drama, Romance]",,{'sequence': 'A data analyst working for a dat...,,Hit!,68.6
97,A reclusive painter living in a remote forest ...,"[Horror, Thriller]",,{'sequence': 'A reclusive painter living in a ...,,Flop!,22.2
98,A traveling circus in the 1930s hires a down-o...,[Comedy],,{'sequence': 'A traveling circus in the 1930s ...,,Flop!,30.07
