In [1]:
!pip install transformers torch pandas numpy scikit-learn

# Cell 2: Import libraries
import pandas as pd
import numpy as np
from transformers import pipeline
import torch
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
movies = pd.read_csv('movies_cleaned.csv')

In [5]:
print("Loading BART model...")
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1
)
print("Model loaded successfully!")

Loading BART model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Model loaded successfully!


In [7]:
candidate_genres = [
    "Action", "Adventure", "Animation", "Comedy", "Crime", "Drama",
    "Family", "Fantasy", "Horror", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western", "Biography", "Documentary", "History"
]

print(f"Will classify into {len(candidate_genres)} genres: {candidate_genres}")


Will classify into 18 genres: ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'Biography', 'Documentary', 'History']


In [8]:
def get_predicted_genres(overview_text, top_k=3):
    """
    Get top predicted genres for a movie overview

    Args:
        overview_text (str): Movie overview/plot
        top_k (int): Number of top genres to return

    Returns:
        tuple: (top_genres_list, top_scores_list)
    """
    if pd.isna(overview_text) or overview_text.strip() == "":
        return [], []

    try:
        # Get predictions
        result = classifier(str(overview_text), candidate_genres)

        # Return top k genres and their scores
        top_genres = result['labels'][:top_k]
        top_scores = [round(score, 3) for score in result['scores'][:top_k]]

        return top_genres, top_scores

    except Exception as e:
        print(f"Error processing text: {e}")
        return [], []


In [9]:
overview_column = 'Overview'  # Adjust this to match your CSV column name

if overview_column in movies.columns:
    print("Testing on first movie...")
    test_overview = movies[overview_column].iloc[0]
    test_genres, test_scores = get_predicted_genres(test_overview, top_k=5)

    print(f"Sample overview: {test_overview[:200]}...")
    print(f"Predicted genres: {test_genres}")
    print(f"Confidence scores: {test_scores}")
else:
    print(f"Column '{overview_column}' not found!")
    print("Available columns:", movies.columns.tolist())
    # You may need to adjust the column name above

Testing on first movie...
Sample overview: Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency....
Predicted genres: ['Action', 'Adventure', 'History', 'Documentary', 'Western']
Confidence scores: [0.224, 0.133, 0.093, 0.077, 0.073]


In [10]:
print("\nProcessing all movies...")

# Initialize lists to store results
predicted_genres_top1 = []
predicted_genres_top3 = []
prediction_scores = []

# Process each movie
for idx, row in movies.iterrows():
    if idx % 100 == 0:  # Progress indicator
        print(f"Processed {idx}/{len(movies)} movies...")

    overview = row[overview_column]
    top_genres, top_scores = get_predicted_genres(overview, top_k=3)

    # Store results
    if len(top_genres) > 0:
        predicted_genres_top1.append(top_genres[0])  # Best prediction
        predicted_genres_top3.append(", ".join(top_genres))  # Top 3 predictions
        prediction_scores.append(top_scores[0] if len(top_scores) > 0 else 0.0)  # Best score
    else:
        predicted_genres_top1.append("Unknown")
        predicted_genres_top3.append("Unknown")
        prediction_scores.append(0.0)

print("Processing complete!")


Processing all movies...
Processed 0/1000 movies...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 100/1000 movies...
Processed 200/1000 movies...
Processed 300/1000 movies...
Processed 400/1000 movies...
Processed 500/1000 movies...
Processed 600/1000 movies...
Processed 700/1000 movies...
Processed 800/1000 movies...
Processed 900/1000 movies...
Processing complete!


In [11]:
movies['Predicted_Genre_Top1'] = predicted_genres_top1
movies['Predicted_Genres_Top3'] = predicted_genres_top3
movies['Prediction_Confidence'] = prediction_scores

print("New columns added to DataFrame:")
print("- Predicted_Genre_Top1: Single best predicted genre")
print("- Predicted_Genres_Top3: Top 3 predicted genres (comma-separated)")
print("- Prediction_Confidence: Confidence score for the top prediction")

# Cell 9: Display results
print("\nSample results:")
display_columns = ['Title', 'Genre', 'Predicted_Genre_Top1', 'Predicted_Genres_Top3', 'Prediction_Confidence']
# Adjust column names if your CSV has different names
available_display_cols = [col for col in display_columns if col in movies.columns]

print(movies[available_display_cols].head(10))

New columns added to DataFrame:
- Predicted_Genre_Top1: Single best predicted genre
- Predicted_Genres_Top3: Top 3 predicted genres (comma-separated)
- Prediction_Confidence: Confidence score for the top prediction

Sample results:
                       Genre Predicted_Genre_Top1       Predicted_Genres_Top3  \
0                      Drama               Action  Action, Adventure, History   
1               Crime, Drama                Crime       Crime, Family, Action   
2       Action, Crime, Drama                Crime        Crime, Action, Drama   
3               Crime, Drama               Family      Family, Crime, History   
4               Crime, Drama               Action    Action, Crime, Adventure   
5   Action, Adventure, Drama            Adventure      Adventure, Action, War   
6               Crime, Drama                Crime        Crime, Family, Drama   
7  Biography, Drama, History                  War        War, Action, History   
8  Action, Adventure, Sci-Fi           

In [12]:
movies

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,tagged_description,Predicted_Genre_Top1,Predicted_Genres_Top3,Prediction_Confidence
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469,0 Two imprisoned men bond over a number of yea...,Action,"Action, Adventure, History",0.224
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411,1 An organized crime dynasty's aging patriarch...,Crime,"Crime, Family, Action",0.623
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444,2 When the menace known as the Joker wreaks ha...,Crime,"Crime, Action, Drama",0.157
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000,3 The early life and career of Vito Corleone i...,Family,"Family, Crime, History",0.368
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000,4 A jury holdout attempts to prevent a miscarr...,Action,"Action, Crime, Adventure",0.329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115 min,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,,995 A young New York socialite becomes interes...,Action,"Action, Adventure, Drama",0.121
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201 min,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,,996 Sprawling epic covering the life of a Texa...,Western,"Western, Family, Drama",0.381
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118 min,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000,"997 In Hawaii in 1941, a private is cruelly pu...",Drama,"Drama, Horror, History",0.115
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,,998 Several survivors of a torpedoed merchant ...,War,"War, History, Action",0.660


In [13]:
movies.to_csv('movies_with_predictions.csv', index=False)
print(f"\nResults saved to 'movies_with_predictions.csv'")
print(f"DataFrame shape: {movies.shape}")

# Cell 11: Quick analysis
print("\nQuick Analysis:")
print(f"Movies processed: {len(movies)}")
print(f"Movies with predictions: {sum(1 for x in predicted_genres_top1 if x != 'Unknown')}")


Results saved to 'movies_with_predictions.csv'
DataFrame shape: (1000, 20)

Quick Analysis:
Movies processed: 1000
Movies with predictions: 1000


In [14]:
from collections import Counter
genre_counts = Counter(predicted_genres_top1)
print("\nTop 10 most predicted genres:")
for genre, count in genre_counts.most_common(10):
    print(f"  {genre}: {count} movies")

# High confidence predictions
high_confidence = movies[movies['Prediction_Confidence'] > 0.8]
print(f"\nHigh confidence predictions (>0.8): {len(high_confidence)} movies")


Top 10 most predicted genres:
  Family: 174 movies
  Action: 153 movies
  Crime: 138 movies
  Adventure: 134 movies
  War: 74 movies
  Mystery: 54 movies
  Sci-Fi: 42 movies
  Drama: 42 movies
  History: 41 movies
  Western: 29 movies

High confidence predictions (>0.8): 13 movies


In [16]:
movies_predictions = pd.read_csv("movies_with_predictions.csv")
movies_emotions = pd.read_csv("movies_with_emotions.csv")

# Extract only the required columns from the emotions file
emotions_data = movies_emotions[['emotion', 'emotion_confidence']]

# Append the columns to the movies_predictions DataFrame
movies_final = pd.concat([movies_predictions, emotions_data], axis=1)

# Save the final DataFrame to a new CSV file
movies_final.to_csv("movies_final.csv", index=False)

print("✅ movies_final.csv has been created successfully!")

✅ movies_final.csv has been created successfully!


In [17]:
movies_final=pd.read_csv('movies_final.csv')

In [18]:
movies_final

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,...,Star3,Star4,No_of_Votes,Gross,tagged_description,Predicted_Genre_Top1,Predicted_Genres_Top3,Prediction_Confidence,emotion,emotion_confidence
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,...,Bob Gunton,William Sadler,2343110,28341469,0 Two imprisoned men bond over a number of yea...,Action,"Action, Adventure, History",0.224,sadness,0.328030
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,...,James Caan,Diane Keaton,1620367,134966411,1 An organized crime dynasty's aging patriarch...,Crime,"Crime, Family, Action",0.623,sadness,0.316358
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,...,Aaron Eckhart,Michael Caine,2303232,534858444,2 When the menace known as the Joker wreaks ha...,Crime,"Crime, Action, Drama",0.157,sadness,0.350182
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,...,Robert Duvall,Diane Keaton,1129952,57300000,3 The early life and career of Vito Corleone i...,Family,"Family, Crime, History",0.368,joy,0.299055
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,...,Martin Balsam,John Fiedler,689845,4360000,4 A jury holdout attempts to prevent a miscarr...,Action,"Action, Crime, Adventure",0.329,sadness,0.328986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115 min,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,...,Patricia Neal,Buddy Ebsen,166544,,995 A young New York socialite becomes interes...,Action,"Action, Adventure, Drama",0.121,sadness,0.319391
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201 min,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,...,James Dean,Carroll Baker,34075,,996 Sprawling epic covering the life of a Texa...,Western,"Western, Family, Drama",0.381,sadness,0.308605
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118 min,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,...,Deborah Kerr,Donna Reed,43374,30500000,"997 In Hawaii in 1941, a private is cruelly pu...",Drama,"Drama, Horror, History",0.115,sadness,0.316036
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,...,Walter Slezak,William Bendix,26471,,998 Several survivors of a torpedoed merchant ...,War,"War, History, Action",0.660,sadness,0.324485
