# Filmception Pipeline

This notebook contains the complete end-to-end pipeline for the Filmception project, including:

1. Preprocessing movie summaries
2. Training and evaluating the genre prediction model
3. Translating summaries and generating TTS audio
4. Launching the interactive GUI


## 1. Setup and Imports

In [3]:
# Project folder: the later cells read and write their files relative to it.
# Set the FILMCEPTION_ROOT environment variable to run the notebook on another
# machine; the original author's path remains the default.
import os
PROJECT_ROOT = os.environ.get(
    'FILMCEPTION_ROOT',
    r'C:\Users\PMLS\Documents\6th Sem\AI Course 6th Sem\Project\another try',
)

# Every later cell uses bare file names, so make the project folder the working directory.
os.chdir(PROJECT_ROOT)

In [2]:
import os
import ast
import re
import threading
import tempfile
import pandas as pd
from collections import Counter

# NLTK for preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# scikit-learn for modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, multilabel_confusion_matrix
import joblib

# Translation & TTS
from deep_translator import GoogleTranslator
from gtts import gTTS

# GUI
import tkinter as tk
from tkinter import ttk, messagebox
from playsound3 import playsound

# Fetch the NLTK corpora needed by the cleaning step
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus)

# Shared preprocessing resources, built once and reused by every cell below
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PMLS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PMLS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Preprocessing Function

In [8]:
def preprocess():
    """Build ``cleaned_data.csv`` from the raw summary and metadata files.

    Reads ``plot_summaries.txt`` and ``movie.metadata.tsv`` from the current
    working directory, inner-joins them on ``movie_id`` and adds a
    ``clean_summary`` column: lower-cased text with every character outside
    ``a-z0-9`` and whitespace replaced by a space, stopwords dropped and the
    remaining tokens lemmatized. The merged frame is written without its index.
    """
    print("[1/4] Preprocessing...")

    def clean_text(text):
        """Normalise one summary into a space-joined string of lemmas."""
        normalised = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
        lemmas = [LEMMATIZER.lemmatize(word) for word in normalised.split() if word not in STOPWORDS]
        return " ".join(lemmas)

    def genre_names(raw):
        """Parse the dict literal stored in the genres column and keep its values."""
        return list(ast.literal_eval(raw).values())

    # Load raw data; movie ids are kept as text in both frames so the join keys match
    id_as_text = {'movie_id':str}
    meta_columns = ['movie_id','mid','title','release','budget','length','language','country','genres']
    summaries = pd.read_csv('plot_summaries.txt', sep='\t', names=['movie_id','summary'], dtype=id_as_text)
    meta = pd.read_csv('movie.metadata.tsv', sep='\t', names=meta_columns, dtype=id_as_text)

    # Parse and merge
    meta['genres'] = meta['genres'].apply(genre_names)
    id_and_genres = meta[['movie_id','genres']]
    df = summaries.merge(id_and_genres, on='movie_id', how='inner')

    # Cleaning
    print(f"Cleaning {len(df)} summaries...")
    tqdm.pandas(desc="Cleaning")
    df['clean_summary'] = df['summary'].progress_map(clean_text)
    df.to_csv('cleaned_data.csv', index=False)
    print("✅ Preprocessing complete.\n")

# Run preprocessing: reads plot_summaries.txt and movie.metadata.tsv from the
# working directory and writes cleaned_data.csv.
preprocess()


[1/4] Preprocessing...
Cleaning 42204 summaries...


Cleaning: 100%|██████████| 42204/42204 [00:23<00:00, 1833.29it/s]


✅ Preprocessing complete.



## 3. Training and Evaluation Function

In [9]:
def train_and_evaluate():
    """Train a one-vs-rest genre classifier and print train/test metrics.

    Reads ``cleaned_data.csv``, removes single-character labels and labels
    attached to every row, splits the data 80/20 (``random_state=42``), fits a
    TF-IDF (10,000 features) + logistic-regression one-vs-rest model, prints
    accuracy, a per-label classification report for both splits and per-label
    confusion counts for the test split, then saves the fitted vectorizer,
    classifier and label binarizer to ``tfidf.pkl``, ``model.pkl`` and
    ``mlb.pkl`` in the working directory.
    """
    print("[2/4] Training & Evaluation...")
    df = pd.read_csv('cleaned_data.csv')
    df['genres'] = df['genres'].apply(ast.literal_eval)
    # A summary that cleaned down to nothing is stored as an empty CSV field and
    # comes back as NaN, which TfidfVectorizer refuses; treat it as empty text.
    df['clean_summary'] = df['clean_summary'].fillna('')

    # Remove spurious and constant labels
    counts = Counter(g for sub in df['genres'] for g in sub)
    n = len(df)
    spurious = {g for g in counts if len(g) == 1}
    constant = {g for g, c in counts.items() if c == n}
    if spurious:
        print(f"Removing spurious labels: {spurious}")
    if constant:
        print(f"Removing constant labels: {constant}")
    unwanted = spurious | constant
    if unwanted:
        df['genres'] = df['genres'].apply(lambda gs: [g for g in gs if g not in unwanted])

    X = df['clean_summary']
    y_raw = df['genres']
    X_train, X_test, y_train_raw, y_test_raw = train_test_split(X, y_raw, test_size=0.2, random_state=42)

    # Vectorizer and label binarizer are fitted on the training split only
    tfidf = TfidfVectorizer(max_features=10000)
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    mlb = MultiLabelBinarizer()
    y_train = mlb.fit_transform(y_train_raw)
    y_test = mlb.transform(y_test_raw)

    clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    clf.fit(X_train_tfidf, y_train)

    def report(title, features, y_true):
        """Print accuracy and the per-label report for one split; return the predictions."""
        print(f"--- {title} ---")
        y_pred = clf.predict(features)
        # For multilabel targets accuracy_score is exact-match accuracy: a row
        # counts only when its whole label set is predicted correctly.
        print("Accuracy:", accuracy_score(y_true, y_pred))
        # zero_division=0 keeps the reported 0.00 for labels that are never
        # predicted, without emitting one warning per metric.
        print(classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0))
        return y_pred

    report("Training Set", X_train_tfidf, y_train)
    y_te_pred = report("Test Set", X_test_tfidf, y_test)

    # One 2x2 matrix per label, laid out as [[TN, FP], [FN, TP]]
    cms = multilabel_confusion_matrix(y_test, y_te_pred)
    print("\n--- Confusion Matrix ---")
    for label, cm in zip(mlb.classes_, cms):
        tn, fp, fn, tp = cm.ravel()
        print(f"{label}: TP={tp}, FP={fp}, FN={fn}, TN={tn}")

    joblib.dump(tfidf, 'tfidf.pkl')
    joblib.dump(clf, 'model.pkl')
    joblib.dump(mlb, 'mlb.pkl')
    print("✅ Training and evaluation complete.\n")

# Run training & evaluation: reads cleaned_data.csv and writes tfidf.pkl,
# model.pkl and mlb.pkl to the working directory.
train_and_evaluate()


[2/4] Training & Evaluation...




--- Training Set ---
Accuracy: 0.09978378698575363


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                          precision    recall  f1-score   support

                               Absurdism       0.00      0.00      0.00        67
                            Acid western       0.00      0.00      0.00         6
                                  Action       0.79      0.33      0.47      4704
                           Action Comedy       0.00      0.00      0.00       117
                        Action Thrillers       0.00      0.00      0.00       322
                        Action/Adventure       0.83      0.18      0.29      2858
                         Addiction Drama       0.00      0.00      0.00        35
                                   Adult       0.00      0.00      0.00        90
                               Adventure       0.85      0.20      0.32      2588
                        Adventure Comedy       0.00      0.00      0.00       101
                  Airplanes and airports       0.00      0.00      0.00        44
               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ Training and evaluation complete.



## 4. Translation and TTS

In [10]:
def translate_and_tts(n=50):
    """Translate the first ``n`` summaries and save one mp3 per language.

    Reads ``cleaned_data.csv`` and, for each of the first ``n`` rows, translates
    the summary into Arabic, Urdu and Korean and writes the spoken result to
    ``audio/<movie_id>_<lang code>.mp3``. A failure for one (movie, language)
    pair is printed and does not stop the remaining work.

    Args:
        n: Number of leading rows to process.
    """
    print(f"[3/4] Translating and saving TTS for first {n} summaries...")
    # Read ids as text, the same way preprocess() reads them, so file names keep the id verbatim
    df = pd.read_csv('cleaned_data.csv', dtype={'movie_id': str})
    # Speak the original prose: clean_summary has stopwords removed and words
    # lemmatized, so it is model input rather than readable text. Fall back to
    # it only if the file has no 'summary' column.
    text_col = 'summary' if 'summary' in df.columns else 'clean_summary'
    subset = df.iloc[:n]
    languages = [('arabic','ar'), ('urdu','ur'), ('korean','ko')]
    os.makedirs('audio', exist_ok=True)
    for _, row in subset.iterrows():
        cid, text = row['movie_id'], row[text_col]
        # Empty CSV fields are read back as NaN (a float); nothing to translate then
        if not isinstance(text, str) or not text.strip():
            print(f"Skipping {cid}: no text to translate")
            continue
        for target, code in languages:
            try:
                trans = GoogleTranslator(source='auto', target=target).translate(text)
                tts = gTTS(text=trans, lang=code)
                path = f"audio/{cid}_{code}.mp3"
                tts.save(path)
                print(f"Saved {target} audio: {path}")
            except Exception as e:
                # Best effort: translation/TTS errors are reported per item and the
                # loop moves on. NOTE(review): very long summaries may be rejected
                # by the translator — confirm its input length limit.
                print(f"Error {cid}-{target}: {e}")
    print("✅ Translation & TTS complete.\n")

# Run translation & TTS with the default n=50: every processed summary is
# translated and synthesised once per target language (three languages), so
# this cell makes many network requests.
translate_and_tts()


[3/4] Translating and saving TTS for first 50 summaries...
Saved arabic audio: audio/23890098_ar.mp3
Saved urdu audio: audio/23890098_ur.mp3
Saved korean audio: audio/23890098_ko.mp3
Saved arabic audio: audio/31186339_ar.mp3
Saved urdu audio: audio/31186339_ur.mp3
Saved korean audio: audio/31186339_ko.mp3


KeyboardInterrupt: 

## 5. Launching the GUI

In [4]:
def launch_gui():
    """Open the Filmception window and run it until it is closed.

    Loads the fitted vectorizer, classifier and label binarizer from
    ``tfidf.pkl``, ``model.pkl`` and ``mlb.pkl`` in the working directory.
    The window offers two actions on the typed summary:

    * "Convert to Audio": translate the text into the selected language
      (skipped for English), synthesise it and play it on a background thread.
    * "Predict Genre": clean the text the same way the training data was
      cleaned and show the predicted genres.

    The created window is also published as the module-level ``app``.
    """
    print("[4/4] Launching GUI...")
    # Load artifacts.
    # NOTE: joblib.load unpickles its input; only load artifact files you created yourself.
    tfidf = joblib.load('tfidf.pkl')
    clf = joblib.load('model.pkl')
    mlb = joblib.load('mlb.pkl')

    code_map = {'English':'en','Arabic':'ar','Urdu':'ur','Korean':'ko'}

    def _clean(text):
        """Apply the same normalisation used for the training summaries."""
        tokens = re.sub(r'[^a-z0-9\s]', ' ', text.lower()).split()
        return " ".join(LEMMATIZER.lemmatize(tok) for tok in tokens if tok not in STOPWORDS)

    def _save_temp_tts(text, lang):
        """Synthesise ``text`` into a temporary mp3 and return its path."""
        tts = gTTS(text=text, lang=lang)
        # Reserve a file name, then release our handle before gTTS writes to it
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp:
            path = tmp.name
        tts.save(path)
        return path

    class App(tk.Tk):
        def __init__(self):
            super().__init__()
            self.title('Filmception')
            self.geometry('600x500')
            ttk.Label(self, text='Enter Movie Summary:').pack(pady=5)
            self.txt = tk.Text(self, height=10, wrap=tk.WORD)
            self.txt.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
            frm = ttk.Frame(self)
            frm.pack(pady=5)
            ttk.Label(frm, text='Language:').grid(row=0, column=0)
            cmb = ttk.Combobox(frm, values=['English','Arabic','Urdu','Korean'], state='readonly')
            cmb.current(0)
            cmb.grid(row=0, column=1)
            btnf = ttk.Frame(self)
            btnf.pack(pady=10)
            ttk.Button(btnf, text='Convert to Audio', command=lambda: self._on_audio(cmb.get())).grid(row=0, column=0, padx=5)
            ttk.Button(btnf, text='Predict Genre', command=self._on_genre).grid(row=0, column=1, padx=5)

        def _summary(self):
            """Return the stripped text box content, warning the user when it is empty."""
            txt = self.txt.get('1.0', tk.END).strip()
            if not txt:
                messagebox.showwarning('Input Required', 'Enter summary')
            return txt

        def _on_audio(self, lang_name):
            txt = self._summary()
            if not txt:
                return
            code = code_map[lang_name]
            # Translation, synthesis and playback all block, so keep them off the UI thread
            threading.Thread(target=self._speak, args=(txt, code), daemon=True).start()

        def _speak(self, text, code):
            """Worker-thread body: translate if needed, synthesise, play, clean up."""
            path = None
            try:
                if code != 'en':
                    # Without this step the untranslated input would be read by a foreign-language voice
                    text = GoogleTranslator(source='auto', target=code).translate(text)
                path = _save_temp_tts(text, code)
                # NOTE(review): relies on playsound blocking until playback ends,
                # so the file can be removed right after — confirm for the installed version.
                playsound(path)
            except Exception as e:
                # Show the failure from the Tk event loop instead of losing it in the thread
                self.after(0, lambda msg=str(e): messagebox.showerror('Audio Error', msg))
            finally:
                if path and os.path.exists(path):
                    try:
                        os.remove(path)
                    except OSError:
                        pass  # best-effort cleanup; a leftover temp file is harmless

        def _on_genre(self):
            txt = self._summary()
            if not txt:
                return
            vec = tfidf.transform([_clean(txt)])
            pred = clf.predict(vec)[0]
            genres = [g for g, f in zip(mlb.classes_, pred) if f]
            messagebox.showinfo('Predicted Genres', ', '.join(genres) or 'None')

    # The handlers no longer need this global; it is kept so the window is
    # still reachable as `app` after the call, as before.
    global app
    app = App()
    app.mainloop()

# Launch the GUI. The call runs the Tk main loop, so this cell stays busy until
# the window is closed. It loads tfidf.pkl, model.pkl and mlb.pkl from the
# working directory.
launch_gui()


[4/4] Launching GUI...
