# In [None]:
import time  # Used to measure execution time of code segments
import pandas as pd  # Primary data structure library for data manipulation and analysis
import numpy as np  # Library for support to large, multi-dimensional arrays and matrices, along with a collection of mathematical functions to operate on these arrays
from sqlalchemy import create_engine  # Database toolkit for Python, provides a way to create a connection to the database
import re  # Library for regular expression operations, allows for text searching, matching, and manipulation
from scipy import stats  # Module in SciPy library for statistical functions

import nltk  # Natural Language Toolkit, library for symbolic and statistical natural language processing (NLP)
nltk.download(['punkt', 'wordnet', 'stopwords'])  # Downloads specific packages from NLTK for tokenization, lemmatization, and stopwords
from nltk.tokenize import word_tokenize  # Function for tokenizing strings (splitting strings into words and punctuation)
from nltk.corpus import stopwords  # Provides a list of 'stopwords' that can be filtered out from the text
from nltk.stem import WordNetLemmatizer  # Class for lemmatizing words (reducing them to their base or root form)

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # Functions and classes for splitting data, cross-validation, and hyperparameter tuning
from sklearn.pipeline import Pipeline  # Class for creating a pipeline of transforms with a final estimator
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer  # Classes for converting text to vector form and applying TF-IDF transformation
from sklearn.decomposition import TruncatedSVD  # Class for dimensionality reduction using truncated singular value decomposition (SVD)
from sklearn.multioutput import MultiOutputClassifier  # Strategy for multi-target classification
from sklearn.tree import DecisionTreeClassifier  # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier  # Random forest classifier
from sklearn.neighbors import KNeighborsClassifier  # K-nearest neighbors classifier
from sklearn.linear_model import LogisticRegression  # Logistic regression classifier
from sklearn.svm import SVC  # Support vector machine classifier
from sklearn.metrics import hamming_loss, precision_score, recall_score, f1_score  # Functions for calculating common classification metrics

import warnings
warnings.filterwarnings('ignore')  # Ignores warnings to clean up output for readability

from joblib import dump, load

# Rebuild the `genres` label list the same way Algorithm.py did, so that the
# label order used here lines up with what the serialized model expects.
engine = create_engine('sqlite:///dialogue.db')  # SQLite database file `dialogue.db`
df = pd.read_sql('dialogue', engine)  # Whole 'dialogue' table as a DataFrame

# Join every row's genre string with commas and split again on commas to get
# one flat list (rows are presumably comma-separated genre strings), then
# de-duplicate with a set and sort for a stable, alphabetical order.
genres = sorted(set(','.join(df['genres'].tolist()).split(',')))

# Custom tokenizer carried over from Algorithm.py. It has to be defined in this
# module under this exact name so joblib can resolve it while deserializing
# the trained model.
def tokenize(text):
    """Turn ``text`` into lemmatized, lower-cased tokens with stopwords removed.

    Every character outside ``[a-zA-Z0-9]`` is replaced by a space, the result
    is split with NLTK's ``word_tokenize``, tokens found in NLTK's English
    stopword list are dropped, and each remaining token is lemmatized,
    lower-cased and stripped of surrounding whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    # Build the stopword set once per call instead of once per token; set
    # membership is also O(1) versus scanning a list.
    english_stopwords = set(stopwords.words('english'))
    # NOTE: the stopword test runs on the token *before* lower-casing, so
    # capitalised stopwords such as "The" are kept. This is intentional here:
    # the output must stay identical to the tokenizer the saved model was
    # presumably trained with (verify against Algorithm.py before changing).
    return [
        lemmatizer.lemmatize(token).lower().strip()
        for token in tokens
        if token not in english_stopwords
    ]
    
# Load the trained model from the joblib file in the working directory.
# NOTE(review): `tokenize` must already be defined at this point; presumably
# the serialized pipeline references it by name (confirm against Algorithm.py).
# SECURITY: joblib's `load` is pickle-based, so loading a file can execute
# arbitrary code. Only load model files that come from a trusted source.
model = load('CinemAiModel.joblib')

# Run the loaded model on one piece of new text and report its genres.
def predict_genres(text):
    """Return the genres the trained model assigns to ``text``.

    Args:
        text: A single line of dialogue to classify.

    Returns:
        A list of genre names (taken from the module-level ``genres``) whose
        predicted value equals 1.
    """
    # The model is given a one-element batch; label the output columns by genre.
    frame = pd.DataFrame(model.predict([text]), columns=genres)
    # Only one sample was submitted, so the first row carries all the flags.
    flags = frame.iloc[0]
    # Keep the genre names whose flag is set.
    return flags[flags == 1].index.tolist()

# Smoke test: run two sample lines of dialogue through the classifier.
line1 = "If god did not exist it would be necessary to invent him."
# Adjacent literals are concatenated into one string with no embedded newline.
line2 = (
    "It's funny... the world is so different in the daylight. "
    "In the dark, your fantasies get so out of hand. "
    "But in the daylight everything falls back into place again."
)
print('Line 1: {}'.format(predict_genres(line1)))  # Genres predicted for line1
print('Line 2: {}'.format(predict_genres(line2)))  # Genres predicted for line2

# Interactive prompt: classify each line the user types until they enter
# 'exit' or stdin is closed.
line = ''  # Keeps `line` defined even if stdin is closed before any input
while True:
    try:
        line = input("Enter a line of text: ")
    except EOFError:
        # stdin ran out (piped input finished or Ctrl-D): stop without a traceback.
        break
    if line == 'exit':
        # 'exit' is the quit command, not dialogue, so it is not classified.
        break
    print('Genre: {}'.format(predict_genres(line)))
