# In [None]:
import time  # Used to measure execution time of code segments
import pandas as pd  # Primary data structure library for data manipulation and analysis
import numpy as np  # Library for support to large, multi-dimensional arrays and matrices, along with a collection of mathematical functions to operate on these arrays
from sqlalchemy import create_engine  # Database toolkit for Python, provides a way to create a connection to the database
import re  # Library for regular expression operations, allows for text searching, matching, and manipulation
from scipy import stats  # Module in SciPy library for statistical functions

import nltk  # Natural Language Toolkit, library for symbolic and statistical natural language processing (NLP)
nltk.download(['punkt', 'wordnet', 'stopwords'])  # Downloads specific packages from NLTK for tokenization, lemmatization, and stopwords
from nltk.tokenize import word_tokenize  # Function for tokenizing strings (splitting strings into words and punctuation)
from nltk.corpus import stopwords  # Provides a list of 'stopwords' that can be filtered out from the text
from nltk.stem import WordNetLemmatizer  # Class for lemmatizing words (reducing them to their base or root form)

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # Functions and classes for splitting data, cross-validation, and hyperparameter tuning
from sklearn.pipeline import Pipeline  # Class for creating a pipeline of transforms with a final estimator
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer  # Classes for converting text to vector form and applying TF-IDF transformation
from sklearn.decomposition import TruncatedSVD  # Class for dimensionality reduction using truncated singular value decomposition (SVD)
from sklearn.multioutput import MultiOutputClassifier  # Strategy for multi-target classification
from sklearn.tree import DecisionTreeClassifier  # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier  # Random forest classifier
from sklearn.neighbors import KNeighborsClassifier  # K-nearest neighbors classifier
from sklearn.linear_model import LogisticRegression  # Logistic regression classifier
from sklearn.svm import SVC  # Support vector machine classifier
from sklearn.metrics import hamming_loss, precision_score, recall_score, f1_score  # Functions for calculating common classification metrics

import warnings
warnings.filterwarnings('ignore')  # Ignores warnings to clean up output for readability

# Read the complete `dialogue` table from the local SQLite file into a DataFrame.
engine = create_engine('sqlite:///dialogue.db')
df = pd.read_sql('dialogue', con=engine)
print('Total # of exchanges: {}'.format(len(df)))

# Parse the comma-separated `genres` string of every row into a set of names.
# Names are stripped and empty entries dropped so that stray whitespace or an
# empty `genres` value cannot turn into a bogus label.
genre_sets = df['genres'].apply(
    lambda raw: {name.strip() for name in raw.split(',')} - {''}
)

# Sorted vocabulary of every distinct genre name seen in the data.
genres = sorted({name for names in genre_sets for name in names})
print('Count of unique genres: {}'.format(len(genres)))

# One 0/1 indicator column per genre. Membership is tested against the parsed
# names rather than the raw string: a substring test on the raw string would
# also flag any genre whose name is contained in another one (for instance
# 'music' inside 'musical').
for genre in genres:
    df[genre] = [int(genre in names) for names in genre_sets]

# How many genre labels each row (exchange) carries.
df['label_count'] = df.loc[:, genres].sum(axis='columns')

# Number of distinct movies observed for every label count.
label_counts = (
    df.groupby('label_count')['movie_id']
    .nunique()
    .reset_index()
)
label_counts

# Long format: one row per (original row, genre) pair holding its 0/1 label,
# then the total number of positive labels per genre.
df_melt = df.melt(
    id_vars='movie_id',
    value_vars=genres,
    var_name='genre',
    value_name='label',
)
genre_counts = (
    df_melt.groupby('genre')['label']
    .sum()
    .reset_index()
)
genre_counts.head()


# Model input is the raw dialogue text; targets are the per-genre 0/1 columns.
X, y = df['dialogue'], df[genres]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Resources shared by every `tokenize` call, built lazily on first use so the
# stop-word corpus is read once instead of once per token.
_TOKENIZE_RESOURCES = {}


def _tokenize_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _TOKENIZE_RESOURCES:
        _TOKENIZE_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TOKENIZE_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TOKENIZE_RESOURCES['stop_words'], _TOKENIZE_RESOURCES['lemmatizer']


def tokenize(text):
    """Turn raw text into a list of cleaned tokens for the vectorizer.

    Every character outside ``a-zA-Z0-9`` is replaced by a space, the result
    is word-tokenized, and each token is lower-cased, stripped, dropped if it
    is an English stop word, and finally lemmatized.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings. A list (not a generator) is returned on
        purpose: a generator has no length and can be consumed only once,
        which breaks ``CountVectorizer`` when it builds n-grams for
        ``ngram_range=(1, 2)``.
    """
    stop_words, lemmatizer = _tokenize_resources()
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

    clean_tokens = []
    for token in word_tokenize(text):
        # Lower-case BEFORE the stop-word test: the comparison is
        # case-sensitive, so capitalised stop words would otherwise survive.
        token = token.lower().strip()
        if token and token not in stop_words:
            clean_tokens.append(lemmatizer.lemmatize(token))
    return clean_tokens

start_time = time.time()

# Processing chain: token counts -> TF-IDF weighting -> truncated SVD ->
# one decision tree per genre column.
# NOTE(review): TruncatedSVD() is left at its library default number of
# components, which is very small for text features -- consider tuning it.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD()),
    ('clf', MultiOutputClassifier(DecisionTreeClassifier())),
])

# Hyper-parameter grid explored by the search.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'clf__estimator__max_depth': [250, 500, 1000],
    # min_samples_split must be an int >= 2 (or a fraction in (0, 1]).
    # The former value 1 is rejected by DecisionTreeClassifier, so every
    # candidate using it failed to fit -- silently, because warnings are
    # suppressed in this file.
    'clf__estimator__min_samples_split': [2, 4, 6],
}

# 4-fold cross-validated grid search scored by weighted F1.
# The search is fitted on the full data set, so `best_score_` (the mean
# cross-validated score) is the performance estimate reported below.
cv = GridSearchCV(
    pipeline,
    param_grid=parameters,
    scoring='f1_weighted',
    cv=4,
    n_jobs=1,
    verbose=10,
)
cv.fit(X, y)

print('GridSearch complete.')
print('Best params:')
print(cv.best_params_)
print('Best score:')
print(cv.best_score_)
print('Time elapsed:')
# Format as HH:MM:SS by hand: time.gmtime() wraps around after 24 hours,
# which would under-report a long search.
elapsed_seconds = int(time.time() - start_time)
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)
print('{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds))

# Apply the fitted grid-search model to a single new piece of text.
def predict_genres(text):
    """Return the names of the genres the fitted model predicts for *text*.

    Args:
        text: A single string of dialogue.

    Returns:
        A list with the genre names whose predicted label equals 1, in the
        order of the module-level ``genres`` list.
    """
    # predict() expects a collection of documents; take the only output row
    # and label each entry with its genre name.
    flags = pd.Series(cv.predict([text])[0], index=genres)
    return flags[flags == 1].index.tolist()

# Two fixed sample lines as a quick smoke test of the fitted model.
line1 = "If god did not exist it would be necessary to invent him."
line2 = ("It's funny... the world is so different in the daylight. "
         "In the dark, your fantasies get so out of hand. "
         "But in the daylight everything falls back into place again.")
print('Line 1: {}'.format(predict_genres(line1)))
print('Line 2: {}'.format(predict_genres(line2)))

# Interactive loop: classify each entered line until the user types 'exit'.
line = ''
while True:
    try:
        line = input("Enter a line of text: ")
    except EOFError:
        # stdin was closed (e.g. piped input ran out): stop instead of crashing.
        break
    # Check the sentinel BEFORE predicting so that 'exit' itself is not
    # classified and printed as if it were dialogue.
    if line.strip() == 'exit':
        break
    print('Genre: {}'.format(predict_genres(line)))
