# The Product : PlotMatch
PlotMatch revolutionizes the way screenwriters and film studios navigate cinematic ideas, ensuring your next movie pitch is both original and captivating.

Utilizing machine learning technology, it compares new pitches against an extensive database of existing movies, identifying similarities to avoid cliche stories.

This tool not only streamlines the creative process but also safeguards against inadvertent plagiarism, giving creators peace of mind.

Embrace the future of storytelling, where technology meets originality.

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Install non-default packages, will need to rerun every time you start the notebook
from datetime import datetime

print("Started packages load   :", datetime.now())
!pip install -q -U spacy
!pip install -q -U cinemagoer
!pip install -q -U sentence-transformers
!pip install -q -U rank_bm25
!python -m spacy download en_core_web_md > /dev/null 2>&1
print("Completed packages load :", datetime.now())

Started packages load   : 2024-04-13 21:36:57.223084
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.2/297.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCompleted packages load : 2024-04-13 21:39:20.430732


In [3]:
# Import libraries
print("Started libraries load   :", datetime.now())
import pandas as pd
import numpy as np
import torch
import spacy
import ast
import warnings
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import average_precision_score, ndcg_score
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, spearmanr
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
from collections import Counter
from IPython.display import Image, display
from imdb import Cinemagoer
from sentence_transformers import SentenceTransformer

print("Completed libraries load :", datetime.now())

Started libraries load   : 2024-04-13 21:39:20.448193
Completed libraries load : 2024-04-13 21:39:41.218843


In [4]:
# Runtime classes and settings
print("Applying settings :", datetime.now())
warnings.filterwarnings('ignore')
nlp       = spacy.load("en_core_web_md")
device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
knn_model = NearestNeighbors(n_neighbors=10, metric='cosine')
ia        = Cinemagoer()
pd.set_option('display.max_columns', None)

# Select sentence transformer
encoder_model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')

# Load a text classification pipeline
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
vectorizer = TfidfVectorizer()

print("Settings applied  :", datetime.now())

Applying settings : 2024-04-13 21:39:41.237614


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Settings applied  : 2024-04-13 21:40:41.528916


### Helper Code
This will be used later in the notebook.

In [5]:
def plotmatch_jupyter_display(results):
  # Review the top 10 results
  for movie_rank in range(0,10):
    # plot_match = results.iloc[movie_rank]['short_synopsis']

    # print("Matched Result  :"+"\n"+plot_match)
    print("\n"+"Movie Details   :", "Rank:", str(movie_rank+1))
    title = results.iloc[movie_rank]['title']
    image_finder(title)

# Function to extract unique nouns and named entities
def extract_nouns_and_entities(text):
    doc = nlp(text)
    items = set()
    for token in doc:
        if token.pos_ in ['NOUN', 'PROPN']:
            items.add(token.text)
    return items

def extract_entities(text):
    doc = nlp(text)
    characters = list({ent.text for ent in doc.ents if ent.label_ in ("PERSON","ORG")})
    locations = list({ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")})
    temporal_settings = list({ent.text for ent in doc.ents if ent.label_ in ("DATE", "TIME","EVENT")})
    items = list({ent.text for ent in doc.ents if ent.label_ in ("FAC","PRODUCT")})

    return {
        "characters": characters,
        "locations": locations,
        "temporal_settings": temporal_settings,
        "items": items}

def classify_genres(plot, genres, score_threshold=0.5):
  """
  Apply zero-shot classifier against the pre-determined list of film genres.
  Returns the top 5 genres that are determined by the plot.
  """
  result = classifier(plot, genres, multi_label=True)
  predicted_genres = result['labels'][:5] + [None] * (5 - len(result['labels'][:5]))
  return predicted_genres

def calculate_score(tags, genres):
    # Count how many of the genres are in the input tags
    score = 0
    if genres[0] in tags:
      score += 8
    if genres[1] in tags:
      score += 5
    if genres[2] in tags:
      score += 3
    if genres[3] in tags:
      score += 2
    if genres[4] in tags:
      score += 1
    return score

# Find movie image
def image_finder(input_name):
    movies = ia.search_movie(input_name)
    title = movies[0]['title']
    cover_url = movies[0]['full-size cover url']
    year = movies[0]['year']
    movie_id = movies[0].movieID
    print(movie_id, title, year)
    display(Image(url=cover_url, width=420))

# Functions for entities cosine scoring
def entity_dict_list(entities_dict):
  return list(np.concatenate(list(entities_dict.values())))

def flatten(matrix):
  return [item for row in matrix for item in row]

def flatten_uneven(uneven_list):
  final = []
  for elm in uneven_list:
    if type(elm) == list:
      final.extend(elm)
    else:
      final.append(elm)
  return final

def padding(entities_list):
  n = max_len-len(entities_list)
  zeros = [0]*n
  entities_list.extend(zeros)
  return entities_list

def padding_separate(entities_list):
  n = 1024-len(entities_list)
  if n < 0:
    entities_list = entities_list[:1024]
  else:
    zeros = [0]*n
    entities_list.extend(zeros)
  return entities_list

def flatten_entities(s):
  s = str(s)
  str_list = re.findall( r'\d+\.*\d*',s)
  flattened = []
  flattened = [float(elm) for elm in str_list]
  return flattened

def combine_cosine(df):
  for index, row in df.iterrows():
    imdb = row['test_imdb_id']
    pred_imdb = []

    # get list of predicted movies
    for i in range(1, 11):
      pred_imdb.append(row[f'imdb_{i}'])

    # character cosine values
    if imdb in character_dict:
      char_cosine = character_dict.get(imdb)

    for pred in char_cosine:
      if pred[0] in pred_imdb:
        col_name = f'cos_sim_{pred_imdb.index(pred[0]) + 1}'
        df.at[index, col_name] = df.at[index, col_name] + pred[1]

    # location cosine values
    if imdb in locations_dict:
      loc_cosine = locations_dict.get(imdb)

    for pred in loc_cosine:
      if pred[0] in pred_imdb:
        col_name = f'cos_sim_{pred_imdb.index(pred[0]) + 1}'
        df.at[index, col_name] = df.at[index, col_name] + pred[1]

    # temporal settings cosine values
    if imdb in temporal_settings_dict:
      temp_cosine = temporal_settings_dict.get(imdb)

    for pred in temp_cosine:
      if pred[0] in pred_imdb:
        col_name = f'cos_sim_{pred_imdb.index(pred[0]) + 1}'
        df.at[index, col_name] = df.at[index, col_name] + pred[1]

    # items cosine values
    if imdb in items_dict:
      items_cosine = items_dict.get(imdb)

    for pred in items_cosine:
      if pred[0] in pred_imdb:
        col_name = f'cos_sim_{pred_imdb.index(pred[0]) + 1}'
        df.at[index, col_name] = df.at[index, col_name] + pred[1]

    # genre cosine values
    if imdb in genre_dict:
      genre_cosine = genre_dict.get(imdb)

    for pred in genre_cosine:
      if pred[0] in pred_imdb:
        col_name = f'cos_sim_{pred_imdb.index(pred[0]) + 1}'
        df.at[index, col_name] = df.at[index, col_name] + pred[1]
  return df

In [6]:
# Predefined genres classification
genres = ['Action', 'Adventure', 'Animation', 'Biographical', 'Comedy',
          'Cooking', 'Crime', 'Cult', 'Cyberpunk', 'Dance', 'Detective',
          'Documentary', 'Drama', 'Dystopian', 'Epic', 'Erotic', 'Espionage',
          'Experimental', 'Fairy Tale', 'Fantasy', 'Heist', 'Historical',
          'Horror', 'Independent', 'Martial Arts', 'Mockumentary', 'Monster',
          'Musical', 'Mystery', 'Mythology', 'Noir', 'Paranormal', 'Police',
          'Political', 'Post-Apocalyptic', 'Road Movie', 'Romance', 'Satire',
          'Sci-Fi', 'Silent', 'Space Opera', 'Steampunk', 'Superhero',
          'Supernatural', 'Thriller', 'Time Travel', 'Tragedy', 'War',
          'Western', 'Zombie']

## Step 1 : Create Database Tower
Ingest MPST (Movie Plot Synopses with Tags) data sourced from the following location contains 34,886 movies from around the world with descriptions in English.
https://www.aclweb.org/anthology/L18-1274





In [7]:
# Reading in the pre-processed MPST dataset which contains preprocessed encoding, summaries, and genre classifications
print("Import commenced :", datetime.now())

# Import MPST dataset from Google Drive
df_raw = pd.read_csv(r'/content/drive/MyDrive/mpst.csv')

df_raw['genre_classification'] = df_raw['genre_classification'].astype(str).str.lower()
print("Import completed :", datetime.now())

Import commenced : 2024-04-13 21:40:41.689365
Import completed : 2024-04-13 21:40:50.972661


In [8]:
# Example encoding, should return an array, our movie has now been converted into a vector
plot_test = '''A haunting portrayal of Oskar Schindler, a German businessman who, amidst the horrors of the Holocaust, transforms from a profit-driven opportunist to a righteous savior. Set in Kraków during World War II, the film meticulously chronicles Schindler's courageous efforts to save over a thousand Jews from Nazi persecution by employing them in his factory. Amidst the brutality and despair of the Holocaust, Spielberg masterfully captures moments of humanity and resilience, painting a powerful narrative of hope amidst unimaginable darkness. As Schindler risks everything to shield his workers from the atrocities of the Holocaust, "Schindler's List" stands as a poignant testament to the indomitable human spirit and the enduring power of compassion in the face of unspeakable evil.'''
plot_test_tag = "Schindler's List"
plot_test_embedding = encoder_model.encode([plot_test])
print(plot_test_embedding[0])

# Example NER, should return a dictionary of entities
entities_df = extract_entities(plot_test)
entities_df

[-0.02158558  0.04315173 -0.00485975 ... -0.02121012  0.02114853
  0.03595661]


{'characters': ['Oskar Schindler', 'Spielberg', 'Schindler'],
 'locations': ['Kraków'],
 'temporal_settings': ['Holocaust', 'World War II'],
 'items': []}

In [9]:
# Create database tower
print("Database tower build commenced :", datetime.now())
df_r = df_raw.copy()
df_r['plot_synposis_encoded'] = df_r['plot_synposis_encoded'].apply(lambda x: [float(val) for val in x.strip('[]').split()])
knn_model.fit(df_r['plot_synposis_encoded'].tolist())
print("Database tower build completed :", datetime.now())

Database tower build commenced : 2024-04-13 21:40:54.448451
Database tower build completed : 2024-04-13 21:41:00.838496


In [10]:
# Calculate cosine similarity by entity category

ner_dfs = []
cos_sim_cols = []

# Characters
if len(entities_df['characters']) > 0:
  characters_embedding = encoder_model.encode(entities_df['characters'])
  distances, indices = knn_model.kneighbors(characters_embedding)
  char_nn_df = df_raw.iloc[indices.flatten()][['imdb_id','title','imdb_rating','short_synopsis']].reset_index(drop=True)
  tfidf_matrix = vectorizer.fit_transform(char_nn_df['short_synopsis'].values.tolist() + [plot_test])
  char_cos_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
  char_nn_df['char_cos_sim'] = char_cos_sim[0]
  ner_dfs.append(char_nn_df)
  cos_sim_cols.append('char_cos_sim')

# Location
if len(entities_df['locations']) > 0:
  loc_embedding = encoder_model.encode(entities_df['locations'])
  distances, indices = knn_model.kneighbors(loc_embedding)
  loc_nn_df = df_raw.iloc[indices.flatten()][['imdb_id','title','imdb_rating','short_synopsis']].reset_index(drop=True)
  tfidf_matrix = vectorizer.fit_transform(loc_nn_df['short_synopsis'].values.tolist() + [plot_test])
  loc_cos_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
  loc_nn_df['loc_cos_sim'] = loc_cos_sim[0]
  ner_dfs.append(loc_nn_df)
  cos_sim_cols.append('loc_cos_sim')

# Temporal Settings
if len(entities_df['temporal_settings']) > 0:
  temp_embedding = encoder_model.encode(entities_df['temporal_settings'])
  distances, indices = knn_model.kneighbors(temp_embedding)
  temp_nn_df = df_raw.iloc[indices.flatten()][['imdb_id','title','imdb_rating','short_synopsis']].reset_index(drop=True)
  tfidf_matrix = vectorizer.fit_transform(temp_nn_df['short_synopsis'].values.tolist() + [plot_test])
  temp_cos_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
  temp_nn_df['temp_cos_sim'] = temp_cos_sim[0]
  ner_dfs.append(temp_nn_df)
  cos_sim_cols.append('temp_cos_sim')

# Items
if len(entities_df['items']) > 0:
  items_embedding = encoder_model.encode(entities_df['items'])
  distances, indices = knn_model.kneighbors(items_embedding)
  items_nn_df = df_raw.iloc[indices.flatten()][['imdb_id','title','imdb_rating','short_synopsis']].reset_index(drop=True)
  tfidf_matrix = vectorizer.fit_transform(items_nn_df['short_synopsis'].values.tolist() + [plot_test])
  items_cos_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
  items_nn_df['items_cos_sim'] = items_cos_sim[0]
  ner_dfs.append(items_nn_df)
  cos_sim_cols.append('items_cos_sim')

# Combine dfs
df = ner_dfs[0]
for d in ner_dfs[1:]:
    df = df.merge(d, how='outer', on=['imdb_id','title','imdb_rating','short_synopsis'])
#ner_df = pd.concat(ner_dfs, join='outer')
df = df.fillna(0)
df = df.drop_duplicates()

In [11]:
# Calculate embedding input against KNN to test that it is working
print("Retrieval commenced :", datetime.now())
distances, indices = knn_model.kneighbors(plot_test_embedding)
nearest_neighbors_df = df_raw.iloc[indices.flatten()][['imdb_id','title','imdb_rating','short_synopsis']].reset_index(drop=True)
tfidf_matrix = vectorizer.fit_transform(nearest_neighbors_df['short_synopsis'].values.tolist() + [plot_test])
cos_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
nearest_neighbors_df['cosine_similarity'] = cos_sim[0]
nearest_neighbors_df = nearest_neighbors_df.sort_values(by='cosine_similarity', ascending=False)
nearest_neighbors_df = nearest_neighbors_df.reset_index(drop=True)
nearest_neighbors_df['short_synopsis'] = nearest_neighbors_df['short_synopsis'].str.capitalize()

print("Retrieval completed :", datetime.now())
#nearest_neighbors_df

Retrieval commenced : 2024-04-13 21:41:02.515179
Retrieval completed : 2024-04-13 21:41:02.642856


In [12]:
nearest_neighbors_df.head(10)

Unnamed: 0,imdb_id,title,imdb_rating,short_synopsis,cosine_similarity
0,tt0108052,Schindler's List,9.0,The relocation of polish jews from surrounding...,0.27143
1,tt0038769,Die Mörder sind unter uns,7.4,A man walks through the ruins and watches chil...,0.257827
2,tt0363163,Der Untergang,8.2,"The story jumps ahead to april 20th, 1945, hit...",0.241227
3,tt0346293,Hitler: The Rise of Evil,7.2,The opening of the film features a montage of ...,0.224526
4,tt0092978,Escape from Sobibor,7.4,A new group of ss officers arrives at a work c...,0.206437
5,tt0161860,Nirgendwo in Afrika,7.5,A former lawyer has fled to a farm in kenya wh...,0.142934
6,tt0071688,"Jakob, der Lügner",7.1,"In a ghetto in german-occupied poland, a man n...",0.118518
7,tt0108211,Stalingrad,7.5,The soldiers are enjoying themselves until the...,0.109451
8,tt0099776,Europa Europa,7.6,"The perel family decides to move to ód, centra...",0.102197
9,tt1280548,Spielzeugland,7.7,The silbersteins are deported to a concentrati...,0.084174


In [None]:
# Combine all dfs and cosine similarity values for categorical embeddings
cos_sim_cols.append('cosine_similarity')
final_df = df.merge(nearest_neighbors_df[['imdb_id','cosine_similarity','title']], how='outer', on=['imdb_id'])
final_df['title_x'] = final_df['title_x'].fillna(final_df['title_y'])
final_df.drop(columns=['title_y'], inplace=True)
final_df.rename(columns={'title_x': 'title'}, inplace=True)
final_df = final_df.fillna(0)
final_df = final_df.drop_duplicates()
final_df['cos_sim'] = final_df[cos_sim_cols].sum(axis=1)
final_df = final_df.drop(columns=cos_sim_cols)
final_df = final_df.sort_values(by='cos_sim', ascending=False)
final_df = final_df.reset_index(drop=True)
final_df = final_df[['imdb_id','title','imdb_rating','cos_sim']]
final_df.head(10)

Unnamed: 0,imdb_id,title,imdb_rating,cos_sim
0,tt0108052,Schindler's List,9.0,10.248342
1,tt0038769,Die Mörder sind unter uns,7.4,9.466176
2,tt0363163,Der Untergang,0.0,8.684178
3,tt0346293,Hitler: The Rise of Evil,7.2,8.434196
4,tt0092978,Escape from Sobibor,7.4,7.619108
5,tt0161860,Nirgendwo in Afrika,7.5,5.2604
6,tt0071688,"Jakob, der Lügner",7.1,4.470782
7,tt0108211,Stalingrad,7.5,4.029495
8,tt0099776,Europa Europa,7.6,3.884724
9,tt1280548,Spielzeugland,7.7,3.09942


In [None]:
# Display results for plot only
plotmatch_jupyter_display(nearest_neighbors_df)


Movie Details   : Rank: 1
0108052 Schindler's List 1993



Movie Details   : Rank: 2
0038769 Murderers Among Us 1946



Movie Details   : Rank: 3
0363163 Downfall 2004



Movie Details   : Rank: 4
0346293 Hitler: The Rise of Evil 2003



Movie Details   : Rank: 5
0092978 Escape from Sobibor 1987



Movie Details   : Rank: 6
0161860 Nowhere in Africa 2001



Movie Details   : Rank: 7
0071688 Jacob the Liar 1974



Movie Details   : Rank: 8
0108211 Stalingrad 1993



Movie Details   : Rank: 9
0099776 Europa Europa 1990



Movie Details   : Rank: 10
1280548 Spielzeugland 2007


In [None]:
# Display results for plot with entities
plotmatch_jupyter_display(final_df)


Movie Details   : Rank: 1
0108052 Schindler's List 1993



Movie Details   : Rank: 2
0038769 Murderers Among Us 1946



Movie Details   : Rank: 3
0363163 Downfall 2004



Movie Details   : Rank: 4
0346293 Hitler: The Rise of Evil 2003



Movie Details   : Rank: 5
0092978 Escape from Sobibor 1987



Movie Details   : Rank: 6
0161860 Nowhere in Africa 2001



Movie Details   : Rank: 7
0071688 Jacob the Liar 1974



Movie Details   : Rank: 8
0108211 Stalingrad 1993



Movie Details   : Rank: 9
0099776 Europa Europa 1990



Movie Details   : Rank: 10
1280548 Spielzeugland 2007


## Step 2 : Two Tower Retrieval Architecture
In order to build efficiencies, to avoid comparing against the entire database which will consume more resource, an excess that can be optimized, the following process is developed:
1. **Classification** | Enhancement
2. **Retrieval** | Part of base model
3. **Rank** |  Part of base model


### Step 2.1 : Base Model (KNN)

In [None]:
# Run test against KNN encoded dataset, the input dataset is generated summaries using
print("Encoding commenced :", datetime.now())

# Import data
test_data = pd.read_csv(r'/content/drive/MyDrive/user_submission_plots.csv', encoding='latin1')
df_test = test_data.copy().iloc[:120]
df_raw_test = df_raw.copy()

# Encode data
df_test['plot_synopsis_encoded'] = df_test['plot_submission'].apply(lambda x: encoder_model.encode([x][0]))
print("Encoding completed :", datetime.now())
df_test.head()

Encoding commenced : 2024-04-12 11:32:13.972794
Encoding completed : 2024-04-12 11:35:04.646859


Unnamed: 0,imdb_id,title,plot_submission,genre_classification,plot_synopsis_encoded
0,tt0108052,Schindler's List,A German businessman who becomes an unlikely h...,"['Silent', 'Independent', 'Horror', 'Dystopian...","[-0.0065914667, 0.030773917, 0.0006430667, -0...."
1,tt0109830,Forrest Gump,A simple man with a low IQ but good intentions...,"['Biographical', 'Experimental', 'Independent'...","[0.015001041, 0.010262309, -0.027762147, 0.009..."
2,tt0137523,Fight Club,A disillusioned office worker forms an undergr...,"['Experimental', 'Biographical', 'Action', 'Sa...","[0.0015462774, -0.023018353, 0.018031629, -0.0..."
3,tt1375666,Inception,In a world where technology allows people to e...,"['Historical', 'Mystery', 'Sci-Fi', 'Biographi...","[0.027722003, -0.0035285894, -0.011888047, -0...."
4,tt0099685,Goodfellas,A young man gets involved in the criminal unde...,"['Crime', 'Experimental', 'Historical', 'Adven...","[-0.008276803, 0.025773803, 0.022395318, -0.02..."


In [None]:
# Check KNN against sample population of movies in test dataset
top_matches = []
distances, indices = knn_model.kneighbors(np.stack(df_test['plot_synopsis_encoded'].values))

for i, (doc_indices, doc_distances) in enumerate(zip(indices, distances)):
    doc_result = {'test_title': df_test.iloc[i]['title'],'test_imdb_id': df_test.iloc[i]['imdb_id']}

    for j, (index, distance) in enumerate(zip(doc_indices, doc_distances)):
        title = df_r.iloc[index]['title']
        cos_sim = 1 - distance

        doc_result[f'predicted_title_{j+1}'] = title
        doc_result[f'cos_sim_{j+1}'] = cos_sim
        doc_result[f'imdb_{j+1}'] = df_r.iloc[index]['imdb_id']

    top_matches.append(doc_result)

results_df = pd.DataFrame(top_matches)
results_df.head()

Unnamed: 0,test_title,test_imdb_id,predicted_title_1,cos_sim_1,imdb_1,predicted_title_2,cos_sim_2,imdb_2,predicted_title_3,cos_sim_3,imdb_3,predicted_title_4,cos_sim_4,imdb_4,predicted_title_5,cos_sim_5,imdb_5,predicted_title_6,cos_sim_6,imdb_6,predicted_title_7,cos_sim_7,imdb_7,predicted_title_8,cos_sim_8,imdb_8,predicted_title_9,cos_sim_9,imdb_9,predicted_title_10,cos_sim_10,imdb_10
0,Schindler's List,tt0108052,Schindler's List,0.642258,tt0108052,Spielzeugland,0.61632,tt1280548,Stalingrad,0.577887,tt0108211,Hitler: The Rise of Evil,0.572442,tt0346293,"Jakob, der Lügner",0.568256,tt0071688,Der Untergang,0.565211,tt0363163,Die Mörder sind unter uns,0.563344,tt0038769,Nirgendwo in Afrika,0.557943,tt0161860,Was nützt die Liebe in Gedanken,0.544676,tt0325733,The Amateur,0.539946,tt0082005
1,Forrest Gump,tt0109830,The Good Mother,0.573413,tt0095238,Iron Eagle,0.570081,tt0091278,Working Girl,0.556672,tt0096463,Ziemia obiecana,0.551436,tt0072446,Snow Angels,0.538565,tt0453548,Map of the Human Heart,0.53842,tt0104812,One True Thing,0.53685,tt0120776,Gigantic,0.533147,tt1176251,They Gave Him a Gun,0.529956,tt0029656,The Way We Were,0.521764,tt0070903
2,Fight Club,tt0137523,Crimen ferpecto,0.58063,tt0395125,Shank,0.560853,tt1320296,Shank,0.560853,tt1288571,Gangster No. 1,0.559891,tt0210065,Fifty Dead Men Walking,0.558048,tt1097643,Magic Kid,0.557626,tt0107482,High-Rise,0.55408,tt0462335,Hard Target,0.548914,tt0107076,The Boxer,0.54804,tt0118760,The Prize Fighter,0.537072,tt0079754
3,Inception,tt1375666,The Night of the Following Day,0.555802,tt0064728,Upstream Color,0.534705,tt2084989,Inception,0.524431,tt1375666,Transit,0.5234,tt1059836,Momentum,0.519511,tt3181776,Ink,0.518333,tt1071804,Johnny Mnemonic,0.516588,tt0113481,Strange Days,0.492035,tt0114558,Diabolik,0.491396,tt0062861,Extracted,0.483734,tt1757746
4,Goodfellas,tt0099685,The Good Thief,0.657246,tt0281820,American Gangster,0.647769,tt0765429,Goodfellas,0.621898,tt0099685,(Sökarna),0.619709,tt0108275,Training Day,0.618757,tt0139654,Barood,0.610439,tt0157358,Gangster No. 1,0.606384,tt0210065,The Valachi Papers,0.605941,tt0068341,Magic Kid,0.596572,tt0107482,Fifty Dead Men Walking,0.593975,tt1097643


In [None]:
# Create lists for predicted IMDb ratings and true IMDb ratings
predicted_imdb_ratings = []
true_imdb_ratings = []
relevance_scores = []
cosine_similarity_scores = []

# Iterate over rows in results_df
for index, row in results_df.iterrows():
    predicted_imdb = [row[f'imdb_{i}'] for i in range(1, 11)]
    true_imdb = [row['test_imdb_id']] * 10
    predicted_imdb_ratings.extend(predicted_imdb)
    true_imdb_ratings.extend(true_imdb)

for index, row in results_df.iterrows():
    true_imdb = row['test_imdb_id']
    for i in range(1, 11):
        predicted_imdb = row[f'imdb_{i}']
        cos_sim = row[f'cos_sim_{i}']
        relevance_scores.append(1 if predicted_imdb == true_imdb else 0)
        cosine_similarity_scores.append(cos_sim)

true_relevance = np.zeros((len(results_df), 10))
predicted_scores = np.zeros_like(true_relevance)
for index, row in results_df.iterrows():
    true_id = row['test_imdb_id']
    for i in range(10):
        predicted_id = row[f'imdb_{i+1}']
        if predicted_id == true_id:
            true_relevance[index, i] = 1
            predicted_scores[index, i] = 1
        else:
            true_relevance[index, i] = 0
            predicted_scores[index, i] = 0

kendall_tau, _  = kendalltau(predicted_imdb_ratings, true_imdb_ratings)
spearman_rho, _ = spearmanr(predicted_imdb_ratings, true_imdb_ratings)
map_score       = average_precision_score(relevance_scores, cosine_similarity_scores)
average_ndcg    = ndcg_score(true_relevance, predicted_scores)

print("Kendall's Tau          :", kendall_tau)
print("Spearman's Rho         :", spearman_rho)
print("Mean Average Precision :", map_score)
print("Norm. Disc. Cum Gain   :", average_ndcg)

Kendall's Tau          : 0.07938646947874431
Spearman's Rho         : 0.1149993951186377
Mean Average Precision : 0.251260126877357
Norm. Disc. Cum Gain   : 0.3333333333333333


## Step 2.2 Classification
There are two types of processing we will do here

1. Zero-Shot Genre Classification
Using a zero-shot classification model `facebook/bart-large-mnli`, apply it to the plot (prompt) to produce the predicted genre, this has already been baked into the input database tower so the next step is to incorporate it.

2. NER (Named Entity Recogition)
Using SpaCy to tag

In [None]:
# Enhance the dataset tower by extracting out the preprocessed genres
print("Genre extraction commenced :", datetime.now())
df_r_gen = df_r.copy()
df_r_gen['genre_classification'] = df_r_gen['genre_classification'].str.strip("[]").str.replace("'", "").str.lower()
df_r_gen[['genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5']] = df_r_gen['genre_classification'].str.strip("[]").str.split(", ", expand=True)
df_r_gen.drop(columns=['genre_classification'], inplace=True)
print("Genre extraction commenced :", datetime.now())

df_r_gen.head()

Genre extraction commenced : 2024-04-12 11:54:07.162897
Genre extraction commenced : 2024-04-12 11:54:07.466761


Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,short_synopsis,imdb_rating,plot_synposis_encoded,characters,locations,temporal_settings,items,genre_1,genre_2,genre_3,genre_4,genre_5
0,tt2140553,The Last of Us,"In September 2013, an outbreak of a mutant Cor...",violence,train,wikipedia,"in february 2013, an outbreak of a mutant Cord...",9.7,"[-0.0195728838, 0.0011783035, -0.0196311567, -...","['Marlene', 'Hana Hayes', 'Maria', 'Firefly', ...","['Wyoming', 'Utah', 'Texas', 'Massachusetts', ...","['the twenty years', 'the winter', 'September ...",[],crime,action,adventure,epic,mystery
1,tt0180825,Metal Gear Solid,"The year is 2005. In Alaska's Fox archipeligo,...",paranormal,test,imdb,a nuclear weapon will be launched unless the d...,9.6,"[-0.057711225, -0.019903101, -0.00170467433, -...","['DARPA', 'Ocelot', 'Kenneth Baker', 'Snake', ...","['Alaska', 'Outer Heaven', 'the middle east', ...","['years ago', 'Otacon""', 'a couple of seconds'...","['M60 Mini', 'Mantis', 'Outer Heaven', 'Sniper...",fantasy,biographical,adventure,action,silent
2,tt0208155,Fainaru fantajî VII,=== Setting and characters ===\nFinal Fantasy ...,"good versus evil, insanity, murder",test,wikipedia,Final Fantasy VII takes place on a world refer...,9.5,"[0.0225034673, 0.0346259698, 0.00294106011, -0...","['Vincent', 'Red XIII', 'Cetra', 'Barret', 'th...","['Nibelheim', 'Cetra City', 'the Northern Crat...","['four years later', 'seven years earlier', 'F...","['Cid Highwind', 'Cloud', 'Sephiroth', 'Meteor...",action,adventure,experimental,epic,biographical
3,tt0365498,Metal Gear Solid 3: Snake Eater,=== Characters ===\nThe protagonist of Snake E...,"violence, dark, cruelty, murder, sadist",train,wikipedia,a young former Green Beret is assigned to the ...,9.5,"[-0.0222474299, -0.0242226552, 0.0201468002, 0...","['Sigint', 'Ocelot', 'Groznyj Grad', 'Brian Cu...","['The United States', ""the United States'"", 'U...","['World War II', '1964', 'the Cold War', 'A we...","['Legacy', 'Arlington National Cemetery']",tragedy,adventure,independent,experimental,action
4,tt0903747,Breaking Bad,Season 1A struggling high school chemistry tea...,"violence, comedy, neo noir, murder",train,imdb,Walter White (Bryan Cranston) is diagnosed wit...,9.5,"[-0.0587933809, -0.00337498565, -0.0354771614,...","['Dean Norris', 'Jonathan Banks', 'Saul', 'Jac...","['New Mexico', 'Badger', 'Albuquerque', 'New H...","['age 11', 'first days', 'four days', 'Season ...",[],dystopian,sci-fi,adventure,action,experimental


In [None]:
# Further enhance the dataset tower by applying NER
# Note: This can take a while so we want to filter upfront before starting entity extraction
print("Extraction commenced :", datetime.now())

df_r_ner = df_r.copy().head(200)
df_r_ner[['characters', 'locations', 'temporal_settings','items']] = df_r_ner['plot_synopsis'].apply(
    lambda x: pd.Series(extract_entities(x))
)
print("Extraction completed :", datetime.now())
df_r_ner.head()

Extraction commenced : 2024-04-12 11:54:28.281802
Extraction completed : 2024-04-12 11:55:15.627835


Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,short_synopsis,imdb_rating,genre_classification,plot_synposis_encoded,characters,locations,temporal_settings,items
0,tt2140553,The Last of Us,"In September 2013, an outbreak of a mutant Cor...",violence,train,wikipedia,"in february 2013, an outbreak of a mutant Cord...",9.7,"['crime', 'action', 'adventure', 'epic', 'myst...","[-0.0195728838, 0.0011783035, -0.0196311567, -...","[Jeffrey Pierce, Robin Atkin Downes, Reuben La...","[Jackson, the United States, Pittsburgh, Austi...","[the spring, three weeks ago, the fall, the ni...",[]
1,tt0180825,Metal Gear Solid,"The year is 2005. In Alaska's Fox archipeligo,...",paranormal,test,imdb,a nuclear weapon will be launched unless the d...,9.6,"['fantasy', 'biographical', 'adventure', 'acti...","[-0.057711225, -0.019903101, -0.00170467433, -...","[Fox-hound's, Houseman, Jim Houseman, Roy Camp...","[Alaska, Outer Heaven, Washington, the middle ...","[2005, 24 hours, a minute, years ago, Otacon"",...","[Sniper Wolf, the Cyborg Ninja, Outer Heaven, ..."
2,tt0208155,Fainaru fantajî VII,=== Setting and characters ===\nFinal Fantasy ...,"good versus evil, insanity, murder",test,wikipedia,Final Fantasy VII takes place on a world refer...,9.5,"['action', 'adventure', 'experimental', 'epic'...","[0.0225034673, 0.0346259698, 0.00294106011, -0...","[the Black Materia, Cid, Cait Sith, Sephiroth,...","[Cetra City, the Northern Crater, Lifestream, ...","[seven years earlier, Five hundred years later...","[Cloud Strife, Cid Highwind, Sephiroth, Jenova..."
3,tt0365498,Metal Gear Solid 3: Snake Eater,=== Characters ===\nThe protagonist of Snake E...,"violence, dark, cruelty, murder, sadist",train,wikipedia,a young former Green Beret is assigned to the ...,9.5,"['tragedy', 'adventure', 'independent', 'exper...","[-0.0222474299, -0.0242226552, 0.0201468002, 0...","[Naked Snake, Naoki Tatsuta, Special Forces, T...","[America, the United States, Soviet Union, Vol...","[1964, a few years earlier, A week, the night,...","[Legacy, Arlington National Cemetery]"
4,tt0903747,Breaking Bad,Season 1A struggling high school chemistry tea...,"violence, comedy, neo noir, murder",train,imdb,Walter White (Bryan Cranston) is diagnosed wit...,9.5,"['dystopian', 'sci-fi', 'adventure', 'action',...","[-0.0587933809, -0.00337498565, -0.0354771614,...","[Matt L. Jones, Anna Gunn, Hank, DEA, Jesse Pi...","[Albuquerque, Badger, New Mexico, New Hampshire]","[Season 4Jesse, age 11, the next several month...",[]


In [None]:
# Generating cosine scores for genre and extracted entities
entities_df = df_r_ner.copy()

entities_df['genre_classification_encoded'] = entities_df['genre_classification'].map(encoder_model.encode)
genre_distances, genre_indices = knn_model.kneighbors(np.stack(entities_df['genre_classification_encoded'].values))

entities_df['characters_encoded'] = entities_df['characters'].map(encoder_model.encode)
entities_df['locations_encoded'] = entities_df['locations'].map(encoder_model.encode)
entities_df['temporal_settings_encoded'] = entities_df['temporal_settings'].map(encoder_model.encode)
entities_df['items_encoded'] = entities_df['items'].map(encoder_model.encode)

In [None]:
# Flatten and pad
entities_df['characters_encoded'] = entities_df['characters_encoded'].map(flatten_entities)
entities_df['characters_encoded'] = entities_df['characters_encoded'].map(padding_separate)

entities_df['locations_encoded'] = entities_df['locations_encoded'].map(flatten_entities)
entities_df['locations_encoded'] = entities_df['locations_encoded'].map(padding_separate)

entities_df['temporal_settings_encoded'] = entities_df['temporal_settings_encoded'].map(flatten_entities)
entities_df['temporal_settings_encoded'] = entities_df['temporal_settings_encoded'].map(padding_separate)

entities_df['items_encoded'] = entities_df['items_encoded'].map(flatten_entities)
entities_df['items_encoded'] = entities_df['items_encoded'].map(padding_separate)

char_distances, char_indices = knn_model.kneighbors(np.stack(entities_df['characters_encoded'].values))
loc_distances, loc_indices = knn_model.kneighbors(np.stack(entities_df['locations_encoded'].values))
temp_distances, temp_indices = knn_model.kneighbors(np.stack(entities_df['temporal_settings_encoded'].values))
items_distances, items_indices = knn_model.kneighbors(np.stack(entities_df['items_encoded'].values))

df_r_ner = df_r_ner.rename(columns={"plot_synposis_encoded": "plot_synopsis_encoded"})

In [None]:
# Calculate cosine similarities

# Genres
genre_dict = {}

for i, (doc_indices, doc_distances) in enumerate(zip(genre_indices, genre_distances)):
    imdb = df_r_ner.iloc[i]['imdb_id']
    genre_dict[imdb] = []
    for j, (index, distance) in enumerate(zip(doc_indices, doc_distances)):
        cos_sim = 1 - distance
        imdb_predicted = df_r.iloc[index]['imdb_id']
        genre_dict[imdb].append((imdb_predicted, cos_sim))

# Characters
character_dict = {}

for i, (doc_indices, doc_distances) in enumerate(zip(char_indices, char_distances)):
    imdb = df_r_ner.iloc[i]['imdb_id']
    character_dict[imdb] = []
    for j, (index, distance) in enumerate(zip(doc_indices, doc_distances)):
        cos_sim = 1 - distance
        imdb_predicted = df_r.iloc[index]['imdb_id']
        character_dict[imdb].append((imdb_predicted, cos_sim))

# Locations
locations_dict = {}

for i, (doc_indices, doc_distances) in enumerate(zip(loc_indices, loc_distances)):
    imdb = df_r_ner.iloc[i]['imdb_id']
    locations_dict[imdb] = []
    for j, (index, distance) in enumerate(zip(doc_indices, doc_distances)):
        cos_sim = 1 - distance
        imdb_predicted = df_r.iloc[index]['imdb_id']
        locations_dict[imdb].append((imdb_predicted, cos_sim))

# Temporal Settings
temporal_settings_dict = {}

for i, (doc_indices, doc_distances) in enumerate(zip(temp_indices, temp_distances)):
    imdb = df_r_ner.iloc[i]['imdb_id']
    temporal_settings_dict[imdb] = []
    for j, (index, distance) in enumerate(zip(doc_indices, doc_distances)):
        cos_sim = 1 - distance
        imdb_predicted = df_r.iloc[index]['imdb_id']
        temporal_settings_dict[imdb].append((imdb_predicted, cos_sim))

# Items
items_dict = {}

for i, (doc_indices, doc_distances) in enumerate(zip(items_indices, items_distances)):
    imdb = df_r_ner.iloc[i]['imdb_id']
    items_dict[imdb] = []
    for j, (index, distance) in enumerate(zip(doc_indices, doc_distances)):
        cos_sim = 1 - distance
        imdb_predicted = df_r.iloc[index]['imdb_id']
        items_dict[imdb].append((imdb_predicted, cos_sim))


In [None]:
# Baseline KNN
top_matches = []
distances, indices = knn_model.kneighbors(np.stack(df_r_ner['plot_synopsis_encoded'].values))

for i, (doc_indices, doc_distances) in enumerate(zip(indices, distances)):
    doc_result = {'test_title': df_r_ner.iloc[i]['title'],'test_imdb_id': df_r_ner.iloc[i]['imdb_id']}

    for j, (index, distance) in enumerate(zip(doc_indices, doc_distances)):
        title = df_r.iloc[index]['title']
        cos_sim = 1 - distance

        doc_result[f'predicted_title_{j+1}'] = title
        doc_result[f'cos_sim_{j+1}'] = cos_sim
        doc_result[f'imdb_{j+1}'] = df_r.iloc[index]['imdb_id']

    top_matches.append(doc_result)

results_df = pd.DataFrame(top_matches)

In [None]:
# Combine baseline KNN with genre and entities
final_df_comb = combine_cosine(results_df)

In [None]:
# Metrics for final predictions

# Create lists for predicted IMDb ratings and true IMDb ratings
predicted_imdb_ratings = []
true_imdb_ratings = []
relevance_scores = []
cosine_similarity_scores = []

# Iterate over rows in results_df
for index, row in final_df_comb.iterrows():
    predicted_imdb = [row[f'imdb_{i}'] for i in range(1, 11)]
    true_imdb = [row['test_imdb_id']] * 10
    predicted_imdb_ratings.extend(predicted_imdb)
    true_imdb_ratings.extend(true_imdb)

for index, row in final_df_comb.iterrows():
    true_imdb = row['test_imdb_id']
    for i in range(1, 11):
        predicted_imdb = row[f'imdb_{i}']
        cos_sim = row[f'cos_sim_{i}']
        relevance_scores.append(1 if predicted_imdb == true_imdb else 0)
        cosine_similarity_scores.append(cos_sim)

true_relevance = np.zeros((len(final_df_comb), 10))
predicted_scores = np.zeros_like(true_relevance)
for index, row in results_df.iterrows():
    true_id = row['test_imdb_id']
    for i in range(10):
        predicted_id = row[f'imdb_{i+1}']
        if predicted_id == true_id:
            true_relevance[index, i] = 1
            predicted_scores[index, i] = 1
        else:
            true_relevance[index, i] = 0
            predicted_scores[index, i] = 0

kendall_tau, _  = kendalltau(predicted_imdb_ratings, true_imdb_ratings)
spearman_rho, _ = spearmanr(predicted_imdb_ratings, true_imdb_ratings)
map_score       = average_precision_score(relevance_scores, cosine_similarity_scores)
average_ndcg    = ndcg_score(true_relevance, predicted_scores)

print("Kendall's Tau          :", kendall_tau)
print("Spearman's Rho         :", spearman_rho)
print("Mean Average Precision :", map_score)
print("Norm. Disc. Cum Gain   :", average_ndcg)

Kendall's Tau          : 0.18792046340194735
Spearman's Rho         : 0.26713404305361266
Mean Average Precision : 0.7853015725423149
Norm. Disc. Cum Gain   : 0.995
