In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv('wiki_movie_plots_deduped.csv')
df.dropna(inplace=True)
df.drop_duplicates(subset=['Plot'],inplace=True)
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab..."
13,1907,Daniel Boone,American,Wallace McCutcheon and Ediwin S. Porter,"William Craven, Florence Lawrence",biographical,https://en.wikipedia.org/wiki/Daniel_Boone_(19...,Boone's daughter befriends an Indian maiden as...
14,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...
15,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...
16,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...


In [72]:
import time

# Define the fetch_movie_info function
def fetch_movie_info(dataframe_idx):
    # Extract information from the DataFrame
    info = df.iloc[dataframe_idx]

    # Create a dictionary with movie details
    meta_dict = {
        'Title': info['Title'],
        'Year': info['Release Year'],  # Use 'Release Year' as the column name
        'Genre': info['Genre']
    }
    return meta_dict

# Define the search function
def search(query, K, index, model, Plot=None, Genre=None, ReleaseYear=None):
    # Record the starting time for performance measurement
    t = time.time()

    # Encode the search query using the provided model
    query_vector = model.encode([query])

    # Perform a similarity search with the Faiss index to find the top K results
    top_k = index.search(query_vector, K)

    # Calculate and print the total time taken for the search
    print('>>>> Results in Total Time: {}'.format(time.time() - t))

    # Get the IDs of the top results
    top_k_ids = top_k[1].tolist()[0]
    top_k_ids = list(np.unique(top_k_ids))

    # Initialize an empty list to store the search results
    results = []

    # Iterate through the top result IDs
    for idx in top_k_ids:
        info = df.iloc[idx]

        # Check if a specific release year is provided and if it matches the movie's release year
        if ReleaseYear and ReleaseYear != info['Release Year']:
            continue  # Skip movies that don't match the specified release year

        # Create a dictionary with movie details
        meta_dict = {
            'Title': info['Title'],
            'Year': info['Release Year'],  
            'Genre': info['Genre']
        }

        # Include the movie's plot if available
        if Plot:
            meta_dict['Plot'] = info['Plot']

        # Add the movie details to the results list
        results.append(meta_dict)

    return results


In [4]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets
from torch import nn
import os
import random

# Initialize an empty list to store training examples
train_examples = []

# Iterate through the rows of the DataFrame
for i in range(len(df)):
    # Extract the 'Title' and 'Plot' information from the DataFrame
    query, paragraph = df['Title'].iloc[i], df['Plot'].iloc[i]
    
    # Create an InputExample object with the 'Title' as query and 'Plot' as paragraph
    train_examples.append(InputExample(texts=[query, paragraph]))

# Shuffle the list of training examples to ensure randomness
random.shuffle(train_examples)


In [5]:
train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=8)

# Now we create a SentenceTransformer model from scratch
word_emb = models.Transformer('sentence-transformers/msmarco-distilbert-base-dot-prod-v3')
#i use distlibert
pooling = models.Pooling(word_emb.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_emb, pooling])

train_loss = losses.MultipleNegativesRankingLoss(model)


Downloading (…)lve/main/config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
# Tune the model

# Define the number of training epochs
num_epochs = 3

# Calculate the number of warm-up steps as 10% of the total training steps
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

# Fine-tune the model by fitting it to the training data
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, show_progress_bar=True)


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4074 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4074 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4074 [00:00<?, ?it/s]

In [9]:
import faiss
# Encode the 'Plot' column of the DataFrame using the model
encoded_data = model.encode(df.Plot.tolist())

# Convert the encoded data to float32 for Faiss compatibility
encoded_data = np.asarray(encoded_data.astype('float32'))

# Initialize a Faiss index with Inner Product (IP) similarity
index = faiss.IndexIDMap(faiss.IndexFlatIP(768))

# Add the encoded data to the Faiss index with unique IDs
index.add_with_ids(encoded_data, np.array(range(0, len(df)))

# Write the Faiss index to a file for future use
faiss.write_index(index, 'movie_plot.index')

In [10]:
from pprint import pprint

query="Artificial Intelligence based action movie"
results=search(query, top_k=5, index=index, model=model)

print("\n")
for result in results:
    print('\t',result)

>>>> Results in Total Time: 0.5532848834991455


	 {'Title': 'Short Circuit'}
	 {'Title': 'Remote Control'}
	 {'Title': 'Antitrust'}
	 {'Title': 'How to Make a Monster'}
	 {'Title': 'Armed Response'}


In [12]:
model.save('C:/Users/thame/Desktop/m2 miv/test/movie_search_model')

In [13]:
new_model = SentenceTransformer('C:/Users/thame/Desktop/m2 miv/test/movie_search_model')

In [76]:
query = "romance movie"
K = 10
Genre = "Romance"
ReleaseYear = 2014

results = search(query, K, index, model, Plot, Genre, ReleaseYear)

# Check if there are any results
if not results:
    print("No movies found with the given criteria.")
else:
    # Print the results
    for result in results:
        print("Title:", result['Title'])
        print("Release Year:", result['Year'])
        print("Genre:", result['Genre'])
        #if 'Plot' in result:
            #print("Plot:", result['Plot'])
        if 'ReleaseYear' in result:
            print("ReleaseYear:", result['ReleaseYear'])
        print("\n")


>>>> Results in Total Time: 0.036551713943481445
Title: My Love, My Bride
Release Year: 2014
Genre: unknown


Title: Red Carpet
Release Year: 2014
Genre: unknown




In [77]:
query = "hacker"
K = 10
Genre = "action"
ReleaseYear = 2009

results = search(query, K, index, model, Plot, Genre, ReleaseYear)

# Check if there are any results
if not results:
    print("No movies found with the given criteria.")
else:
    # Print the results
    for result in results:
        print("Title:", result['Title'])
        print("Release Year:", result['Year'])
        print("Genre:", result['Genre'])
        #if 'Plot' in result:
            #print("Plot:", result['Plot'])
        if 'ReleaseYear' in result:
            print("ReleaseYear:", result['ReleaseYear'])
        print("\n")



>>>> Results in Total Time: 0.03142285346984863
Title:  Echelon Conspiracy
Release Year: 2009
Genre: action




In [78]:
query = "hacker"
K = 10
Genre = "action"
ReleaseYear = 2000

results = search(query, K, index, model, Plot, Genre, ReleaseYear)

# Check if there are any results
if not results:
    print("No movies found with the given criteria.")
else:
    # Print the results
    for result in results:
        print("Title:", result['Title'])
        print("Release Year:", result['Year'])
        print("Genre:", result['Genre'])
        #if 'Plot' in result:
            #print("Plot:", result['Plot'])
        if 'ReleaseYear' in result:
            print("ReleaseYear:", result['ReleaseYear'])
        print("\n")



>>>> Results in Total Time: 0.028706789016723633
No movies found with the given criteria.
