In [None]:
# Imports and shared zero-shot classifier setup for tagging movie synopses
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# Build the zero-shot classification pipeline once so every helper can share it.
# NOTE: 'df' (with a 'short_synopsis' column) and 'movie_tags' are assumed to be
# defined elsewhere before the helpers that use them are called.
classifier = pipeline(
    task='zero-shot-classification',
    model='facebook/bart-large-mnli',
)

def classify_short_synopsis(short_synopsis, tags):
    """Run the multi-label zero-shot classifier on one synopsis.

    Args:
        short_synopsis: Text handed to the module-level ``classifier``.
        tags: Candidate labels handed to the classifier.

    Returns:
        A list holding at most the first ten entries of the classifier's
        ``'labels'`` output (the transformers documentation describes these
        as ordered from most to least likely).
    """
    max_tags = 10
    ranked_labels = classifier(short_synopsis, tags, multi_label=True)['labels']
    # A slice simply stops at the end, so fewer than ten labels is not an error.
    return list(ranked_labels[:max_tags])

# Step 4: For each movie, find the 10 tags with the highest similarity
def get_top_tags(similarity_scores, tags, top_n=10):
    """Return the tags with the highest similarity scores, best first.

    Args:
        similarity_scores: Object exposing ``argsort()`` that yields integer
            positions in ascending score order (presumably a 1-D NumPy
            array with one score per tag -- confirm against the caller).
        tags: Sequence indexed by those positions.
        top_n: Maximum number of tags to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of up to ``top_n`` entries of ``tags``, highest score first.
    """
    if top_n <= 0:
        # ``[-0:]`` is the same slice as ``[0:]``, so without this guard a
        # request for zero tags would hand back every tag.
        return []
    top_indices = similarity_scores.argsort()[-top_n:][::-1]  # best score first
    return [tags[i] for i in top_indices]

def classify_synopsis(synopsis):
    """
    Return the first five labels the zero-shot classifier gives a synopsis.

    The candidate labels are read from the module-level ``movie_tags`` list
    at call time, and the module-level ``classifier`` does the scoring.
    """
    prediction = classifier(synopsis, movie_tags)
    ranked_labels = prediction['labels']
    return ranked_labels[:5]



  from .autonotebook import tqdm as notebook_tqdm





In [None]:
# Read data_tagged_imdb_rating.csv and keep only its first 3600 rows.
df = pd.read_csv(
    r"C:\Users\mywil\OneDrive\Documents\Wilpo's Coding Projects\PlotMatch\data_tagged_imdb_rating.csv"
).head(3600)

# Helper function to decrease processing time

def classify_in_parallel(df, num_workers=30, classify_fn=None):
    """
    Classify every entry of ``df['short_synopsis']`` using a thread pool.

    Parameters
    ----------
    df : DataFrame-like
        Anything whose ``'short_synopsis'`` entry can be iterated.
    num_workers : int, default 30
        Maximum number of worker threads.
    classify_fn : callable, optional
        Function applied to each synopsis. Defaults to the module-level
        ``classify_synopsis``, looked up when this function is called.

    Returns
    -------
    list
        One result per synopsis, in the same order as
        ``df['short_synopsis']``, so the list can be assigned straight back
        as a column. Any exception raised by a worker propagates to the caller.
    """
    if classify_fn is None:
        classify_fn = classify_synopsis

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # executor.map yields results in submission order. Collecting them
        # with as_completed() returned them in completion order instead,
        # which attached labels to the wrong rows.
        return list(executor.map(classify_fn, df['short_synopsis']))

# Candidate labels passed to the zero-shot classifier (five per row).
movie_tags = [
    "Action", "Adventure", "Animation", "Biographical", "Comedy",
    "Crime", "Documentary", "Drama", "Epic", "Fantasy",
    "Historical", "Horror", "Musical", "Mystery", "Noir",
    "Paranormal", "Romance", "Sci-Fi", "Thriller", "War",
    "Western", "Zombie", "Superhero", "Silent", "Independent",
    "Experimental", "Cult", "Political", "Satire", "Tragedy",
    "Dystopian", "Erotic", "Espionage", "Fairy Tale", "Heist",
    "Martial Arts", "Mockumentary", "Monster", "Mythology", "Post-Apocalyptic",
    "Road Movie", "Space Opera", "Steampunk", "Detective", "Supernatural",
    "Time Travel", "Cyberpunk", "Cooking", "Dance", "Police",
]

# Classify every synopsis in df with the thread-pool helper (default worker
# count) and store the per-row label lists in a new column.
# NOTE(review): the column assignment is positional, so it relies on
# classify_in_parallel returning results in the same order as df's rows -- verify.
results = classify_in_parallel(df)
df['genre_classification'] = results

In [None]:
# Tag the CSV one chunk at a time, printing a wall-clock timestamp before and
# after each chunk so the processing time per chunk can be read off the output.
chunksize = 600

# Materialise all chunks, then drop the first 15 of them.
chunks_list = list(
    pd.read_csv(
        r"C:\Users\mywil\OneDrive\Documents\Wilpo's Coding Projects\PlotMatch\data_tagged_imdb_rating.csv",
        chunksize=chunksize,
    )
)[15:]

# Output files are numbered from count + 1, i.e. the first one written is
# tagged_results_9.csv (presumably continuing an earlier run -- confirm).
count = 8
for count, chunk_df in enumerate(chunks_list, start=count + 1):
    current_time = datetime.now().strftime("%H:%M:%S")
    print("Current Time:", current_time)

    results = classify_in_parallel(chunk_df)
    chunk_df['genre_classification'] = results

    current_time = datetime.now().strftime("%H:%M:%S")
    print("Current Time:", current_time)

    chunk_df.to_csv(f'tagged_results_{count}.csv', index=False)

Current Time: 22:36:10
Current Time: 01:43:19
Current Time: 01:43:20
Current Time: 04:53:17
Current Time: 04:53:18
Current Time: 07:50:04
Current Time: 07:50:05
Current Time: 10:59:16
Current Time: 10:59:16
Current Time: 13:55:26
Current Time: 13:55:26
Current Time: 17:04:30
Current Time: 17:04:30
Current Time: 20:01:12
Current Time: 20:01:12
Current Time: 23:11:15
Current Time: 23:11:15
Current Time: 02:08:47
Current Time: 02:08:47
Current Time: 03:43:16


In [None]:
# Write whatever DataFrame is currently bound to chunk_df (left over from the
# chunk loop) to tagged_results_0.csv, then report completion.
chunk_df.to_csv(path_or_buf="tagged_results_0.csv", index=False)
print("Completed")

Completed


In [None]:
# Sequentially tag each synopsis with its first five zero-shot labels via the
# shared helper, then persist the whole frame.
df['tags_genre'] = df['short_synopsis'].apply(classify_synopsis)
df.to_csv("data_tagged_4.csv", index=False)

# Print the wall-clock time at which the run finished.
current_time = datetime.now().strftime("%H:%M:%S")
print("Current Time:", current_time)