In [6]:
import pandas as pd

# Load movie and rating datasets
movies = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')

# Add a synthetic 'description' column for demonstration (real-world metadata could be sourced elsewhere)
movies['description'] = movies['genres'].apply(lambda x: f"A {x} movie loved by fans.")


In [7]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1
Note: you may need to restart the kernel to use updated packages.


In [8]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained LLM for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for movie descriptions
movies['embedding'] = list(model.encode(movies['description']))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/853 [00:00<?, ?it/s]

In [9]:
movies

Unnamed: 0,movieId,title,genres,description,embedding
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,A Adventure|Animation|Children|Comedy|Fantasy ...,"[-0.05705645, 0.051417504, -0.01377575, 0.0237..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,A Adventure|Children|Fantasy movie loved by fans.,"[-0.045746617, 0.0800787, -0.009502576, 0.0294..."
2,3,Grumpier Old Men (1995),Comedy|Romance,A Comedy|Romance movie loved by fans.,"[-0.09438177, 0.001381491, -0.009171218, 0.061..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,A Comedy|Drama|Romance movie loved by fans.,"[-0.09119257, -0.0101926755, -0.020657504, 0.0..."
4,5,Father of the Bride Part II (1995),Comedy,A Comedy movie loved by fans.,"[-0.09537501, 0.0034517772, -0.04570984, 0.053..."
...,...,...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy,A Comedy movie loved by fans.,"[-0.09537501, 0.0034517772, -0.04570984, 0.053..."
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,A Comedy movie loved by fans.,"[-0.09537501, 0.0034517772, -0.04570984, 0.053..."
27275,131258,The Pirates (2014),Adventure,A Adventure movie loved by fans.,"[-0.06898302, 0.083500504, -0.011094016, 0.020..."
27276,131260,Rentun Ruusu (2001),(no genres listed),A (no genres listed) movie loved by fans.,"[-0.05030838, -0.024017714, -0.032176893, 0.01..."


In [13]:
average_ratings = pd.merge(ratings, movies, on='movieId').groupby('title')['rating'].mean()

In [14]:
average_ratings

title
#chicagoGirl: The Social Network Takes on a Dictator (2013)    3.666667
$ (Dollars) (1971)                                             2.833333
$5 a Day (2008)                                                2.871795
$9.99 (2008)                                                   3.009091
$ellebrity (Sellebrity) (2012)                                 2.000000
                                                                 ...   
À propos de Nice (1930)                                        3.125000
Árido Movie (2005)                                             2.000000
Åsa-Nisse - Wälkom to Knohult (2011)                           1.500000
Üvegtigris (2001)                                              3.000000
貞子3D (2012)                                                    1.500000
Name: rating, Length: 26729, dtype: float64

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_movies_llm(input_movie, num_recommendations=5):
    # Find the embedding for the input movie
    input_idx = movies[movies['title'] == input_movie].index[0]
    input_embedding = movies.loc[input_idx, 'embedding']

    # Compute cosine similarity with all other movies
    similarities = cosine_similarity(
        [input_embedding],
        np.vstack(movies['embedding'])
    )[0]

    # Get indices of top similar movies
    similar_indices = similarities.argsort()[-(num_recommendations+1):][::-1][1:]

    # Filter results by ratings (optional)
    recommended_movies = movies.iloc[similar_indices]
    high_rated = recommended_movies[recommended_movies['title'].map(average_ratings) >= 4.0]

    return high_rated.head(num_recommendations)[['title', 'genres']]

# Example usage
print(recommend_movies_llm('Toy Story (1995)'))


Empty DataFrame
Columns: [title, genres]
Index: []


In [21]:
from transformers import pipeline

# Load a pre-trained GPT-like model
generator = pipeline('text-generation', model='gpt2')  # Use OpenAI or Hugging Face APIs

# Generate text
def generate_explanation(input_text):
    result = generator(input_text, max_length=150, num_return_sequences=1)
    return result[0]['generated_text']

# Example usage
input_text = "Explain why these movies are recommended: Toy Story (1995), Finding Nemo, and The Incredibles."
print(generate_explanation(input_text))

def explain_recommendations(input_movie, recommendations):
    explanation = f"I recommend these movies because you liked {input_movie}: \n"
    for movie in recommendations['title']:
        explanation += f"- {movie}: {movies[movies['title'] == movie]['description'].values[0]}\n"

    # Generate refined explanations using LLM
    refined_explanation = generator(explanation, max_length=150)[0]['generated_text']
    return refined_explanation

# Example usage
recommended = recommend_movies_llm('Toy Story (1995)')
print(explain_recommendations('Toy Story (1995)', recommended))


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Explain why these movies are recommended: Toy Story (1995), Finding Nemo, and The Incredibles.

"You need only understand how often someone who can stand still in a movie, or in order to get a good job in a movie, takes a lot of time for a change of heart, and a lot of patience … which is just as bad with children as with adults, no?" says the psychologist.

That's a bit of a long list, but I want to finish off with one more observation. One is that some of the more interesting movies and television shows people consume are more popular than others.

If you want to live in a time period where kids go to school in the late 2000s
I recommend these movies because you liked Toy Story (1995): 

4. An American Werewolf In Miami __________________

3. The Night Manager __________________ (2002): __________________

2. The Graduate __________________

1. The Departed __________________

______________

1. The Wizard Of Oz.

http://www.mediafire.com/?w5e5jjnhnqw (2012) -- An American Werewolf I