In [4]:
import pandas as pd

# Folder that holds the CSV files
data_folder_path = 'Data/'


def _load_csv(csv_name):
    """Read one CSV file from the data folder into a DataFrame."""
    return pd.read_csv(data_folder_path + csv_name)


# Load the four source tables
movie_df = _load_csv('movie.csv')
rating_df = _load_csv('rating.csv')
genome_tags_df = _load_csv('genome_tags.csv')
tags_df = _load_csv('tag.csv')  # the additional tag.csv file

# Preview the first rows of every table exactly as loaded
for heading, frame in (
    ("Movies DataFrame:", movie_df),
    ("\nRatings DataFrame:", rating_df),
    ("\nGenome Tags DataFrame:", genome_tags_df),
    ("\nTags DataFrame:", tags_df),
):
    print(heading)
    print(frame.head())

# Fill missing values in place. A scalar fill value is applied to every
# column of the table, so it should suit the column types of that table.
for frame, fill_value in (
    (movie_df, 'unknown'),
    (rating_df, 0),  # assumes 0 is a sensible fill value for ratings
    (genome_tags_df, 'unknown'),
    (tags_df, 'unknown'),
):
    frame.fillna(fill_value, inplace=True)

# Preview the same tables again now that the gaps are filled
for heading, frame in (
    ("\nMovies DataFrame after filling missing values:", movie_df),
    ("\nRatings DataFrame after filling missing values:", rating_df),
    ("\nGenome Tags DataFrame after filling missing values:", genome_tags_df),
    ("\nTags DataFrame after filling missing values:", tags_df),
):
    print(heading)
    print(frame.head())


Movies DataFrame:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings DataFrame:
   userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40

Genome Tags DataFrame:
   tagId           tag
0      1           007
1      2 

Merging All Dataframes

In [5]:
# Requires movie_df, rating_df and tags_df from the loading cell.

# Stage 1: attach ratings to movies, keeping rows that appear on either side.
movie_rating_merged = movie_df.merge(rating_df, on='movieId', how='outer')

# Stage 2: attach tags, matching on both the movie and the user.
# Both inputs carry a `timestamp` column, so pandas suffixes them
# as timestamp_x / timestamp_y in the result.
full_merged_df = movie_rating_merged.merge(
    tags_df,
    on=['movieId', 'userId'],
    how='outer',
)

# Quick look at the top of the combined table to verify the merge
print(full_merged_df.head())


   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating          timestamp_x  tag timestamp_y  
0     3.0     4.0  1999-12-11 13:36:47  NaN         NaN  
1     6.0     5.0  1997-03-13 17:50:52  NaN         NaN  
2     8.0     4.0  1996-06-05 13:37:51  NaN         NaN  
3    10.0     4.0  1999-11-25 02:44:47  NaN         NaN  
4    11.0     4.5  2009-01-02 01:13:41  NaN         NaN  


In [8]:
# Remove the two timestamp columns from the merged DataFrame (full_merged_df).
# errors='ignore' makes this cell safe to re-run: the result is assigned back
# to the same name, so on a second run the columns are already gone and a
# plain drop() would raise KeyError.
full_merged_df = full_merged_df.drop(
    columns=['timestamp_x', 'timestamp_y'],
    errors='ignore',
)


Applying NLP to the genres and tags

In [9]:
# Show the merged frame; as the cell's last expression it is rendered by the
# notebook, and pandas elides the middle rows of a frame this long.
full_merged_df

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,
...,...,...,...,...,...,...
20340273,81932,,,138436.0,,Mark Wahlberg
20340274,81932,,,138436.0,,pigs
20340275,81932,,,138436.0,,prostitution
20340276,77154,,,138437.0,,"This movie should have been called \\""How Coca..."


In [10]:
# Tally the missing entries in each column of full_merged_df
nan_counts = full_merged_df.isna().sum(axis=0)

# Report the per-column totals
print(nan_counts)

movieId           0
title         74119
genres        74119
userId          534
rating        74653
tag        19874714
dtype: int64


In [11]:
# Keep only the rows in which every column has a value; the source frame is
# left untouched and the result gets its own name.
full_merged_df_cleaned = full_merged_df.dropna(axis=0, how='any')

In [12]:
# Show the frame that remains after dropping rows with missing values;
# the original row labels are retained, so the index is no longer contiguous.
full_merged_df_cleaned

Unnamed: 0,movieId,title,genres,userId,rating,tag
585,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1644.0,3.5,Watched
619,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1741.0,4.0,computer animation
620,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1741.0,4.0,Disney animated feature
621,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1741.0,4.0,Pixar animation
622,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1741.0,4.0,TÃ©a Leoni does not star in this movie
...,...,...,...,...,...,...
20266152,131258,The Pirates (2014),Adventure,28906.0,2.5,bandits
20266153,131258,The Pirates (2014),Adventure,28906.0,2.5,Korea
20266154,131258,The Pirates (2014),Adventure,28906.0,2.5,mutiny
20266155,131258,The Pirates (2014),Adventure,28906.0,2.5,pirates


Generating embeddings for the tags, genres, and titles. Since these are all text-based, they should in theory carry semantic meaning that an embedding model can capture.

In [13]:
from transformers import BertModel, BertTokenizer
import torch

# Checkpoint shared by the tokenizer and the model
pretrained_name = 'bert-base-uncased'

# Tokenizer (vocabulary) that belongs to the checkpoint
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

# Pre-trained encoder weights
model = BertModel.from_pretrained(pretrained_name)

# Switch to evaluation mode; eval() is the cell's last expression,
# so the notebook echoes the module structure below the cell.
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
# Function to generate embeddings in batches
def generate_embeddings(model, encoded_inputs, batch_size=10):
    """Return one mean-pooled embedding per input sequence, computed in batches.

    The mean is taken over the positions whose attention-mask entry is
    non-zero, so padding added to equalise sequence lengths does not dilute
    the embedding and the result does not depend on how much padding a
    sequence happened to receive.

    Args:
        model: Callable module invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes ``last_hidden_state``. Inputs are moved to
            the device of its first parameter.
        encoded_inputs: Mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors whose first dimension indexes the sequences (presumably
            tokenizer output created with ``return_tensors='pt'`` -- confirm
            against the caller).
        batch_size: Number of sequences sent through the model per forward pass.

    Returns:
        A CPU tensor holding the pooled embeddings, one row per input sequence,
        in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or there are no
            sequences to embed.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    input_ids = encoded_inputs['input_ids']
    attention_mask = encoded_inputs['attention_mask']
    if len(input_ids) == 0:
        raise ValueError("encoded_inputs contains no sequences to embed")

    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device

    embeddings = []
    for start in range(0, len(input_ids), batch_size):
        # Move one batch at a time so device memory scales with batch_size,
        # not with the size of the whole dataset.
        batch_input_ids = input_ids[start:start + batch_size].to(device)
        batch_attention_mask = attention_mask[start:start + batch_size].to(device)

        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask)

            # Masked mean pooling. Assumes last_hidden_state is
            # (batch, seq, hidden) and the mask is (batch, seq), as for BERT.
            hidden = outputs.last_hidden_state
            mask = batch_attention_mask.unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            # clamp avoids division by zero for a fully masked row.
            token_count = mask.sum(dim=1).clamp(min=1)
            batch_embeddings = token_sum / token_count

        embeddings.append(batch_embeddings.cpu())

    # Concatenate all batch embeddings
    embeddings = torch.cat(embeddings, dim=0)
    return embeddings

# Generate embeddings
# NOTE(review): `encoded_inputs` is not defined in any cell visible here --
# presumably it is the tokenizer output for the text columns (a mapping with
# 'input_ids' and 'attention_mask'); confirm it is created before this cell
# so the notebook survives Restart & Run All.
embeddings = generate_embeddings(model, encoded_inputs)
