In [1]:
import os
import re
import csv
import glob
import zipfile
import pandas as pd
import numpy as np
from io import BytesIO
import requests
from sentence_transformers import SentenceTransformer, util

2026-02-17 19:12:53.557611: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771355573.784420      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771355573.849398      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771355574.382653      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771355574.382711      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771355574.382715      55 computation_placer.cc:177] computation placer alr

In [2]:
class MovieDataProcessor:
    """Prepare MovieLens and Netflix Prize interaction data in a shared movie-id space.

    The processor downloads/extracts MovieLens, converts the raw Netflix Prize
    text files to a flat CSV, aligns MovieLens movie ids to Netflix movie ids
    with sentence-embedding similarity, and turns interaction frames into
    per-user, time-ordered movie sequences.
    """

    def __init__(self, movielens_name='ml-32m', target_dir='data/movielens',
                 model_name='all-MiniLM-L6-v2'):
        """Store dataset locations and load the sentence-embedding model.

        Args:
            movielens_name: MovieLens archive name (without ``.zip``).
            target_dir: Directory the archive is extracted into.
            model_name: SentenceTransformer model used for title matching.
        """
        self.ml_name = movielens_name
        self.target_dir = target_dir
        self.ml_path = os.path.join(target_dir, movielens_name)
        self.model = SentenceTransformer(model_name)

    def download_movielens(self, force=False, timeout=60):
        """Download and extract the MovieLens archive into ``target_dir``.

        The archive is streamed to disk in chunks instead of being buffered in
        memory, and HTTP errors are raised before any unzip attempt.

        Args:
            force: Re-download even when ``self.ml_path`` already has content.
            timeout: Seconds passed to ``requests.get`` as the timeout.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # Skip the (large) download when a previous run already extracted it.
        if not force and os.path.isdir(self.ml_path) and os.listdir(self.ml_path):
            print(f"{self.ml_name} already present at {self.ml_path}; skipping download.")
            return

        url = f'https://files.grouplens.org/datasets/movielens/{self.ml_name}.zip'
        os.makedirs(self.target_dir, exist_ok=True)
        archive_path = os.path.join(self.target_dir, f'{self.ml_name}.zip')

        print(f"Downloading {self.ml_name}...")
        try:
            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(archive_path, 'wb') as archive:
                    for chunk in r.iter_content(chunk_size=1 << 20):
                        archive.write(chunk)
            with zipfile.ZipFile(archive_path) as z:
                z.extractall(self.target_dir)
        finally:
            # The zip is only a transport artefact; never leave a partial one behind.
            if os.path.exists(archive_path):
                os.remove(archive_path)
        print("MovieLens extracted.")

    def process_netflix_raw(self, input_pattern, output_csv, nrows=5_000_000):
        """Convert Netflix ``combined_data`` text files to a flat CSV and load a sample.

        Each input file consists of ``<movie_id>:`` header lines followed by
        ``user_id,rating,YYYY-MM-DD`` rows. Every row is written to
        ``output_csv`` as ``movie_id,user_id,YYYYMMDD``.

        Args:
            input_pattern: Glob pattern selecting the raw Netflix files.
            output_csv: Path of the CSV to (over)write.
            nrows: Number of leading CSV rows to load; ``None`` loads all rows.

        Returns:
            DataFrame with ``movie_id``, ``user_id``, ``date`` and a ``rating``
            column that is filled with the placeholder value 0.
        """
        # glob order is filesystem dependent; sort so the nrows sample is reproducible.
        files = sorted(glob.glob(input_pattern))
        print(f"Converting {len(files)} Netflix files...")

        skipped = 0
        with open(output_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['movie_id', 'user_id', 'date'])
            for file_path in files:
                with open(file_path, 'r') as infile:
                    movie_id = None
                    for line in infile:
                        line = line.strip()
                        if not line:
                            continue
                        if line.endswith(':'):
                            movie_id = line[:-1]
                            continue
                        parts = line.split(',')
                        # A row before any header, or with missing fields, cannot be
                        # attributed to a movie/date; count it instead of crashing.
                        if movie_id is None or len(parts) < 3:
                            skipped += 1
                            continue
                        # parts[2] is YYYY-MM-DD, we strip '-'
                        writer.writerow([movie_id, parts[0], parts[2].replace('-', '')])
        if skipped:
            print(f"Skipped {skipped} malformed Netflix lines.")

        # Load a sample/chunk for processing as requested
        df = pd.read_csv(
            output_csv,
            dtype={'movie_id': 'uint16', 'user_id': 'uint32', 'date': 'uint32'},
            nrows=nrows,
        )
        df['rating'] = 0  # Default placeholder
        return df

    @staticmethod
    def _split_title_year(title):
        """Split a MovieLens title such as ``"Toy Story (1995)"`` into ``("Toy Story", 1995)``.

        Returns ``(title_text, None)`` when the title has no trailing ``(YYYY)``.
        """
        text = str(title)
        match = re.search(r'\((\d{4})\)\s*$', text)
        if match is None:
            return text.strip(), None
        return text[:match.start()].strip(), int(match.group(1))

    def get_unified_id_map(self, netflix_titles, ml_movies,
                           score_threshold=0.85, max_year_drift=1):
        """Match MovieLens ids to Netflix ids using semantic title similarity.

        For every Netflix title the single most similar MovieLens title is
        looked up. The pair is accepted when the similarity score exceeds
        ``score_threshold`` and the release years agree within
        ``max_year_drift`` (a missing year on either side is not a mismatch).
        When several Netflix titles select the same MovieLens movie, the
        highest-scoring pair wins. MovieLens movies without a match receive
        fresh ids above the largest Netflix id, assigned in ascending
        ``movieId`` order. The input frames are not modified.

        Args:
            netflix_titles: Frame with ``movie_id``, ``year`` and ``title`` columns.
            ml_movies: Frame with ``movieId`` and ``title`` columns.
            score_threshold: Minimum (exclusive) similarity score for a match.
            max_year_drift: Largest accepted absolute difference in release year.

        Returns:
            dict mapping every MovieLens ``movieId`` to a unified movie id.
        """
        print("Aligning Movie IDs via Semantic Search...")

        # Clean ML titles: "Toy Story (1995)" -> ("Toy Story", 1995)
        parsed = [self._split_title_year(t) for t in ml_movies['title']]
        ml_titles = [title for title, _ in parsed]
        ml_years = [year for _, year in parsed]
        ml_ids = ml_movies['movieId'].tolist()

        net_titles = netflix_titles['title'].astype(str).tolist()
        net_years = pd.to_numeric(netflix_titles['year'], errors='coerce').tolist()
        net_ids = netflix_titles['movie_id'].tolist()

        # ml_id -> (score, netflix_id); keeps only the strongest claim per ML movie.
        best_match = {}
        if net_titles and ml_titles:
            # Embeddings
            net_emb = self.model.encode(net_titles, convert_to_tensor=True)
            ml_emb = self.model.encode(ml_titles, convert_to_tensor=True)

            hits = util.semantic_search(net_emb, ml_emb, top_k=1)

            for i, hit in enumerate(hits):
                if not hit:
                    continue
                best = hit[0]
                score = best['score']
                if score <= score_threshold:
                    continue
                corpus_idx = best['corpus_id']
                # Check year parity (allow a small drift between catalogues)
                net_yr = net_years[i]
                ml_yr = ml_years[corpus_idx]
                years_known = not pd.isna(net_yr) and ml_yr is not None
                if years_known and abs(net_yr - ml_yr) > max_year_drift:
                    continue
                ml_id = ml_ids[corpus_idx]
                previous = best_match.get(ml_id)
                if previous is None or score > previous[0]:
                    best_match[ml_id] = (score, net_ids[i])

        id_map = {ml_id: net_id for ml_id, (_, net_id) in best_match.items()}

        # Handle unmatched ML movies with new IDs
        max_net_id = max(net_ids) if net_ids else 0
        unmatched = sorted(set(ml_ids) - set(id_map))
        for offset, ml_id in enumerate(unmatched, start=1):
            id_map[ml_id] = max_net_id + offset

        return id_map

    def save_user_history(self, df, filename, min_interactions=5, max_history=100):
        """Build per-user movie sequences, save them to Parquet and return index sequences.

        Users with fewer than ``min_interactions`` rows are dropped. The
        remaining rows are ordered by ``user_id`` then ``date`` and each user
        keeps their last ``max_history`` movies. The per-user lists are written
        to ``filename`` with its extension replaced by ``.parquet``.

        Args:
            df: Frame with ``user_id``, ``movie_id`` and ``date`` columns.
            filename: Output name; its extension is swapped for ``.parquet``.
            min_interactions: Minimum number of rows a user needs to be kept.
            max_history: Maximum number of most recent movies kept per user.

        Returns:
            list of lists: one sequence per user (ascending ``user_id``), with
            movie ids replaced by 1-based indices into the sorted set of movie
            ids that occur in the kept sequences.
        """
        print(f"Generating sequences for {filename}...")

        # Rows without a user, movie or date cannot be placed in a sequence.
        df = df.dropna(subset=['user_id', 'movie_id', 'date'])

        # Filter users with enough interactions
        counts = df['user_id'].value_counts()
        active_users = counts[counts >= min_interactions].index

        # Sort by user and date (sort_values returns a new frame; caller's df is untouched)
        ordered = df[df['user_id'].isin(active_users)].sort_values(['user_id', 'date'])

        # Keep the last `max_history` rows per user, then collect them into lists
        recent = ordered.groupby('user_id').tail(max_history)
        history = recent.groupby('user_id')['movie_id'].agg(list).reset_index(name='list_movies')

        # Only swap the final extension; a plain str.replace would also rewrite
        # '.csv' occurring elsewhere in the path.
        parquet_path = os.path.splitext(filename)[0] + '.parquet'
        history.to_parquet(parquet_path, index=False)

        # Create integer-mapped sequences for modeling (0 is left free, e.g. for padding)
        unique_movies = sorted(recent['movie_id'].unique().tolist())
        movie_to_idx = {m: i + 1 for i, m in enumerate(unique_movies)}

        return [[movie_to_idx[m] for m in seq] for seq in history['list_movies']]

In [6]:
# Stage the Netflix Prize files into a working directory under /content.
# NOTE(review): `path` looks like a Kaggle input mount — confirm the dataset is attached before running.
path = '/kaggle/input/datasets/organizations/netflix-inc/netflix-prize-data'
# -p creates missing parent directories and does not fail if the directory already exists.
!mkdir -p /content/data/netflix_prize
# IPython substitutes {path} with the Python variable above; the quotes guard against spaces in it.
!cp -r "{path}"/* /content/data/netflix_prize/
# List the destination as a quick check that the copy produced the expected files.
!ls /content/data/netflix_prize

combined_data_1.txt  combined_data_3.txt  movie_titles.csv  qualifying.txt
combined_data_2.txt  combined_data_4.txt  probe.txt	    README


In [7]:
processor = MovieDataProcessor()

# 1. Setup Data
processor.download_movielens()
# Note: Ensure Kaggle dataset is available at this path
netflix_df = processor.process_netflix_raw('/content/data/netflix_prize/combined_data_*.txt', 'temp_netflix.csv')

# movie_titles.csv is unquoted "movie_id,year,title", and some titles contain commas.
# Splitting on the first two commas only keeps those titles intact; a plain
# read_csv with on_bad_lines='skip' silently drops every such movie.
with open('/content/data/netflix_prize/movie_titles.csv', encoding='ISO-8859-1') as titles_file:
    title_rows = [line.rstrip('\r\n').split(',', 2) for line in titles_file if line.strip()]
title_rows = [row for row in title_rows if len(row) == 3]
netflix_titles = pd.DataFrame(title_rows, columns=['movie_id', 'year', 'title'])
netflix_titles['movie_id'] = netflix_titles['movie_id'].astype('int64')
# Non-numeric years (e.g. a literal NULL) become NaN, which the matcher treats as "unknown".
netflix_titles['year'] = pd.to_numeric(netflix_titles['year'], errors='coerce')

# 2. Map IDs
ml_movies = pd.read_csv(os.path.join(processor.ml_path, 'movies.csv'))
id_map = processor.get_unified_id_map(netflix_titles, ml_movies)

# 3. Process MovieLens Ratings
ml_ratings_raw = pd.read_csv(os.path.join(processor.ml_path, 'ratings.csv'))
# Offset User IDs to avoid collision with Netflix users
user_id_offset = int(netflix_df['user_id'].max()) if len(netflix_df) else 0
rated_at = pd.to_datetime(ml_ratings_raw['timestamp'], unit='s')
ml_ratings = pd.DataFrame({
    'user_id': ml_ratings_raw['userId'] + user_id_offset,
    'movie_id': ml_ratings_raw['movieId'].map(id_map),
    # Integer YYYYMMDD, built arithmetically to avoid a string round-trip over every rating.
    'date': (rated_at.dt.year * 10000 + rated_at.dt.month * 100 + rated_at.dt.day).astype('int64'),
    'rating': ml_ratings_raw['rating'],
})
del ml_ratings_raw, rated_at  # release the raw frame; only the unified columns are needed below

# Ratings whose movieId is absent from id_map would carry NaN movie ids into the sequences.
unmapped = ml_ratings['movie_id'].isna()
if unmapped.any():
    print(f"Dropping {int(unmapped.sum())} MovieLens ratings with no unified movie id.")
    ml_ratings = ml_ratings[~unmapped]
ml_ratings = ml_ratings.astype({'movie_id': 'int64'}).reset_index(drop=True)

# 4. Final Merging
combined_df = pd.concat([netflix_df[['user_id', 'movie_id', 'date', 'rating']], ml_ratings], ignore_index=True)

# 5. Output Sequences
netflix_sequences = processor.save_user_history(netflix_df, 'user_history_netflix.csv')
ml_sequences = processor.save_user_history(ml_ratings, 'user_history_ml.csv')
combined_sequences = processor.save_user_history(combined_df, 'user_history.csv')

print("Pipeline Complete. Sequences generated.")

Downloading ml-32m...
MovieLens extracted.
Converting 4 Netflix files...
Aligning Movie IDs via Semantic Search...
Generating sequences for user_history_netflix.csv...
Generating sequences for user_history_ml.csv...
Generating sequences for user_history.csv...
Pipeline Complete. Sequences generated.
