## Load datasets and packages

In [25]:
import pandas as pd
import numpy as np
import polars as pl
from pathlib import Path
from typing import List, Dict, Tuple
from tqdm import tqdm
from mysql.connector import Error
import mysql.connector
import time
import structlog
import faiss
import pickle

In [26]:
logger = structlog.get_logger()

## Connect to MySQL server

In [27]:
def connect_to_mysql() -> mysql.connector:
    """
    Creates and returns a connection to the MySQL database.
    """
    try:
        connection = mysql.connector.connect(
            host='pdingproddbreplica.chioko0q2r4e.us-west-1.rds.amazonaws.com',          # Your MySQL server address (localhost for local)
            database='pding_prod_db',  # Your database name
            user='readonly',      # Your MySQL username
            password='welcomePding'   # Your MySQL password
            # Uncomment below if needed:
            # port=3306,               # MySQL default port is 3306
            # auth_plugin='mysql_native_password'  # If using newer MySQL versions
        )
        
        if connection.is_connected():
            logger.info("Connected to MySQL database")
            return connection
            
    except Error as e:
        logger.info(f"Error connecting to MySQL: {e}")
        return None
    
def close_connection(connection: mysql.connector):
    """
    Close the database connection.
    """
    if connection and connection.is_connected():
        connection.close()
        logger.info("MySQL connection closed")


def execute_multiple_queries_with_timing(connection, queries):
    """
    Execute multiple SQL queries sequentially, return results as a list of pandas DataFrames,
    and track execution time for each query and the total process.
    """
    dataframes = []
    total_start_time = time.perf_counter()
    try:
        cursor = connection.cursor(dictionary=True)
        for idx, query in enumerate(tqdm(queries, desc="Executing queries")):
            query_start_time = time.perf_counter()
            try:
                cursor.execute(query)
                records = cursor.fetchall()
                df = pd.DataFrame(records)
                dataframes.append(df)
                query_end_time = time.perf_counter()
                logger.info(f"Query {idx + 1} executed in {query_end_time - query_start_time:.4f} seconds.")
            except Error as e:
                logger.info(f"Error executing query {idx + 1}: {e}")
                dataframes.append(None)
        cursor.close()
    except Error as e:
        logger.info(f"Error setting up cursor: {e}")
    total_end_time = time.perf_counter()
    logger.info(f"Total execution time: {total_end_time - total_start_time:.4f} seconds.")
    return dataframes
    

In [21]:
connection = connect_to_mysql()
queries = [
    "select * from videos",
    "select * from video_rating",
    "select * from video_purchase",
    "select * from user_followings"
]
dfs = execute_multiple_queries_with_timing(connection, queries)
videos_df = dfs[0]
video_rating_df = dfs[1]
video_purchase_df = dfs[2]
user_followings_df = dfs[3]

Connected to MySQL database


Executing queries:  25%|██▌       | 1/4 [00:05<00:15,  5.12s/it]

Query 1 executed in 5.1205 seconds.


Executing queries:  50%|█████     | 2/4 [00:06<00:05,  2.75s/it]

Query 2 executed in 1.0814 seconds.


Executing queries:  75%|███████▌  | 3/4 [00:28<00:11, 11.65s/it]

Query 3 executed in 22.2357 seconds.


Executing queries: 100%|██████████| 4/4 [02:16<00:00, 34.17s/it]

Query 4 executed in 108.2194 seconds.
Total execution time: 136.8766 seconds.





In [22]:
print(f"Shape of videos_df: {videos_df.shape}")
print(f"Shape of video_rating_df: {video_rating_df.shape}")
print(f"Shape of video_purchase_df: {video_purchase_df.shape}")
print(f"Shape of user_followings_df: {user_followings_df.shape}")

Shape of videos_df: (12214, 24)
Shape of video_rating_df: (11686, 5)
Shape of video_purchase_df: (202279, 13)
Shape of user_followings_df: (1696894, 8)


In [23]:
videos_df = videos_df[videos_df['is_deleted'] == 0]
user_followings_df = user_followings_df[user_followings_df['is_deleted'] == 0]
video_rating_df['last_updated_date'] = pd.to_datetime(video_rating_df['updated_seconds'], unit='s')

# Apply efficient renaming to avoid duplicate columns
videos_df = videos_df.rename(columns={'duration': 'video_duration'})
video_purchase_df = video_purchase_df.rename(columns={
    'last_update_date': 'last_purchased_date',
    'duration': 'purchase_tier'
})

In [26]:
videos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8975 entries, 0 to 12213
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id                   8975 non-null   object        
 1   description          8975 non-null   object        
 2   video_duration       8975 non-null   int64         
 3   is_adult             8975 non-null   int64         
 4   is_paid              8975 non-null   int64         
 5   is_visible           8975 non-null   int64         
 6   status               8975 non-null   object        
 7   title                8975 non-null   object        
 8   trees                2723 non-null   object        
 9   updated_time_stamp   8975 non-null   int64         
 10  uploaded_time_stamp  8975 non-null   int64         
 11  user_id              8975 non-null   object        
 12  video_id             8975 non-null   object        
 13  rating_visible       8975 non-null   

In [27]:
import pandas as pd
import gc
import time
import os
import psutil  # You may need to pip install this

def process_with_tracking(vp_df, v_df, vr_df, uf_df, chunk_size=5000, output_file="outputs.csv"):
    # Set up tracking variables
    total_rows = len(vp_df)
    rows_processed = 0
    chunks_processed = 0
    start_time = time.time()
    results_saved = 0
    
    # Create or clear output file
    with open(output_file, 'w') as f:
        f.write("")  # Just clear the file if it exists
    
    logger.info(f"Total rows to process: {total_rows}")
    logger.info(f"Memory usage at start: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024:.2f} MB")
    
    # Process vp dataframe in chunks
    for start_idx in range(0, len(vp_df), chunk_size):
        try:
            chunk_start_time = time.time()
            # Get chunk of video_purchase data
            end_idx = min(start_idx + chunk_size, len(vp_df))
            vp_chunk = vp_df.iloc[start_idx:end_idx].copy()
            
            # First join
            temp_df = pd.merge(
                vp_chunk,
                v_df[['video_id', 'video_duration', 'rating_score', 'title', 'description']],
                on='video_id',
                how='inner'
            )
            
            # Check if any rows remain after first join
            if len(temp_df) == 0:
                logger.info(f"Chunk {chunks_processed+1}: No matching rows after first join. Skipping.")
                rows_processed += len(vp_chunk)
                chunks_processed += 1
                continue
                
            # Second join
            temp_df = pd.merge(
                temp_df,
                vr_df[['rating', 'video_id', 'user_id', 'last_updated_date']],
                on=['user_id', 'video_id'],
                how='inner'
            )
            
            # Check if any rows remain after second join
            if len(temp_df) == 0:
                logger.info(f"Chunk {chunks_processed+1}: No matching rows after second join. Skipping.")
                rows_processed += len(vp_chunk)
                chunks_processed += 1
                continue
            
            # Third join
            temp_df = pd.merge(
                temp_df,
                uf_df[['following', 'pd_category', 'pd_language']],
                left_on='video_owner_user_id',
                right_on='following',
                how='inner'
            )
            
            # Remove duplicates and sort
            temp_df = temp_df.drop_duplicates()
            
            # Append to results file
            temp_df.to_csv(output_file, mode='a', header=(results_saved==0), index=False)
            results_saved += len(temp_df)
            
            # Update progress tracking
            rows_processed += len(vp_chunk)
            chunks_processed += 1
            
            # Print progress
            elapsed = time.time() - start_time
            chunk_time = time.time() - chunk_start_time
            memory_usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024  # MB
            
            logger.info(f"Chunk {chunks_processed}: Processed {rows_processed}/{total_rows} rows ({rows_processed/total_rows*100:.1f}%)")
            logger.info(f"  - Rows in this chunk result: {len(temp_df)}")
            logger.info(f"  - Chunk processing time: {chunk_time:.2f}s")
            logger.info(f"  - Total elapsed time: {elapsed/60:.1f} minutes")
            logger.info(f"  - Memory usage: {memory_usage:.2f} MB")
            
            # Force garbage collection
            del temp_df
            del vp_chunk
            gc.collect()
            
        except Exception as e:
            logger.info(f"Error processing chunk {chunks_processed+1}: {str(e)}")
            # Save progress info in case of error
            with open("processing_error_log.txt", "a") as f:
                f.write(f"Error at chunk {chunks_processed+1}, rows {start_idx}-{end_idx}: {str(e)}\n")
    
    logger.info(f"\nProcessing complete:")
    logger.info(f"  - Total rows processed: {rows_processed}")
    logger.info(f"  - Total rows in results: {results_saved}")
    logger.info(f"  - Total time: {(time.time() - start_time)/60:.1f} minutes")
    logger.info(f"  - Results saved to: {output_file}")
    
    # Read back the final sorted results if needed
    # Note: This might be memory-intensive if the result is very large
    logger.info("Reading and sorting final results...")
    try:
        # Read in chunks and sort
        sorted_output_file = "sorted_" + output_file
        # Read first chunk to get header
        first_chunk = pd.read_csv(output_file, nrows=1)
        header = list(first_chunk.columns)
        
        # Write sorted chunks
        with open(sorted_output_file, 'w') as f:
            # Write header
            f.write(','.join(header) + '\n')
        
        # Process in chunks for sorting
        for chunk in pd.read_csv(output_file, chunksize=100000):
            chunk_sorted = chunk.sort_values(by='last_purchased_date', ascending=False)
            chunk_sorted.to_csv(sorted_output_file, mode='a', header=False, index=False)
        
        logger.info(f"Sorted results saved to: {sorted_output_file}")
    except Exception as e:
        logger.info(f"Error during final sorting: {str(e)}")
        logger.info("You can still access the unsorted results in the original output file.")
    
    return results_saved


In [29]:
process_with_tracking(
    v_df=videos_df,
    vp_df=video_purchase_df,
    uf_df=user_followings_df,
    vr_df=video_rating_df
)

Total rows to process: 202279
Memory usage at start: 3237.32 MB
Chunk 1: Processed 5000/202279 rows (2.5%)
  - Rows in this chunk result: 232
  - Chunk processing time: 13.94s
  - Total elapsed time: 0.2 minutes
  - Memory usage: 3180.89 MB
Chunk 2: Processed 10000/202279 rows (4.9%)
  - Rows in this chunk result: 222
  - Chunk processing time: 12.26s
  - Total elapsed time: 0.4 minutes
  - Memory usage: 3180.89 MB
Chunk 3: Processed 15000/202279 rows (7.4%)
  - Rows in this chunk result: 239
  - Chunk processing time: 14.25s
  - Total elapsed time: 0.7 minutes
  - Memory usage: 3180.89 MB
Chunk 4: Processed 20000/202279 rows (9.9%)
  - Rows in this chunk result: 260
  - Chunk processing time: 16.27s
  - Total elapsed time: 0.9 minutes
  - Memory usage: 3180.89 MB
Chunk 5: Processed 25000/202279 rows (12.4%)
  - Rows in this chunk result: 244
  - Chunk processing time: 14.40s
  - Total elapsed time: 1.2 minutes
  - Memory usage: 3180.90 MB
Chunk 6: Processed 30000/202279 rows (14.8%)
 

9868

# Load master data

In [16]:
master_df = pd.read_csv("/home/cyrilng/pding-recsys/sorted_outputs.csv")

In [17]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9868 entries, 0 to 9867
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               9868 non-null   object 
 1   last_purchased_date              9868 non-null   object 
 2   trees_consumed                   9868 non-null   float64
 3   user_id                          9868 non-null   object 
 4   video_id                         9868 non-null   object 
 5   video_owner_user_id              9868 non-null   object 
 6   is_replacement_of_deleted_video  9730 non-null   float64
 7   purchase_tier                    9868 non-null   object 
 8   expiry_date                      9868 non-null   object 
 9   is_refunded                      9868 non-null   int64  
 10  drm_fee                          413 non-null    float64
 11  discount_percentage_applied      0 non-null      float64
 12  package_purchase_id 

In [18]:
# drop column and rename
master_df.drop(columns=['drm_fee', 'discount_percentage_applied', 'package_purchase_id', 
                        'is_replacement_of_deleted_video', 'following','is_refunded', 
                        'expiry_date', 'id', 'video_owner_user_id'], inplace=True)
master_df.rename(columns={'rating_score': 'wilson_score'}, inplace=True)
master_df.drop_duplicates(inplace=True)

In [19]:
logger.info(f"Number of unique users: {master_df['user_id'].nunique()}")
logger.info(f"Number of unique videos: {master_df['video_id'].nunique()}")

Number of unique users: 4086
Number of unique videos: 2447


In [20]:
master_df = master_df.dropna(subset=["pd_category", "pd_language"])
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9810 entries, 0 to 9867
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   last_purchased_date  9810 non-null   object 
 1   trees_consumed       9810 non-null   float64
 2   user_id              9810 non-null   object 
 3   video_id             9810 non-null   object 
 4   purchase_tier        9810 non-null   object 
 5   video_duration       9810 non-null   int64  
 6   wilson_score         9810 non-null   float64
 7   title                9810 non-null   object 
 8   description          9810 non-null   object 
 9   rating               9810 non-null   int64  
 10  last_updated_date    9810 non-null   object 
 11  pd_category          9810 non-null   object 
 12  pd_language          9810 non-null   object 
dtypes: float64(2), int64(2), object(9)
memory usage: 1.0+ MB


# Content-based Filtering

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from konlpy.tag import Okt  # Korean language processor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [8]:
class ContentBasedFiltering:
    """
    Content-based filtering recommendation system for items with Korean metadata.
    Specifically handles Korean text in 'title' and 'description' attributes.
    """
    
    def __init__(self):
        #  Initialize Korean text processor
        self.okt = Okt() 
        self.korean_stopwords = self._load_korean_stopwords()
        
        # Vector database
        self.index = None
        self.id_mapping = {}
        
        # Initialize transformers
        self.text_vectorizer = None
        self.numerical_scaler = MinMaxScaler()
        self.categorical_encoder = OneHotEncoder(handle_unknown='ignore')

        logger.info("Content-based Recommender initialized")
        
    def _load_korean_stopwords(self):
        """Load Korean stopwords or use a default set if file not available"""
        logger.info("Load Korean stopwords list")
        try:
            with open('korean_stopwords.txt', 'r', encoding='utf-8') as f:
                return set(f.read().splitlines())
        except FileNotFoundError:
            # Default basic Korean stopwords
            return {'이', '그', '저', '것', '수', '등', '들', '및', '에서', '으로', '를', '에', '의', '가', '은', '는', '이런', '저런', '그런'}
    
    def _tokenize_korean_text(self, text):
        """Preprocess Korean text with specialized handling"""

        if not isinstance(text, str):
            return ""
        
        # Normalize text
        text = text.lower()
        
        # Remove special characters but keep Korean, English, numbers
        text = re.sub(r'[^\wㄱ-ㅎㅏ-ㅣ가-힣 ]', ' ', text)
        
        # Tokenize Korean text and select only nouns, adjectives, verbs
        tokens = self.okt.pos(text)
        filtered_tokens = [word for word, pos in tokens if (pos in ['Noun', 'Adjective', 'Verb'] and 
                                                           len(word) > 1 and 
                                                           word not in self.korean_stopwords)]
        
        return ' '.join(filtered_tokens)

    def preprocess_text(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocess Korean text columns (title and description)"""
        logger.info(f"Preprocessing text for {len(df)} videos")
        # Create copies to avoid modifying the original dataframe
        df_processed = df.copy()
        
        # Tokenize Korean text
        df_processed['title_tokenized'] = df_processed['title'].fillna("").apply(self._tokenize_korean_text)
        df_processed['description_tokenized'] = df_processed['description'].fillna("").apply(self._tokenize_korean_text)
        
        # Combine text features
        df_processed['text_combined'] = df_processed['title_tokenized'] + " " + df_processed['description_tokenized']
        
        return df_processed
    
    def extract_text_features(self, df: pd.DataFrame) -> np.ndarray:
        """Extract TF-IDF features from preprocessed text"""
        logger.info("Extracting text features")
        if self.text_vectorizer is None:
            # Initialize and fit vectorizer if not already done
            self.text_vectorizer = TfidfVectorizer(
                min_df=2, 
                max_df=0.95, 
                max_features=5000, 
                ngram_range=(1, 2), 
                sublinear_tf=True
            )
            text_features = self.text_vectorizer.fit_transform(df['text_combined'])
        else:
            # Use pre-trained vectorizer
            text_features = self.text_vectorizer.transform(df['text_combined'])
            
        return text_features
    
    def extract_metadata_features(self, df: pd.DataFrame) -> np.ndarray:
        """Extract and encode numerical and categorical metadata features"""
        logger.info("Extracting metadata features")
        # Handle numerical features
        numerical_features = df[['tree_consumed', 'video_duration']].values
        scaled_numerical = self.numerical_scaler.fit_transform(numerical_features)
        
        # Handle categorical features
        categorical_features = df[['purchase_tier', 'pd_category']].values
        encoded_categorical = self.categorical_encoder.fit_transform(categorical_features).toarray()
        
        # Combine all metadata features
        metadata_features = np.hstack((scaled_numerical, encoded_categorical))
        
        return metadata_features
    
    def combine_features(self, text_features: np.ndarray, metadata_features: np.ndarray, 
                         text_weight: float = 0.7) -> np.ndarray:
        """Combine text and metadata features with weighting"""
        logger.info(f"Combining features with text_weight={text_weight}")
        # Normalize feature matrices
        text_norm = np.sqrt((text_features.toarray() ** 2).sum(axis=1))
        text_normalized = text_features.toarray() / text_norm[:, np.newaxis]
        
        metadata_norm = np.sqrt((metadata_features ** 2).sum(axis=1))
        metadata_normalized = metadata_features / metadata_norm[:, np.newaxis]
        
        # Combine with weights
        combined_features = (text_weight * text_normalized + 
                             (1 - text_weight) * metadata_normalized)
        
        return combined_features
    
    def build_faiss_index(self, feature_matrix: np.ndarray) -> None:
        """Build FAISS index for fast similarity search"""
        logger.info(f"Building FAISS index with {feature_matrix.shape[0]} videos")
        
        # Convert to float32 as required by FAISS
        features_float32 = feature_matrix.astype(np.float32)
        
        # Create and train index
        dimension = features_float32.shape[1]
        self.index = faiss.IndexFlatL2(dimension)  # L2 distance
        
        # Add vectors to the index
        self.index.add(features_float32)
        
        logger.info(f"FAISS index built with {self.index.ntotal} vectors")

    def fit(self, video_data: pd.DataFrame) -> None:
        """Fit the recommendation model on the provided video data"""
        logger.info(f"Fitting model on {len(video_data)} videos")
        
        # Store original video IDs for mapping
        original_indices = video_data.index.tolist()
        
        # Preprocess text
        processed_df = self.preprocess_text(video_data)
        
        # Extract features
        text_features = self.extract_text_features(processed_df)
        metadata_features = self.extract_metadata_features(processed_df)
        
        # Combine features
        combined_features = self.combine_features(text_features, metadata_features)
        
        # Build search index
        self.build_faiss_index(combined_features)
        
        # Create mapping from FAISS index to original video IDs
        self.id_mapping = {i: original_indices[i] for i in range(len(original_indices))}
        
        # Save models
        self.save_models()
        
        logger.info("Model fitting completed")

    def find_similar_videos(self, video_id: int, video_data: pd.DataFrame, top_n: int = 10) -> List[Tuple[int, float]]:
        """Find videos similar to the given video ID"""
        # # Cache key for this query
        # cache_key = f"sim_videos:{video_id}:{top_n}"
        
        # # Try to get from cache first
        # if self.use_cache:
        #     cached_result = self.cache.get(cache_key)
        #     if cached_result:
        #         CACHE_HIT_COUNTER.inc()
        #         logger.info(f"Cache hit for video_id={video_id}")
        #         return pickle.loads(cached_result)
        
        logger.info(f"Finding {top_n} videos similar to video_id={video_id}")
        
        try:
            # Get the index of the video in our processed data
            video_idx = list(self.id_mapping.values()).index(video_id)
            
            # Get the feature vector for this video
            query_vector = np.array([self.index.reconstruct(video_idx)]).astype(np.float32)
            
            # Search for similar videos
            k = top_n + 1  # +1 because the video itself will be included
            distances, indices = self.index.search(query_vector, k)
            
            # Convert to list of (video_id, similarity_score) tuples
            # Skip the first result (which is the query video itself)
            similar_videos = []
            for i, idx in enumerate(indices[0]):
                if self.id_mapping[idx] != video_id:  # Skip the query video
                    # Convert distance to similarity score (1 / (1 + distance))
                    similarity = 1 / (1 + distances[0][i])
                    similar_videos.append((self.id_mapping[idx], float(similarity)))
                
                if len(similar_videos) == top_n:
                    break
            
            # Cache the result
            # if self.use_cache:
            #     self.cache.setex(cache_key, self.cache_ttl, pickle.dumps(similar_videos))
            
            return similar_videos
        
        except Exception as e:
            logger.error(f"Error finding similar videos: {str(e)}")
            return []
        
    def save_models(self) -> None:
        """Save trained models and preprocessors to disk"""
        logger.info(f"Saving models to {self.model_dir}")
        
        # Save text vectorizer
        with open(os.path.join(self.model_dir, "text_vectorizer.pkl"), "wb") as f:
            pickle.dump(self.text_vectorizer, f)
        
        # Save numerical scaler
        with open(os.path.join(self.model_dir, "numerical_scaler.pkl"), "wb") as f:
            pickle.dump(self.numerical_scaler, f)
        
        # Save categorical encoder
        with open(os.path.join(self.model_dir, "categorical_encoder.pkl"), "wb") as f:
            pickle.dump(self.categorical_encoder, f)
        
        # Save FAISS index
        faiss.write_index(self.index, os.path.join(self.model_dir, "faiss_index.bin"))
        
        # Save ID mapping
        with open(os.path.join(self.model_dir, "id_mapping.pkl"), "wb") as f:
            pickle.dump(self.id_mapping, f)
            
        logger.info("Models saved successfully")
    
    def load_models(self) -> None:
        """Load trained models and preprocessors from disk"""
        logger.info(f"Loading models from {self.model_dir}")
        
        try:
            # Load text vectorizer
            with open(os.path.join(self.model_dir, "text_vectorizer.pkl"), "rb") as f:
                self.text_vectorizer = pickle.load(f)
            
            # Load numerical scaler
            with open(os.path.join(self.model_dir, "numerical_scaler.pkl"), "rb") as f:
                self.numerical_scaler = pickle.load(f)
            
            # Load categorical encoder
            with open(os.path.join(self.model_dir, "categorical_encoder.pkl"), "rb") as f:
                self.categorical_encoder = pickle.load(f)
            
            # Load FAISS index
            self.index = faiss.read_index(os.path.join(self.model_dir, "faiss_index.bin"))
            
            # Load ID mapping
            with open(os.path.join(self.model_dir, "id_mapping.pkl"), "rb") as f:
                self.id_mapping = pickle.load(f)
                
            logger.info("Models loaded successfully")
            return True
        
        except Exception as e:
            logger.error(f"Error loading models: {str(e)}")
            return False
    

# Collaborative Based Filtering

### Item-to-Item 

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import time
import heapq
from collections import defaultdict
from typing import List, Tuple
import math
from scipy import stats

class ItemBasedCF:
    """
    Item-based Collaborative Filtering recommendation system
    specifically designed for video recommendations with user ratings.
    """
    
    def __init__(self, top_n_similar=10):
        """
        Initialize the recommender system.
        
        Parameters:
        -----------
        top_n_similar : int, default=10
            Number of most similar items to store for each item
        """
        self.top_n_similar = top_n_similar
        self.user_item_matrix = None
        self.item_similarity_matrix = None
        self.user_mapping = None
        self.item_mapping = None
        self.reverse_user_mapping = None
        self.reverse_item_mapping = None
        
    def fit(self, ratings_df: pd.DataFrame):
        """
        Build the item-based collaborative filtering model.
        
        Parameters:
        -----------
        ratings_df : pandas.DataFrame
            DataFrame containing user_id, video_id, and rating columns
        """
        print("Starting model fitting...")
        start_time = time.time()
        
        # Create mappings between original IDs and matrix indices
        self._create_mappings(ratings_df)
        
        # Build the user-item matrix
        self._build_user_item_matrix(ratings_df)
        
        # Calculate item similarity matrix
        self._build_item_similarity_matrix()
        
        print(f"Model fitting completed in {time.time() - start_time:.2f} seconds")
        
    def _create_mappings(self, ratings_df: pd.DataFrame):
        """Create mappings between original IDs and matrix indices."""
        print("Creating user and item mappings...")
        
        # Get unique users and items
        unique_users = ratings_df['user_id'].unique()
        unique_items = ratings_df['video_id'].unique()
        
        # Create mappings
        self.user_mapping = {user: idx for idx, user in enumerate(unique_users)}
        self.item_mapping = {item: idx for idx, item in enumerate(unique_items)}
        
        # Create reverse mappings (index to original ID)
        self.reverse_user_mapping = {idx: user for user, idx in self.user_mapping.items()}
        self.reverse_item_mapping = {idx: item for item, idx in self.item_mapping.items()}
        
        print(f"Found {len(unique_users)} unique users and {len(unique_items)} unique videos")
        
    def _build_user_item_matrix(self, ratings_df: pd.DataFrame):
        """Build the user-item matrix from the ratings DataFrame."""
        print("Building user-item matrix...")
        
        # Convert IDs to matrix indices
        user_indices = [self.user_mapping[user] for user in ratings_df['user_id']]
        item_indices = [self.item_mapping[item] for item in ratings_df['video_id']]
        
        # Create sparse matrix
        n_users = len(self.user_mapping)
        n_items = len(self.item_mapping)
        
        # Convert ratings to float values
        ratings = ratings_df['rating'].values.astype(float)
        
        # Create the sparse matrix
        self.user_item_matrix = csr_matrix((ratings, (user_indices, item_indices)), 
                                          shape=(n_users, n_items))
        
        print(f"Created user-item matrix of shape {self.user_item_matrix.shape}")
        
    def _build_item_similarity_matrix(self):
        """Calculate the item-item similarity matrix using cosine similarity."""
        print("Building item similarity matrix...")
        start_time = time.time()
        
        # Convert to item-user matrix (transpose)
        item_user_matrix = self.user_item_matrix.T.tocsr()
        
        # Calculate similarity matrix
        n_items = item_user_matrix.shape[0]
        self.item_similarity_matrix = {}
        
        # For each item, calculate similarity with all other items
        for i in range(n_items):
            # Print progress every 1000 items
            if i % 1000 == 0 and i > 0:
                print(f"Processed {i}/{n_items} items... ({time.time() - start_time:.2f} sec)")
            
            # Get the item vector
            item_vec = item_user_matrix[i].toarray().flatten()
            
            # Calculate similarities with all items at once
            similarities = cosine_similarity(
                item_vec.reshape(1, -1), 
                item_user_matrix.toarray()
            ).flatten()
            
            # Keep only top N similar items (excluding self)
            # First, set self-similarity to -1 to exclude it
            similarities[i] = -1
            
            # Get indices of top N items
            top_similar_indices = heapq.nlargest(self.top_n_similar, 
                                                range(len(similarities)), 
                                                key=similarities.__getitem__)
            
            # Store only top N similarities per video
            self.item_similarity_matrix[i] = {
                sim_idx: similarities[sim_idx] 
                for sim_idx in top_similar_indices 
                if similarities[sim_idx] > 0
            }
        
        print(f"Item similarity matrix built in {time.time() - start_time:.2f} seconds")

    def _wilson_score(self, pos, n, confidence=0.95):
        """
        Calculate the Wilson score interval for a binomial proportion.
        
        Parameters:
        -----------
        pos : int or float
            Number of positive ratings or sum of ratings
        n : int
            Total number of ratings
        confidence : float, default=0.95
            Confidence level
            
        Returns:
        --------
        float
            Lower bound of Wilson score interval
        """
        if n == 0:
            return 0
        
        z = stats.norm.ppf(1 - (1 - confidence) / 2)
        phat = pos / n
        
        # Wilson score calculation
        score = (phat + z*z/(2*n) - z * math.sqrt((phat*(1-phat) + z*z/(4*n))/n)) / (1 + z*z/n)
        
        return score

    def recommend_popular_items_wilson(self, n_recommendations=10, confidence=0.95, normalize_ratings=True):
        """
        Recommend most popular items based on Wilson score.
        Used as fallback for cold-start users.
        
        Parameters:
        -----------
        n_recommendations : int, default=10
            Number of recommendations to generate
        confidence : float, default=0.95
            Confidence level for Wilson score
        normalize_ratings : bool, default=True
            Whether to normalize ratings to 0-1 range
                
        Returns:
        --------
        list of tuples
            List of (video_id, wilson_score) tuples
        """
        # Convert user-item matrix to array for easier processing
        matrix = self.user_item_matrix.toarray()
        
        # Prepare to store results
        wilson_scores = []
        
        # Process each item
        for item_idx in range(matrix.shape[1]):
            # Get ratings for this item
            item_ratings = matrix[:, item_idx]
            
            # Skip items with no ratings
            valid_ratings = item_ratings[item_ratings > 0]
            if len(valid_ratings) == 0:
                wilson_scores.append(0)
                continue
            
            if normalize_ratings:
                # Normalize ratings to 0-1 range
                # Assuming ratings are 1-5
                norm_ratings = (valid_ratings - 1) / 4
                pos = np.sum(norm_ratings)
            else:
                # Use sum of ratings as "positive" outcome
                pos = np.sum(valid_ratings)
            
            # Total number of ratings
            n = len(valid_ratings)
            
            # Calculate Wilson score
            score = self._wilson_score(pos, n, confidence)
            wilson_scores.append(score)
        
        # Get indices of top items by Wilson score
        top_indices = heapq.nlargest(n_recommendations, 
                                    range(len(wilson_scores)), 
                                    key=lambda x: wilson_scores[x])
        
        # Convert to original video IDs
        recommendations = [
            (self.reverse_item_mapping[idx], wilson_scores[idx])
            for idx in top_indices
        ]
        
        return recommendations
    
    def recommend_for_user(self, user_id: int, n_recommendations=10, exclude_watched=True) -> List[Tuple]:
        """
        Generate personalized recommendations for a user.
        
        Parameters:
        -----------
        user_id : int or str
            Original user ID
        n_recommendations : int, default=10
            Number of recommendations to generate
        exclude_watched : bool, default=True
            Whether to exclude videos the user has already watched
            
        Returns:
        --------
        list of tuples
            List of (video_id, predicted_rating) tuples
        """
        # Check if user exists in training data
        if user_id not in self.user_mapping:
            print(f"User {user_id} not found in training data. Using popular items instead.")
            return self.recommend_popular_items_wilson(n_recommendations)
        
        # Get user index
        user_idx = self.user_mapping[user_id]
        
        # Get user's ratings
        user_ratings = self.user_item_matrix[user_idx].toarray().flatten()
        watched_items = np.where(user_ratings > 0)[0]
        
        if len(watched_items) == 0:
            print(f"User {user_id} has no ratings. Using popular items instead.")
            return self.recommend_popular_items(n_recommendations)
        
        # Initialize recommendation scores
        scores = defaultdict(float)
        
        # For each rated item
        for item_idx in watched_items:
            # Get user's rating for this item
            item_rating = user_ratings[item_idx]
            
            # Skip low ratings (optional - you might want to consider negative feedback)
            if item_rating < 3:
                continue
                
            # Get similar items
            if item_idx in self.item_similarity_matrix:
                # For each similar item
                for similar_item, similarity in self.item_similarity_matrix[item_idx].items():
                    # Skip if user has already watched this item and we want to exclude watched
                    if exclude_watched and user_ratings[similar_item] > 0:
                        continue
                    
                    # Weight by both rating and similarity
                    scores[similar_item] += similarity * item_rating
        
        # If we have no recommendations after filtering
        if len(scores) == 0:
            return self.recommend_popular_items(n_recommendations)
        
        # Sort by score and take top N
        top_item_indices = heapq.nlargest(n_recommendations, 
                                         scores.keys(), 
                                         key=scores.get)
        
        # Convert back to original video IDs
        recommendations = [
            (self.reverse_item_mapping[item_idx], scores[item_idx])
            for item_idx in top_item_indices
        ]
        
        return recommendations
    
    def recommend_similar_items(self, video_id, n_recommendations=10) -> List:
        """
        Find videos similar to a given video.
        
        Parameters:
        -----------
        video_id : int or str
            Original video ID
        n_recommendations : int, default=10
            Number of similar videos to recommend
            
        Returns:
        --------
        list of tuples
            List of (video_id, similarity_score) tuples
        """
        # Check if item exists in training data
        if video_id not in self.item_mapping:
            print(f"Video {video_id} not found in training data.")
            return []
        
        # Get item index
        item_idx = self.item_mapping[video_id]
        
        # If no similarity data for this item
        if item_idx not in self.item_similarity_matrix:
            print(f"No similarity data for video {video_id}.")
            return []
        
        # Get similar items
        similar_items = self.item_similarity_matrix[item_idx]
        
        # Sort by similarity and take top N
        top_similar = heapq.nlargest(n_recommendations, 
                                    similar_items.keys(), 
                                    key=similar_items.get)
        
        # Convert back to original video IDs
        recommendations = [
            (self.reverse_item_mapping[sim_idx], similar_items[sim_idx])
            for sim_idx in top_similar
        ]
        
        return recommendations
    
    def recommend_popular_items(self, n_recommendations=10) -> List:
        """
        Recommend most popular items based on average rating.
        Used as fallback for cold-start users.
        
        Parameters:
        -----------
        n_recommendations : int, default=10
            Number of recommendations to generate
            
        Returns:
        --------
        list of tuples
            List of (video_id, avg_rating) tuples
        """
        # Calculate average rating per item
        item_means = self.user_item_matrix.mean(axis=0).A1
        
        # Get indices of top rated items
        top_indices = heapq.nlargest(n_recommendations, 
                                    range(len(item_means)), 
                                    key=lambda x: item_means[x])
        
        # Convert to original video IDs
        recommendations = [
            (self.reverse_item_mapping[idx], item_means[idx])
            for idx in top_indices
        ]
        
        return recommendations
    
    def save_model(self, filepath):
        """Save the model to a file."""
        import pickle
        
        model_data = {
            'user_mapping': self.user_mapping,
            'item_mapping': self.item_mapping,
            'reverse_user_mapping': self.reverse_user_mapping,
            'reverse_item_mapping': self.reverse_item_mapping,
            'user_item_matrix': self.user_item_matrix,
            'item_similarity_matrix': self.item_similarity_matrix,
            'top_n_similar': self.top_n_similar
        }
        
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        
        print(f"Model saved to {filepath}")
    
    @classmethod
    def load_model(cls, filepath):
        """Load a saved model from a file."""
        import pickle
        
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
        
        # Create new instance
        model = cls(top_n_similar=model_data['top_n_similar'])
        
        # Restore attributes
        model.user_mapping = model_data['user_mapping']
        model.item_mapping = model_data['item_mapping']
        model.reverse_user_mapping = model_data['reverse_user_mapping']
        model.reverse_item_mapping = model_data['reverse_item_mapping']
        model.user_item_matrix = model_data['user_item_matrix']
        model.item_similarity_matrix = model_data['item_similarity_matrix']
        
        return model

# Example for loading data from CSV file and full pipeline
def full_pipeline_example():
    """
    Example showing how to use the ItemBasedCF class with a CSV file.
    """
    # Load data
    df = video_rating_df
    
    # Data exploration and preprocessing
    print(f"Dataset shape: {df.shape}")
    print(f"Number of unique users: {df['user_id'].nunique()}")
    print(f"Number of unique videos: {df['video_id'].nunique()}")
    print(f"Rating distribution:\n{df['rating'].value_counts(normalize=True)}")
    
    # Check for missing values
    if df.isnull().sum().sum() > 0:
        print("Warning: Dataset contains missing values")
        df = df.dropna()
    
    # Split into train (80%) and test (20%)
    from sklearn.model_selection import train_test_split
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    
    # Train the model
    model = ItemBasedCF(top_n_similar=20)
    model.fit(train_df)
    
    # Save the model
    model.save_model('item_cf_model.pkl')
    
    # Evaluate the model
    from sklearn.metrics import mean_squared_error
    import math
    
    # Get test user-item pairs
    test_users = test_df['user_id'].unique()
    
    # Calculate RMSE for test set
    actual_ratings = []
    predicted_ratings = []
    
    for user_id in test_users:
        # Get user's test items
        user_test_items = test_df[test_df['user_id'] == user_id]
        
        if user_id not in model.user_mapping:
            continue
        
        user_idx = model.user_mapping[user_id]
        user_ratings = model.user_item_matrix[user_idx].toarray().flatten()
        watched_items = np.where(user_ratings > 0)[0]
        
        for _, row in user_test_items.iterrows():
            if row['video_id'] not in model.item_mapping:
                continue
                
            item_idx = model.item_mapping[row['video_id']]
            actual_rating = row['rating']
            
            # Predict rating using item-based CF
            predicted_rating = 0
            total_similarity = 0
            
            for watched_item in watched_items:
                # Skip the current test item
                if watched_item == item_idx:
                    continue
                    
                # Get user's rating for this item
                rating = user_ratings[watched_item]
                
                # Get similarity between this item and test item
                if watched_item in model.item_similarity_matrix and item_idx in model.item_similarity_matrix[watched_item]:
                    similarity = model.item_similarity_matrix[watched_item][item_idx]
                    predicted_rating += similarity * rating
                    total_similarity += similarity
            
            # Normalize by total similarity
            if total_similarity > 0:
                predicted_rating /= total_similarity
                
                actual_ratings.append(actual_rating)
                predicted_ratings.append(predicted_rating)
    
    # Calculate RMSE
    rmse = math.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
    print(f"Test RMSE: {rmse}")
    
    # Generate recommendations for a sample user
    sample_user = df['user_id'].iloc[0]
    recommendations = model.recommend_for_user(sample_user, n_recommendations=5)
    print(f"Top 5 recommendations for user {sample_user}:")
    for video_id, score in recommendations:
        print(f"  Video {video_id}: Score {score:.4f}")

# Uncomment to run the full pipeline example
full_pipeline_example()

Dataset shape: (11629, 4)
Number of unique users: 4831
Number of unique videos: 2924
Rating distribution:
rating
5    0.738929
1    0.118497
4    0.072577
3    0.046952
2    0.023046
Name: proportion, dtype: float64
Starting model fitting...
Creating user and item mappings...
Found 4158 unique users and 2608 unique videos
Building user-item matrix...
Created user-item matrix of shape (4158, 2608)
Building item similarity matrix...
Processed 1000/2608 items... (152.02 sec)
Processed 2000/2608 items... (310.01 sec)
Item similarity matrix built in 411.72 seconds
Model fitting completed in 411.74 seconds
Model saved to item_cf_model.pkl
Test RMSE: 1.118815692958474
Top 5 recommendations for user R7DePNyqQDUQOT1MJhIawC2IN323:
  Video ee8e8b74-c446-485e-90ab-5285d8b6607c: Score 1.7344
  Video e3388292-edde-4a1a-9e7c-5c59973331b2: Score 1.4222
  Video b8fd416f-ba7a-4d9c-946d-18e001fbe03f: Score 1.4222
  Video 4d13a11e-2206-4a97-a01b-53b9052858d4: Score 1.4222
  Video 7274c94e-ca0c-4ea2-8704-0

In [None]:
# Generate recommendations for a sample user
sample_user = df['user_id'].iloc[0]
recommendations = model.recommend_for_user(sample_user, n_recommendations=5)
print(f"Top 5 recommendations for user {sample_user}:")
for video_id, score in recommendations:
    print(f"  Video {video_id}: Score {score:.4f}")

### User-to-Item

In [54]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt

class UserBasedCF:
    def __init__(self, n_neighbors=20, min_neighbors=1):
        """
        Initialize User-Based Collaborative Filtering model
        
        Parameters:
        -----------
        n_neighbors : int, default=20
            Number of neighbors to use for prediction
        min_neighbors : int, default=1
            Minimum number of neighbors needed to make a prediction
        """
        self.n_neighbors = n_neighbors
        self.min_neighbors = min_neighbors
        self.user_similarity_matrix = None
        self.ratings_matrix = None
        self.user_ids = None
        self.item_ids = None
        self.user_means = None
        
    def fit(self, ratings_df):
        """
        Fit the model with training data
        
        Parameters:
        -----------
        ratings_df : pandas DataFrame with columns ['user_id', 'item_id', 'rating']
            The training ratings data
        """
        # Create user-item matrix
        
        self.ratings_matrix = ratings_df.pivot_table(
            index='user_id', 
            columns='item_id', 
            values='rating'
        ).fillna(0)
      
        
        
        self.user_ids = self.ratings_matrix.index.tolist()
        self.item_ids = self.ratings_matrix.columns.tolist()
        
        # Calculate mean rating for each user for later use in prediction
        self.user_means = self.ratings_matrix.mean(axis=1)
        
        # Normalize ratings by subtracting user means
        normalized_ratings = self.ratings_matrix.subtract(self.user_means, axis=0)
        
        # Calculate user similarity matrix using cosine similarity
        self.user_similarity_matrix = cosine_similarity(normalized_ratings)
        self.user_similarity_matrix = pd.DataFrame(
            self.user_similarity_matrix,
            index=self.user_ids,
            columns=self.user_ids
        )
        
    def predict(self, user_id, item_id):
        """
        Predict rating for a single user-item pair
        
        Parameters:
        -----------
        user_id : int or str
            The user ID
        item_id : int or str
            The item ID
            
        Returns:
        --------
        float : Predicted rating
        """
        if user_id not in self.user_ids:
            return self.ratings_matrix.mean().mean()  # Global mean for new users
        
        if item_id not in self.item_ids:
            return self.user_means[user_id]  # User mean for new items
        
        # Get user's row index
        user_idx = self.user_ids.index(user_id)
        
        # Get all users' similarities to target user
        similarities = self.user_similarity_matrix.iloc[user_idx].values
        
        # Get all users' ratings for the target item
        ratings = self.ratings_matrix[item_id].values
        
        # Mask when ratings are zero (unrated)
        mask = ratings != 0
        
        # Only consider similar users who have rated the item
        sims = similarities[mask]
        rs = ratings[mask]
        
        # Check if we have enough neighbors
        if len(sims) < self.min_neighbors:
            return self.user_means[user_id]
        
        # Sort by similarity and take top k neighbors
        if len(sims) > self.n_neighbors:
            idx = np.argsort(sims)[-self.n_neighbors:]
            sims = sims[idx]
            rs = rs[idx]
        
        # Normalize ratings by subtracting user means
        user_means_array = np.array([self.user_means[uid] for uid in np.array(self.user_ids)[mask]])
        if len(user_means_array) > self.n_neighbors:
            user_means_array = user_means_array[idx]
        
        rs_norm = rs - user_means_array
        
        # Calculate weighted average
        if np.sum(np.abs(sims)) > 0:
            pred = self.user_means[user_id] + np.sum(sims * rs_norm) / np.sum(np.abs(sims))
        else:
            pred = self.user_means[user_id]
        
        return pred
    
    def recommend(self, user_id, n_items=10, exclude_rated=True):
        """
        Generate recommendations for a user
        
        Parameters:
        -----------
        user_id : int or str
            The user ID
        n_items : int, default=10
            Number of items to recommend
        exclude_rated : bool, default=True
            Whether to exclude already rated items
            
        Returns:
        --------
        list : List of recommended item IDs
        """
        if user_id not in self.user_ids:
            # For new users, recommend most popular items
            item_popularity = self.ratings_matrix.sum().sort_values(ascending=False)
            return item_popularity.index[:n_items].tolist()
        
        # Get items the user has already rated
        rated_items = self.ratings_matrix.loc[user_id]
        rated_items = rated_items[rated_items > 0].index.tolist() if exclude_rated else []
        
        # Calculate predicted ratings for all unrated items
        all_items = [item for item in self.item_ids if item not in rated_items]
        predicted_ratings = {item: self.predict(user_id, item) for item in all_items}
        
        # Sort items by predicted rating
        sorted_items = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)
        
        # Return top N recommendations
        return [item for item, rating in sorted_items[:n_items]]
    
    def evaluate(self, test_df):
        """
        Evaluate the model on test data
        
        Parameters:
        -----------
        test_df : pandas DataFrame with columns ['user_id', 'item_id', 'rating']
            The test ratings data
            
        Returns:
        --------
        float : Root Mean Squared Error (RMSE)
        """
        predictions = []
        actuals = []
        
        for _, row in test_df.iterrows():
            user_id = row['user_id']
            item_id = row['item_id']
            
            if user_id in self.user_ids and item_id in self.item_ids:
                pred = self.predict(user_id, item_id)
                predictions.append(pred)
                actuals.append(row['rating'])
        
        # Calculate RMSE
        rmse = sqrt(mean_squared_error(actuals, predictions))
        return rmse

In [61]:
u2i_cf_df = cf_df_encoded[["user_idx", "item_idx", "rating"]]
u2i_cf_df = u2i_cf_df.rename(columns={"user_idx": "user_id", "item_idx":"item_id"})
u2i_cf_df

Unnamed: 0,user_id,item_id,rating
0,2594,1889,5
1,2032,1908,1
2,1979,448,5
3,71,1951,5
4,2156,485,1
...,...,...,...
9295,2342,2286,5
9296,631,2286,5
9297,1746,721,5
9298,1746,844,5


In [62]:
u2i_cf_df = cf_df_encoded[["user_idx", "item_idx", "rating"]]
u2i_cf_df.rename(columns={"user_idx": "user_id", "item_idx":"item_id"}, inplace=True)

# Split into train and test (80:20)
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(u2i_cf_df, test_size=0.2, random_state=42)

# Initialize and train model
cf_model = UserBasedCF(n_neighbors=2)
cf_model.fit(train_df)

# Make recommendations for a user
recommendations = cf_model.recommend(user_id=1, n_items=5)
print(f"Recommendations for user 1: {recommendations}")

# Evaluate model
rmse = cf_model.evaluate(test_df)
print(f"RMSE: {rmse}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  u2i_cf_df.rename(columns={"user_idx": "user_id", "item_idx":"item_id"}, inplace=True)


Recommendations for user 1: [322, 1160, 1625, 1174, 1341]
RMSE: 4.915260353628598


In [68]:
cf_df_encoded[cf_df_encoded["item_idx"].isin([322, 1160, 1625, 1174, 1341])]

Unnamed: 0,user_id,video_id,rating,user_idx,item_idx
78,DNHJJnbYyXd31WUkq0GPXZkkFqr1,844b8ae6-f5b9-474c-83fc-9df645de1941,5,835,1174
188,zNWhroyJfITeNCa9UDijTTvgks63,844b8ae6-f5b9-474c-83fc-9df645de1941,5,3847,1174
210,cpxKW5kyPihjXGPsptgecvot9SE2,844b8ae6-f5b9-474c-83fc-9df645de1941,5,2453,1174
373,yDvMmF1T7NfNKpYXsQms5HkdHFt1,844b8ae6-f5b9-474c-83fc-9df645de1941,5,3783,1174
410,Ku0YGfIHZuMtSHDmq9yCtEe0DkB3,844b8ae6-f5b9-474c-83fc-9df645de1941,5,1285,1174
459,LhavCaMxdPexiv7AEHdQQGmDU6s1,844b8ae6-f5b9-474c-83fc-9df645de1941,5,1344,1174
461,XpVUrvD6WhOEeblPxAlg6wzd5Z92,844b8ae6-f5b9-474c-83fc-9df645de1941,5,2107,1174
476,SgS7wLNOMoQhWBMx8sQksHWYGdF3,844b8ae6-f5b9-474c-83fc-9df645de1941,5,1798,1174
539,HVgi0BgIVATRrGo66Hi8symnKGy2,844b8ae6-f5b9-474c-83fc-9df645de1941,5,1094,1174
581,E7vSNnez82OlQlKEuYumBiyTQ4o2,844b8ae6-f5b9-474c-83fc-9df645de1941,5,878,1174


# Content Based Filtering

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from konlpy.tag import Okt
import re