# Content-Based Filtering Recommendation Model

This notebook builds a content-based filtering recommendation model: article titles and text are converted to TF-IDF vectors, similar articles are found by cosine similarity, and the fitted model is pickled to a .sav file.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Set up visualization style: seaborn's 'whitegrid' theme and a 12x8 default
# figure size for plots that do not pass their own figsize
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# Download NLTK resources
# 'stopwords' backs the stopwords.words('english') call made by the model class.
# NOTE(review): the code visible in this notebook tokenizes with str.split(),
# so the 'punkt' download looks unused here - confirm before removing it.
nltk.download('stopwords')
nltk.download('punkt')

## Define the Content-Based Filtering Model Class

In [None]:
class ContentFilteringModel:
    """
    Content-based article recommender built on TF-IDF vectors and cosine similarity.

    fit() turns each article's title and text into a TF-IDF vector;
    get_similar_items() ranks the other fitted articles by cosine similarity
    to a given article.
    """

    def __init__(self):
        # Fitted state; every one of these stays None until fit() is called
        self.vectorizer = None
        self.content_vectors = None
        self.article_ids = None
        self.article_titles = None
        # Helpers used by preprocess_text()
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text):
        """
        Normalize raw text before vectorization

        Lowercases the text, replaces every character that is neither a word
        character nor whitespace with a space, drops tokens found in
        self.stop_words, and stems the remaining tokens.

        Parameters:
        - text: Raw text; missing values (None/NaN) are accepted

        Returns:
        - Space-joined string of stemmed tokens ("" for missing input)
        """
        if pd.isna(text):
            return ""

        # Convert to lowercase and remove special characters
        cleaned = re.sub(r'[^\w\s]', ' ', str(text).lower())

        # Whitespace tokenization, stopword removal, and stemming
        tokens = [self.stemmer.stem(word) for word in cleaned.split() if word not in self.stop_words]

        return ' '.join(tokens)

    def fit(self, articles_df):
        """
        Train the content-based filtering model

        The input DataFrame is only read; no columns are added to it.

        Parameters:
        - articles_df: DataFrame with columns [contentId, title, text]
        """
        print("Preprocessing article content...")

        # Store article IDs and titles for later use; row order matches the
        # row order of the TF-IDF matrix built below
        self.article_ids = articles_df['contentId'].astype(str).tolist()
        self.article_titles = articles_df['title'].tolist()

        # Combine title and text for better content representation
        # Title is repeated to give it more weight
        titles = articles_df['title'].fillna('')
        texts = articles_df['text'].fillna('')
        combined_content = titles + ' ' + titles + ' ' + texts

        # Work on local Series so the caller's DataFrame is left untouched
        processed_content = combined_content.apply(self.preprocess_text)

        print("Vectorizing article content...")
        # Create TF-IDF vectors over unigrams and bigrams, capped at 5000 features
        self.vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
        self.content_vectors = self.vectorizer.fit_transform(processed_content)

        print("Model training complete!")
        print(f"Vectorized {self.content_vectors.shape[0]} articles with {self.content_vectors.shape[1]} features")

    def get_similar_items(self, item_id, top_n=5):
        """
        Get similar items to a given item based on content similarity

        Parameters:
        - item_id: The ID of the item to get similar items for
        - top_n: Number of similar items to return

        Returns:
        - List of dicts with keys 'contentId', 'score' and 'reason', ordered
          from most to least similar. The queried item itself is never
          included. Returns an empty list when the item is unknown or
          top_n is not positive.
        """
        # Convert item_id to string for consistent handling
        item_id = str(item_id)

        # list.index both checks membership and yields the row position in one scan
        try:
            item_index = self.article_ids.index(item_id)
        except ValueError:
            print(f"Item {item_id} not found in the dataset")
            return []

        # Nothing to rank; also avoids a negative top_n being read as a
        # "from the end" slice bound
        if top_n <= 0:
            return []

        # Calculate similarity between this item and every fitted item
        item_vector = self.content_vectors[item_index]
        similarity_scores = cosine_similarity(item_vector, self.content_vectors).flatten()

        similar_indices = self._rank_neighbors(similarity_scores, item_index, top_n)

        return [
            {
                'contentId': self.article_ids[idx],
                'score': float(similarity_scores[idx]),
                'reason': self._get_content_reason(item_index, idx)
            }
            for idx in similar_indices
        ]

    @staticmethod
    def _rank_neighbors(similarity_scores, item_index, top_n):
        """
        Return the indices of the top_n highest scores, excluding item_index

        The queried item is removed by position instead of by assuming it
        sorts first: an article with identical content ties with it at the
        top, and an all-zero vector scores 0 against everything, so the
        query is not guaranteed to occupy rank 0.
        """
        order = similarity_scores.argsort()[::-1]
        return order[order != item_index][:top_n]

    def _get_content_reason(self, item_index, similar_index):
        """
        Generate a content-based reason for the recommendation

        The reason is picked at random from a fixed list of templates that
        mention the source article's title; it does not reflect which
        features the two articles actually share. similar_index is accepted
        but not used.
        """
        item_title = self.article_titles[item_index] if item_index < len(self.article_titles) else "this article"

        # List of possible content-based reasons
        reasons = [
            f"Similar topics to '{item_title}'",
            f"Similar writing style to '{item_title}'",
            f"Similar keywords to '{item_title}'",
            f"Content similar to '{item_title}'"
        ]

        # For simplicity, choose a random reason
        # In a real system, you would analyze the specific features that make the items similar
        import random
        return random.choice(reasons)

## Load and Explore the Data

In [None]:
# Where the raw articles are read from and where the trained model is written
DATA_PATH = '../shared_articles.csv'
MODEL_PATH = 'content_model.sav'

# Restrict the read to the columns this notebook uses, which keeps memory down
needed_columns = ['contentId', 'title', 'text', 'lang']
articles_df = pd.read_csv(DATA_PATH, usecols=needed_columns)

# Preview the first rows (rendered as the cell output)
articles_df.head()

In [None]:
# Report how many values are missing in each column
print("Missing values:")
missing_per_column = articles_df.isnull().sum()
print(missing_per_column)

# Keep English articles only, then drop any row missing either its title or its text
is_english = articles_df['lang'] == 'en'
articles_df = articles_df[is_english]
articles_df = articles_df.dropna(subset=['title', 'text'])

# Store IDs as strings so lookups compare like with like
articles_df['contentId'] = articles_df['contentId'].astype(str)

In [None]:
# How many articles survived the cleaning step
print(f"Total number of articles: {len(articles_df)}")

# Character counts of each article's body and title
articles_df['text_length'] = articles_df['text'].map(lambda body: len(str(body)))
articles_df['title_length'] = articles_df['title'].map(lambda heading: len(str(heading)))

print(f"Average title length: {articles_df['title_length'].mean():.2f} characters")
print(f"Average text length: {articles_df['text_length'].mean():.2f} characters")

# Histogram of body lengths; the x-axis stops at the 95th percentile so the
# long tail does not squash the bulk of the distribution
x_upper_bound = articles_df['text_length'].quantile(0.95)
plt.figure(figsize=(12, 6))
sns.histplot(articles_df['text_length'], bins=50)
plt.title('Distribution of Article Text Length')
plt.xlabel('Text Length (characters)')
plt.ylabel('Count')
plt.xlim(0, x_upper_bound)
plt.show()

## Sample Data for Faster Processing

In [None]:
# Continue with a reproducible 20% random subset of the articles; everything
# below, including the saved model, only sees this subset
print("Sampling data for faster processing...")
sampled_articles = articles_df.sample(frac=0.2, random_state=42)
articles_df = sampled_articles
print(f"Sampled {len(articles_df)} articles")

## Train the Model

In [None]:
# Instantiate the recommender and fit it on the sampled articles
model = ContentFilteringModel()
model.fit(articles_df=articles_df)

## Test the Model

In [None]:
# Pick one article at random to use as the query
random_article = articles_df['contentId'].sample(1).iloc[0]
print(f"Getting similar articles for article: {random_article}")

# Build the contentId -> title lookup once instead of scanning the whole
# DataFrame for every recommendation; setdefault keeps the first title seen
# for an ID, matching a "first matching row" lookup
title_by_id = {}
for content_id, content_title in zip(articles_df['contentId'], articles_df['title']):
    title_by_id.setdefault(content_id, content_title)

# Title of the query article
article_title = title_by_id[random_article]
print(f"Article title: {article_title}")

# Get similar articles and print them as a ranked list starting at 1
similar_articles = model.get_similar_items(random_article, top_n=5)
print("\nSimilar articles:")
for rank, article in enumerate(similar_articles, start=1):
    article_id = article['contentId']
    # IDs the lookup does not know are shown as "Unknown"
    title = title_by_id.get(article_id, "Unknown")
    print(f"{rank}. {title}")
    print(f"   Content ID: {article_id}, Score: {article['score']:.4f}")
    print(f"   Reason: {article['reason']}")

## Visualize Feature Importance

In [None]:
# Vocabulary learned by the fitted TF-IDF vectorizer, as an array so it can be
# indexed with an array of positions
feature_names = np.array(model.vectorizer.get_feature_names_out())

# Mean TF-IDF weight of every feature across all fitted articles
mean_per_feature = model.content_vectors.mean(axis=0)
tfidf_scores = np.asarray(mean_per_feature).flatten()

# Feature positions ordered from highest to lowest mean weight
sorted_indices = tfidf_scores.argsort()[::-1]
ranked_features = feature_names[sorted_indices]
ranked_scores = tfidf_scores[sorted_indices]
top_features = ranked_features[:20]
top_scores = ranked_scores[:20]

# Horizontal bar chart of the 20 highest-weighted features
plt.figure(figsize=(12, 8))
sns.barplot(x=top_scores, y=top_features)
plt.title('Top 20 Important Words in the Articles')
plt.xlabel('Average TF-IDF Score')
plt.ylabel('Words')
plt.tight_layout()
plt.show()

## Save the Model

In [None]:
# Serialize the fitted model object to MODEL_PATH with pickle
# NOTE(review): pickle stores a class by reference to its module and name, so
# whatever loads this file presumably needs ContentFilteringModel importable
# under the same path - confirm against the loading code.
print(f"Saving model to {MODEL_PATH}...")
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(model, model_file)
print("Model saved successfully!")

## Conclusion

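We've successfully built a content-based filtering recommendation model using TF-IDF text features and cosine similarity, and saved it to a .sav file. Note that the saved model was fitted on the 20% sample drawn above, so it can only recommend articles from that sample. This model can be used to find similar articles based on their content alone, which is particularly useful for recommending items to new users who don't have an interaction history: a single article they have viewed is enough to produce recommendations.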