# Anime Recommendation System

This notebook contains:
1. **Exploratory Data Analysis (EDA)** - Understanding the anime dataset
2. **Natural Language Processing (NLP)** - Text preprocessing and analysis
3. **Recommendation Engine** - Content-based filtering system

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter

# Silence library warnings so the rendered cell output stays readable.
# NOTE(review): this hides *all* warnings (including deprecations); narrow the filter when debugging.
warnings.filterwarnings('ignore')

# The 'seaborn-v0_8-*' style names only exist in matplotlib >= 3.6; older releases ship the
# same style sheet as 'seaborn-darkgrid'. Use whichever is installed instead of raising.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette('husl')

print("Libraries imported successfully!")

## 1. Data Loading and Initial Exploration

In [None]:
# Read the anime table from the CSV that sits next to the notebook
df = pd.read_csv('anime_recommendation_dataset.csv')

# Quick orientation: size, column names, and a preview of the first rows
print(f"Dataset shape: {df.shape}")
print("\nColumns: " + str(df.columns.tolist()))
print("\nFirst few rows:")
df.head()

In [None]:
# Column dtypes / non-null counts first, then numeric summary statistics
print("Dataset Info:")
df.info()
print("\n" + "=" * 50, "\nBasic Statistics:", sep="\n")
df.describe()

In [None]:
# Per-column missing data: absolute count and share of all rows
print("Missing Values:")
missing_values = df.isna().sum()
missing_percentage = missing_values / len(df) * 100
missing_df = pd.concat(
    [missing_values.rename('Missing Count'), missing_percentage.rename('Percentage')],
    axis=1,
)
# Only show the columns that actually have gaps
missing_df[missing_df['Missing Count'] > 0]

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Score distribution: histogram (left) and box plot (right)
score_values = df['score'].dropna()
mean_score = df['score'].mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

# Histogram with the mean marked as a dashed reference line
hist_ax.hist(score_values, bins=30, edgecolor='black', alpha=0.7)
hist_ax.axvline(mean_score, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_score:.2f}')
hist_ax.set_xlabel('Score', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.set_title('Distribution of Anime Scores', fontsize=14, fontweight='bold')
hist_ax.legend()

# Box plot of the same values
box_ax.boxplot(score_values, vert=True)
box_ax.set_ylabel('Score', fontsize=12)
box_ax.set_title('Score Distribution (Box Plot)', fontsize=14, fontweight='bold')
box_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Text summary of the same distribution
print("Score Statistics:")
for stat_label, stat_value in (
    ("Mean", mean_score),
    ("Median", df['score'].median()),
    ("Min", df['score'].min()),
    ("Max", df['score'].max()),
):
    print(f"{stat_label} Score: {stat_value:.2f}")

In [None]:
# Episodes distribution: histogram (left) and most frequent counts (right)
episode_col = df['episodes']

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, bar_ax = axes

# Histogram restricted to shows with at most 100 episodes so the long tail
# does not flatten the plot
episodes_filtered = episode_col[episode_col <= 100].dropna()
hist_ax.hist(episodes_filtered, bins=30, edgecolor='black', alpha=0.7)
hist_ax.set_title('Distribution of Episode Count', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Number of Episodes (≤100)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)

# The ten episode counts that occur most often, largest bar on top
episode_counts = episode_col.value_counts().head(10)
bar_ax.barh(episode_counts.index.astype(str), episode_counts.values)
bar_ax.set_title('Top 10 Most Common Episode Counts', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Count', fontsize=12)
bar_ax.set_ylabel('Number of Episodes', fontsize=12)
bar_ax.invert_yaxis()

plt.tight_layout()
plt.show()

print("\nEpisodes Statistics:")
print(f"Mean Episodes: {episode_col.mean():.2f}")
print(f"Median Episodes: {episode_col.median():.2f}")

In [None]:
# Genre Analysis
# Flatten the comma-separated genre strings into one list of genre names
all_genres = [
    genre_name.strip()
    for genre_entry in df['genres'].dropna()
    for genre_name in genre_entry.split(',')
]

genre_counts = Counter(all_genres)
top_genres = pd.DataFrame(genre_counts.most_common(15), columns=['Genre', 'Count'])

# Horizontal bars, most common genre at the top
plt.figure(figsize=(12, 6))
plt.barh(top_genres['Genre'], top_genres['Count'], color='skyblue', edgecolor='black')
plt.title('Top 15 Most Common Anime Genres', fontsize=14, fontweight='bold')
plt.xlabel('Count', fontsize=12)
plt.ylabel('Genre', fontsize=12)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print(f"\nTotal Unique Genres: {len(genre_counts)}")
print("\nTop 10 Genres:")
for genre_name, occurrences in genre_counts.most_common(10):
    print(f"  {genre_name}: {occurrences}")

In [None]:
# Score by Genre Analysis
# Collect every score under each genre the anime is tagged with; rows missing
# either the genres or the score are left out
genre_scores = {}
rated_rows = df[['genres', 'score']].dropna()
for genre_entry, anime_score in zip(rated_rows['genres'], rated_rows['score']):
    for genre_name in genre_entry.split(','):
        genre_scores.setdefault(genre_name.strip(), []).append(anime_score)

# Average per genre, ignoring genres backed by fewer than 5 rated anime
genre_avg_scores = {
    genre_name: np.mean(score_list)
    for genre_name, score_list in genre_scores.items()
    if len(score_list) >= 5
}
genre_avg_df = (
    pd.DataFrame(list(genre_avg_scores.items()), columns=['Genre', 'Avg_Score'])
    .sort_values('Avg_Score', ascending=False)
    .head(15)
)

# Horizontal bars, best-rated genre at the top
plt.figure(figsize=(12, 6))
plt.barh(genre_avg_df['Genre'], genre_avg_df['Avg_Score'], color='coral', edgecolor='black')
plt.title('Top 15 Genres by Average Score (min 5 anime)', fontsize=14, fontweight='bold')
plt.xlabel('Average Score', fontsize=12)
plt.ylabel('Genre', fontsize=12)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Top rated anime
print("Top 10 Highest Rated Anime:")
print("="*80)
top_anime = df.nlargest(10, 'score')[['title', 'score', 'genres', 'episodes']]
for idx, row in top_anime.iterrows():
    # Genres can be missing (NaN); slicing a float would raise TypeError.
    genres_text = row['genres'] if pd.notna(row['genres']) else 'Unknown'
    # Only add an ellipsis when the genre list was actually cut off.
    if len(genres_text) > 60:
        genres_text = genres_text[:60] + '...'
    print(f"\n{row['title']}")
    print(f"  Score: {row['score']} | Episodes: {row['episodes']} | Genres: {genres_text}")

## 3. Natural Language Processing (NLP)

We'll process the synopsis and character text for better recommendations.

In [None]:
# Text preprocessing function
def clean_text(text):
    """Normalise free text to lowercase ASCII letters separated by single spaces.

    Missing values (None / NaN) yield an empty string. Anything else is
    converted with ``str()``; HTML-like tags and every character that is not an
    ASCII letter or whitespace are replaced by a space, the result is
    lowercased, and runs of whitespace are collapsed.
    """
    if pd.isna(text):
        return ""
    without_tags = re.sub(r'<[^>]+>', ' ', str(text))
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', without_tags)
    # split()/join collapses any whitespace run and trims both ends
    return ' '.join(letters_only.lower().split())

def _first_characters(raw, limit=10):
    """Join the first `limit` comma-separated entries of `raw` with single spaces."""
    return ' '.join(entry.strip() for entry in str(raw).split(',')[:limit])


# Work on a copy so the raw `df` is left untouched, adding three cleaned columns:
#   cleaned_synopsis   - synopsis passed through clean_text
#   cleaned_genres     - lowercased genre string (commas kept), '' when missing
#   cleaned_characters - first 10 comma-separated character entries, cleaned
df_processed = df.copy().assign(
    cleaned_synopsis=lambda frame: frame['synopsis'].apply(clean_text),
    cleaned_genres=lambda frame: frame['genres'].fillna('').str.lower(),
    cleaned_characters=lambda frame: frame['characters'].fillna('').apply(
        lambda raw: clean_text(_first_characters(raw))
    ),
)

print("Text preprocessing completed!")
print("\nExample of cleaned synopsis:")
print(df_processed['cleaned_synopsis'].iloc[0][:200])

In [None]:
# Create combined features for recommendation
# Combine synopsis, genres, and characters; genres are repeated to give them more weight.
genre_tokens = df_processed['cleaned_genres'].str.replace(',', ' ', regex=False)

# Repeat the genre text WITH a trailing separator. Repeating the bare string glues the
# last genre of one copy onto the first genre of the next ("comedyaction"), which
# creates junk tokens and leaves the boundary genres under-weighted.
weighted_genres = (genre_tokens + ' ').str.repeat(3).str.strip()

df_processed['combined_features'] = (
    df_processed['cleaned_synopsis'] + ' ' +
    weighted_genres + ' ' +
    df_processed['cleaned_characters']
)

# Fill any remaining NaN values
df_processed['combined_features'] = df_processed['combined_features'].fillna('')

print("Combined features created!")
print(f"\nExample combined features (first 300 chars):")
print(df_processed['combined_features'].iloc[0][:300])

In [None]:
# Analyze word frequency in synopsis
# (Counter comes from the import cell at the top of the notebook; the duplicate
# mid-notebook imports were dropped.)

# Get all words from synopsis
all_words = ' '.join(df_processed['cleaned_synopsis']).split()

# Remove common stop words
stop_words = {
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
    'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should', 'could',
    'may', 'might', 'can', 'his', 'her', 'their', 'our', 'your', 'its', 'this',
    'that', 'these', 'those', 'he', 'she', 'it', 'they', 'them', 'who', 'which',
    'what', 'when', 'where', 'why', 'how',
}
# Words of three letters or fewer are dropped as well, whether or not they are stop words
filtered_words = [w for w in all_words if w not in stop_words and len(w) > 3]

word_freq = Counter(filtered_words)
top_words = pd.DataFrame(word_freq.most_common(20), columns=['Word', 'Frequency'])

# Horizontal bars, most frequent word at the top
plt.figure(figsize=(14, 6))
plt.barh(top_words['Word'], top_words['Frequency'], color='lightgreen', edgecolor='black')
plt.xlabel('Frequency', fontsize=12)
plt.ylabel('Word', fontsize=12)
plt.title('Top 20 Most Common Words in Anime Synopsis', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 4. Building the Recommendation Engine

We'll use TF-IDF (Term Frequency-Inverse Document Frequency) vectorization and cosine similarity for content-based filtering.

In [None]:
# TF-IDF over the combined text: unigrams + bigrams, English stop words removed,
# terms must occur in at least 2 documents, vocabulary capped at 5000 features
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_features=5000,
)

# Learn the vocabulary and vectorise every anime in one pass
tfidf_matrix = tfidf_vectorizer.fit_transform(df_processed['combined_features'])

feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")
print(f"Number of features: {len(feature_names)}")
print("\nTF-IDF vectorization completed!")

In [None]:
# Pairwise cosine similarity between all anime; with a single argument
# scikit-learn compares the matrix against itself
cosine_sim = cosine_similarity(tfidf_matrix)

print(f"Cosine Similarity Matrix Shape: {cosine_sim.shape}")
print("Similarity matrix computed successfully!")

In [None]:
# Create indices for quick lookup: title -> row position in cosine_sim / df_processed.
# Positions (not index labels) are stored because the similarity matrix is addressed
# positionally; for the default RangeIndex the two are identical.
indices = pd.Series(np.arange(len(df_processed)), index=df_processed['title'])
# Series.drop_duplicates() de-duplicates the *values* (row positions, which are already
# unique) and leaves repeated titles in place, so a repeated title would make
# indices[title] return a Series instead of one integer. Drop repeated titles instead,
# keeping the first occurrence.
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, n_recommendations=10, score_weight=0.3):
    """
    Get anime recommendations based on content similarity

    Parameters:
    -----------
    title : str
        Exact title of the anime (the lookup is case-sensitive)
    n_recommendations : int
        Maximum number of recommendations to return
    score_weight : float
        Weight (0 to 1) given to the min-max normalised rating when ranking;
        0 ranks purely by cosine similarity

    Returns:
    --------
    DataFrame with columns title, genres, score, episodes and similarity_score
    (the blended ranking value), best match first. The queried anime itself is
    never included. If the title is unknown, a message string is returned instead.
    """
    # Keep the try block narrow so an unrelated KeyError (e.g. a missing column)
    # is not reported as "title not found".
    try:
        idx = indices[title]
    except KeyError:
        return f"Anime '{title}' not found in the database. Please check the title and try again."

    # A title that occurs more than once makes the lookup return several rows;
    # use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity of the queried anime to every anime (one row of the matrix)
    ranking = np.asarray(cosine_sim[idx], dtype=float)

    # Incorporate score rating
    if score_weight > 0:
        ratings = df_processed['score'].astype(float)
        lowest, highest = ratings.min(), ratings.max()
        if pd.notna(lowest) and highest > lowest:
            # Min-max normalise to 0-1; unrated anime get no quality bonus (0)
            # rather than a NaN that would corrupt the sort below.
            normalized = ((ratings - lowest) / (highest - lowest)).fillna(0.0).to_numpy()
        else:
            # All ratings equal or all missing: nothing to normalise by.
            normalized = np.zeros(len(ratings))
        ranking = ranking * (1 - score_weight) + normalized * score_weight

    # Best first; the stable sort keeps the original row order among ties.
    order = np.argsort(-ranking, kind='stable')
    # Drop the queried anime explicitly. Once ratings are blended in it is not
    # guaranteed to rank first, so slicing off position 0 could discard the best
    # match and recommend the anime to itself.
    order = order[order != idx][:max(int(n_recommendations), 0)]

    recommendations = df.iloc[order][['title', 'genres', 'score', 'episodes']].copy()
    recommendations['similarity_score'] = ranking[order]

    return recommendations

print("Recommendation function created successfully!")

In [None]:
# Test the recommendation system
test_anime = "Cowboy Bebop"
print(f"Getting recommendations for: {test_anime}")
print("="*80)

recommendations = get_recommendations(test_anime, n_recommendations=10)
# get_recommendations returns a plain message string when the title is unknown;
# calling .to_string on it would raise AttributeError.
if isinstance(recommendations, str):
    print(recommendations)
else:
    print(recommendations.to_string(index=False))

In [None]:
# Additional recommendation function - by genre
def recommend_by_genre(genre, n_recommendations=10, min_score=60):
    """
    Recommend anime by genre

    Parameters:
    -----------
    genre : str
        Genre text to look for; matched literally and case-insensitively as a
        substring of the anime's genre string
    n_recommendations : int
        Number of recommendations
    min_score : float
        Minimum score threshold (inclusive)

    Returns:
    --------
    DataFrame with columns title, genres, score, episodes, highest score first
    """
    # regex=False: treat the genre as plain text. With the default regex matching,
    # characters such as '.', '+' or '(' change the meaning of the query or raise
    # re.error.
    genre_mask = df['genres'].str.contains(genre, case=False, na=False, regex=False)
    score_mask = df['score'] >= min_score
    filtered_df = df[genre_mask & score_mask].sort_values('score', ascending=False)

    return filtered_df[['title', 'genres', 'score', 'episodes']].head(n_recommendations)

# Try the genre-based recommender on Action titles scoring at least 70
print("Top Action anime (score >= 70):", "=" * 80, sep="\n")
action_recommendations = recommend_by_genre('Action', n_recommendations=10, min_score=70)
print(action_recommendations.to_string(index=False))

In [None]:
# Search function - find anime by partial title match
def search_anime(query, top_n=10):
    """
    Search for anime by partial title match

    Parameters:
    -----------
    query : str
        Text to look for; matched literally and case-insensitively anywhere in
        the title
    top_n : int
        Number of results to return

    Returns:
    --------
    DataFrame with columns title, genres, score, episodes, highest score first,
    or a message string when no title matches
    """
    # regex=False: titles routinely contain regex metacharacters, e.g. the
    # parentheses in "Hunter x Hunter (2011)". Interpreted as a pattern, such a
    # query fails to match its own title or raises re.error.
    mask = df['title'].str.contains(query, case=False, na=False, regex=False)
    results = df[mask][['title', 'genres', 'score', 'episodes']].sort_values('score', ascending=False)

    if results.empty:
        return f"No anime found matching '{query}'"

    return results.head(top_n)

# Test search function
print("Search results for 'Hunter':")
print("="*80)
search_results = search_anime('Hunter')
# search_anime returns a plain message string when nothing matches;
# calling .to_string on it would raise AttributeError.
if isinstance(search_results, str):
    print(search_results)
else:
    print(search_results.to_string(index=False))

## 5. Interactive Recommendation Examples

In [None]:
# Example 1: Recommendations for a different anime
# The title lookup is exact and case-sensitive, so the spelling must match the dataset.
anime_title = "TRIGUN"
print(f"Recommendations similar to '{anime_title}':")
print("="*80)
recommendations = get_recommendations(anime_title, n_recommendations=8)
# A message string comes back when the title is unknown; print it instead of
# crashing on .to_string.
if isinstance(recommendations, str):
    print(recommendations)
else:
    print(recommendations.to_string(index=False))

In [None]:
# Example 2: best-scored Comedy titles (score of at least 65)
print("Top Comedy anime:", "=" * 80, sep="\n")
comedy_recs = recommend_by_genre('Comedy', n_recommendations=8, min_score=65)
print(comedy_recs.to_string(index=False))

In [None]:
# Example 3: Get random anime to explore
print("Random high-quality anime to explore:")
print("="*80)
high_rated = df[df['score'] >= 75]
# Cap the sample size: sample(n=5) raises ValueError when fewer than 5 rows qualify.
# random_state keeps the output reproducible under Restart & Run All; change or
# remove it to get a different selection.
random_anime = high_rated.sample(n=min(5, len(high_rated)), random_state=42)[
    ['title', 'genres', 'score', 'episodes']
]
print(random_anime.to_string(index=False))

## 6. Summary and Insights

### Key Findings:

1. **Dataset Overview:**
   - Total anime in dataset: 557
   - Features: title, synopsis, genres, episodes, score, characters

2. **EDA Insights:**
   - Score distribution shows most anime fall in the 60-80 range
   - Popular genres include Action, Comedy, Drama, and Adventure
   - Episode counts vary widely, with many having 12-26 episodes (standard seasons)

3. **NLP Processing:**
   - Text preprocessing includes HTML tag removal, stripping digits and other non-letter characters, lowercasing, and whitespace normalization
   - TF-IDF vectorization captures important terms from synopsis, genres, and characters
   - Combined features provide a rich representation for similarity calculations

4. **Recommendation System:**
   - Content-based filtering using cosine similarity
   - Multiple recommendation methods: by title similarity, by genre, and search
   - Score weighting (enabled by default with `score_weight=0.3`; set it to 0 to rank purely by similarity) to balance similarity with anime quality

### Usage:
```python
# Get recommendations for a specific anime
get_recommendations('Cowboy Bebop', n_recommendations=10)

# Get recommendations by genre
recommend_by_genre('Action', n_recommendations=10, min_score=70)

# Search for anime
search_anime('Hunter')
```