In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('anime.csv')

# Explore the dataset
print(df.head())
print(df.info())  # Check data types and missing values
print(df.describe())  # Summary statistics


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  

In [4]:
import pandas as pd

# Load the dataset (make sure the path is correct)
data = pd.read_csv('anime.csv')

# Check the first few rows to confirm the dataset is loaded
print(data.head())


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [6]:
# Fill missing values in 'genre' with 'Unknown' without using inplace
data['genre'] = data['genre'].fillna('Unknown')

# Fill missing values in 'rating' with the mean rating without using inplace
data['rating'] = data['rating'].fillna(data['rating'].mean())

# Verify if missing values are handled
print(data.isnull().sum())


anime_id     0
name         0
genre        0
type        25
episodes     0
rating       0
members      0
dtype: int64


In [7]:
# Fill missing values in 'type' with 'Unknown'
data['type'] = data['type'].fillna('Unknown')

# Verify if missing values are handled
print(data.isnull().sum())


anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [8]:
# Convert 'episodes' to numeric, forcing errors to NaN (non-numeric values will be treated as NaN)
data['episodes'] = pd.to_numeric(data['episodes'], errors='coerce')

# Fill missing episodes values with the mean of the column
data['episodes'] = data['episodes'].fillna(data['episodes'].mean())

# Verify that the 'episodes' column is numeric and filled correctly
print(data['episodes'].head())


0     1.0
1    64.0
2    51.0
3    24.0
4    51.0
Name: episodes, dtype: float64


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Normalize the 'rating' and 'episodes' columns
data[['rating', 'episodes']] = scaler.fit_transform(data[['rating', 'episodes']])

# Verify that the columns have been normalized
print(data[['rating', 'episodes']].head())


     rating  episodes
0  0.924370  0.000000
1  0.911164  0.034673
2  0.909964  0.027518
3  0.900360  0.012658
4  0.899160  0.027518


In [10]:
# One-hot encode the 'genre' column (genres are separated by commas)
genre_dummies = data['genre'].str.get_dummies(sep=',')

# Concatenate the new one-hot encoded columns with the original dataset
data = pd.concat([data, genre_dummies], axis=1)

# Check the updated dataframe
print(data.head())


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type  episodes  \
0               Drama, Romance, School, Supernatural  Movie  0.000000   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV  0.034673   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV  0.027518   
3                                   Sci-Fi, Thriller     TV  0.012658   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV  0.027518   

     rating  members   Adventure   Cars   Comedy  ...  Shounen  Slice of Life  \
0  0.924370   200630           0      0        0  ...        0              0   
1  0.911164   793665           1      0        0  ...        0              0   
2  0.909964   114262   

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Select the columns that will be used for similarity computation
features = ['rating', 'episodes'] + list(genre_dummies.columns)

# Create a new DataFrame with selected features
anime_features = data[features]

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(anime_features)

# Verify the shape of the similarity matrix
print(cosine_sim.shape)


(12294, 12294)


In [12]:
def recommend_anime(anime_id, cosine_sim, data, top_n=5):
    # Get the index of the anime
    idx = data[data['anime_id'] == anime_id].index[0]
    
    # Get pairwise cosine similarity scores for the given anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the anime by similarity score (highest first)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top_n most similar anime (excluding the input anime itself)
    sim_scores = sim_scores[1:top_n+1]
    
    # Get the indices of the most similar anime
    anime_indices = [i[0] for i in sim_scores]
    
    # Return the top_n most similar anime
    return data.iloc[anime_indices][['anime_id', 'name', 'rating']]

# Test the function by recommending anime similar to a given anime (e.g., anime_id = 32281)
recommended_anime = recommend_anime(32281, cosine_sim, data, top_n=5)

# Display the recommended anime
print(recommended_anime)


      anime_id                                               name    rating
5805       547                        Wind: A Breath of Heart OVA  0.561825
6394       546                       Wind: A Breath of Heart (TV)  0.536615
208      28725                      Kokoro ga Sakebitagatterunda.  0.798319
504       6351  Clannad: After Story - Mou Hitotsu no Sekai, K...  0.762305
1201     10067                     Angel Beats!: Another Epilogue  0.715486


In [None]:
#This Assignment hlights the importance of data preprocessing and feature engineering in building an effective recommendation system. 
#By applying cosine similarity on carefully selected features, we can recommend anime that are likely to be of interest to users based on the content of the anime itself. While this is a simple recommendation system, 
#it can be expanded and refined to suit more complex use cases.