In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [16]:
# Load the dataset
data = pd.read_csv("anime.csv")

In [17]:
missing_values = data.isnull().sum()
print(missing_values)

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [19]:
# Handle missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')  # Replace missing values with most frequent value
data['genre'] = imputer.fit_transform(data['genre'].values.reshape(-1, 1))
data['type'] = imputer.fit_transform(data['type'].values.reshape(-1, 1))
data['rating'] = imputer.fit_transform(data['rating'].values.reshape(-1, 1))


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 768.4+ KB


In [21]:
# Label encode categorical features
categorical_columns = ['genre', 'type']  
le = LabelEncoder()
for col in categorical_columns:
    data[col] = le.fit_transform(data[col])


In [25]:
# Normalize numerical features (if necessary)
# Drop rows with "unknown" values in the 'episodes' column
data = data[data['episodes'] != 'unknown']
 
scaler = StandardScaler()
data[['rating', 'members']] = scaler.fit_transform(data[['rating', 'members']])

In [26]:
# Select relevant features for similarity calculation
features = ['genre', 'type', 'rating', 'members']

In [27]:
# Create a matrix of feature vectors
feature_matrix = data[features].values


In [28]:
# Calculate cosine similarity matrix
similarity_matrix = cosine_similarity(feature_matrix)

In [29]:
# Function to recommend anime
def recommend_anime(target_anime_index):
    similarities = similarity_matrix[target_anime_index]
    top_similar_indices = similarities.argsort()[-10:][::-1]  # Get top 10 similar anime
    recommended_anime = data.iloc[top_similar_indices]
    return recommended_anime

In [30]:
# Example usage
target_anime_index = 0  # Replace with the index of the target anime
recommended_anime = recommend_anime(target_anime_index)
print(recommended_anime)


      anime_id                                               name  genre  \
0        32281                                     Kimi no Na wa.   2686   
1132      8888  Code Geass: Boukoku no Akito 1 - Yokuryuu wa M...   1062   
530      14349                              Little Witch Academia   1300   
611      16904                                   K: Missing Kings   1189   
1014      5204                 Kara no Kyoukai 6: Boukyaku Rokuon   1023   
1828      4437                Naruto: Shippuuden Movie 2 - Kizuna   1049   
596      21339                                  Psycho-Pass Movie   1106   
623       2890                                Gake no Ue no Ponyo   1510   
71         578                                     Hotaru no Haka   2524   
18       12355                       Ookami Kodomo no Ame to Yuki   2867   

      type episodes    rating   members  
0        0        1  2.850577  3.330241  
1132     0        1  1.172592  1.332283  
530      0        1  1.506226  1.7080

In [38]:
# Evaluation
X = data[features]
y = data['rating']

In [39]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Make predictions   

y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:",r2)


Mean Squared Error: 1.710703335250814e-05
R-squared: 0.9999826568407814


In [None]:
#Analyzing Model Performance
#Mean Squared Error (MSE): 1.710703335250814e-05 is a very low value, indicating that the model's predictions are close to the actual values. This is a positive sign.
#R-squared: 0.9999826568407814 is extremely close to 1, suggesting that the model explains a very high proportion of the variance in the target variable. This is also a positive sign.