In [2]:
import pandas as pd 

In [7]:
# Toy movie catalogue, written one (movie_id, title, genres) record per row.
# Movies with more than one genre join the genre names with '|'.
movie_records = [
    (1, 'Action Movie 1', 'Action'),
    (2, 'Comedy Movie 1', 'Comedy'),
    (3, 'Drama Movie 1', 'Drama'),
    (4, 'Action Comedy 1', 'Action|Comedy'),
    (5, 'Romantic Movie 1', 'Romance'),
    (6, 'Action Thriller 1', 'Action|Thriller'),
    (7, 'Comedy Drama 1', 'Comedy|Drama'),
    (8, 'Sci-Fi Action 1', 'Sci-Fi|Action'),
]

# Pivot the records into a column-oriented dict (one list per column).
movies_data = {
    'movie_id': [movie_id for movie_id, _, _ in movie_records],
    'title': [title for _, title, _ in movie_records],
    'genres': [genres for _, _, genres in movie_records],
}

In [8]:
# Build the movie catalogue frame from the column-oriented movies_data dict.
movies_df = pd.DataFrame(data=movies_data)

In [9]:
# Interaction history for one sample user, as (movie_id, liked) pairs.
# liked is True when the user liked the movie and False when they disliked it.
user_ratings = [
    (1, True),
    (2, False),
    (3, False),
    (4, True),
    (5, False),
    (6, True),
]
user_preference_data = {
    'movie_id': [movie_id for movie_id, _ in user_ratings],
    'liked': [liked for _, liked in user_ratings],
}
user_preference_df = pd.DataFrame(data=user_preference_data)


In [11]:
# Inner-join the catalogue with the preference table on movie_id, which keeps
# only the movies this user has rated.
movie_user_data = movies_df.merge(user_preference_df, how='inner', on='movie_id')

print(movie_user_data)

   movie_id              title           genres  liked
0         1     Action Movie 1           Action   True
1         2     Comedy Movie 1           Comedy  False
2         3      Drama Movie 1            Drama  False
3         4    Action Comedy 1    Action|Comedy   True
4         5   Romantic Movie 1          Romance  False
5         6  Action Thriller 1  Action|Thriller   True


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer 

In [14]:
# 1. Split the '|'-delimited genre string into a list of genre names per movie.
# The vectorised .str accessor replaces the row-wise apply(lambda ...); a missing
# genre value is propagated as missing instead of raising AttributeError.
movies_df['genres_list'] = movies_df['genres'].str.split('|')
print(movies_df)

   movie_id              title           genres         genres_list
0         1     Action Movie 1           Action            [Action]
1         2     Comedy Movie 1           Comedy            [Comedy]
2         3      Drama Movie 1            Drama             [Drama]
3         4    Action Comedy 1    Action|Comedy    [Action, Comedy]
4         5   Romantic Movie 1          Romance           [Romance]
5         6  Action Thriller 1  Action|Thriller  [Action, Thriller]
6         7     Comedy Drama 1     Comedy|Drama     [Comedy, Drama]
7         8    Sci-Fi Action 1    Sci-Fi|Action    [Sci-Fi, Action]


In [18]:
# 2. One-hot encode the genre lists: one 0/1 column per genre found in the catalogue.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_df['genres_list'])
# Label the columns with the genre names learned by the binarizer and reuse the
# movie row index so the features line up with movies_df.
genre_features = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=movies_df.index)
print(genre_features)

   Action  Comedy  Drama  Romance  Sci-Fi  Thriller
0       1       0      0        0       0         0
1       0       1      0        0       0         0
2       0       0      1        0       0         0
3       1       1      0        0       0         0
4       0       0      0        1       0         0
5       1       0      0        0       0         1
6       0       1      1        0       0         0
7       1       0      0        0       1         0


In [20]:
# Put the binary genre columns next to the movie identifiers. Only movie_id and
# title are carried over; the raw genres / genres_list columns are not needed anymore.
movie_identity = movies_df.loc[:, ['movie_id', 'title']]
processed_movie_data = pd.concat([movie_identity, genre_features], axis='columns')

In [21]:
# Attach the user's like/dislike label to each encoded movie; the inner join
# drops movies the user has not rated.
final_data = processed_movie_data.merge(user_preference_df, how='inner', on='movie_id')

print(final_data)

   movie_id              title  Action  Comedy  Drama  Romance  Sci-Fi  \
0         1     Action Movie 1       1       0      0        0       0   
1         2     Comedy Movie 1       0       1      0        0       0   
2         3      Drama Movie 1       0       0      1        0       0   
3         4    Action Comedy 1       1       1      0        0       0   
4         5   Romantic Movie 1       0       0      0        1       0   
5         6  Action Thriller 1       1       0      0        0       0   

   Thriller  liked  
0         0   True  
1         0  False  
2         0  False  
3         0   True  
4         0  False  
5         1   True  


In [22]:
# Features are the one-hot genre columns. Their names are taken from the fitted
# binarizer instead of a hand-typed list, so they cannot drift out of sync with
# the columns actually produced by the encoding step.
feature_columns = list(mlb.classes_)
X = final_data[feature_columns]
# Target: whether the user liked each rated movie.
y = final_data['liked']

print("Features (X):\n", X)
print("\nTarget (y):\n", y)

Features (X):
    Action  Comedy  Drama  Romance  Sci-Fi  Thriller
0       1       0      0        0       0         0
1       0       1      0        0       0         0
2       0       0      1        0       0         0
3       1       1      0        0       0         0
4       0       0      0        1       0         0
5       1       0      0        0       0         1

Target (y):
 0     True
1    False
2    False
3     True
4    False
5     True
Name: liked, dtype: bool


In [24]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes classifier on the binary genre features.
# fit() returns the estimator itself, so construction and training are chained.
model = BernoulliNB().fit(X, y)

print("\nBernoulli Naive Bayes model trained!")


Bernoulli Naive Bayes model trained!


In [26]:
# New movie to score. The title is named once and reused in every message below.
new_movie_title = 'Sci-Fi Comedy 1'
new_movie_genres = ['Sci-Fi', 'Comedy']
# Recommend only when the predicted probability of a like is strictly above this value.
RECOMMENDATION_THRESHOLD = 0.5

# Encode the new movie with the already-fitted binarizer so it gets the same
# genre columns as the training data.
new_movie_feature_vector = pd.DataFrame(mlb.transform([new_movie_genres]), columns=mlb.classes_)

# Align to the exact training columns and order; a training column missing here is filled with 0.
new_movie_feature_vector = new_movie_feature_vector.reindex(columns=feature_columns, fill_value=0)

print(f"\nFeature vector for '{new_movie_title}':\n", new_movie_feature_vector)

# predict_proba returns one column per entry of model.classes_. Look up the column
# for the True (liked) class instead of assuming it is column 1; this raises
# ValueError if the model was trained without any liked example.
liked_class_index = list(model.classes_).index(True)
probability_of_like = model.predict_proba(new_movie_feature_vector)[:, liked_class_index]

print(f"\nProbability of liking '{new_movie_title}': {probability_of_like[0]:.4f}")

# Make the recommendation decision against the threshold.
if probability_of_like[0] > RECOMMENDATION_THRESHOLD:
    print(f"\nRecommendation: Recommend '{new_movie_title}'!")
else:
    print(f"\nRecommendation: Do not recommend '{new_movie_title}'.")



Feature vector for 'Sci-Fi Comedy 1':
    Action  Comedy  Drama  Romance  Sci-Fi  Thriller
0       0       1      0        0       1         0

Probability of liking 'Sci-Fi Comedy 1': 0.2500

Recommendation: Do not recommend 'Sci-Fi Comedy 1'.
