In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df1 = pd.read_csv("movies.csv")
df2 = pd.read_csv("ratings.csv")

combine_df = pd.concat([df1, df2], ignore_index=True)
combine_df.head()



Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,
1,2,Jumanji (1995),Adventure|Children|Fantasy,,,
2,3,Grumpier Old Men (1995),Comedy|Romance,,,
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,,,
4,5,Father of the Bride Part II (1995),Comedy,,,


In [3]:
combine_df.describe()

Unnamed: 0,movieId,userId,rating,timestamp
count,110578.0,100836.0,100836.0,100836.0
mean,21440.913419,326.127564,3.501557,1205946000.0
std,37848.880654,182.618491,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,1221.0,177.0,3.0,1019124000.0
50%,3254.0,325.0,3.5,1186087000.0
75%,8961.0,477.0,4.0,1435994000.0
max,193609.0,610.0,5.0,1537799000.0


In [4]:
combine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110578 entries, 0 to 110577
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    110578 non-null  int64  
 1   title      9742 non-null    object 
 2   genres     9742 non-null    object 
 3   userId     100836 non-null  float64
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 5.1+ MB


In [5]:
combine_df.drop_duplicates()
combine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110578 entries, 0 to 110577
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    110578 non-null  int64  
 1   title      9742 non-null    object 
 2   genres     9742 non-null    object 
 3   userId     100836 non-null  float64
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 5.1+ MB


In [6]:
combine_df['userId'].fillna(-1, inplace=True)  # Fill NaN userIds with -1
combine_df['rating'].fillna(0, inplace=True) #doing the same for rate, but with 0


# Group the data by 'movieId' and calculate the total ratings count and average rating

In [7]:
popularity_df = combine_df.groupby('movieId').agg(
    {'userId': 'count', 'rating': 'mean'}).reset_index()
popularity_df.rename(
    columns={'userId': 'rating_count', 'rating': 'average_rating'}, inplace=True)
popularity_df


Unnamed: 0,movieId,rating_count,average_rating
0,1,216,3.902778
1,2,111,3.400901
2,3,53,3.198113
3,4,8,2.062500
4,5,50,3.010000
...,...,...,...
9737,193581,2,2.000000
9738,193583,2,1.750000
9739,193585,2,1.750000
9740,193587,2,1.750000


# Sort movies by rating count in descending order

In [8]:
popularity_df = popularity_df.sort_values(by='rating_count', ascending=False)
popularity_df


Unnamed: 0,movieId,rating_count,average_rating
314,356,330,4.151515
277,318,318,4.415094
257,296,308,4.183442
510,593,280,4.146429
1939,2571,279,4.177419
...,...,...,...
816,1076,1,0.000000
5957,34482,1,0.000000
5421,25855,1,0.000000
7565,85565,1,0.000000


In [9]:
top = 10
top_movies = popularity_df.head(top)


In [10]:
top_movies = pd.merge(
    top_movies, combine_df[['movieId', 'title', 'genres']], on='movieId', how='left')


In [11]:
top_movies


Unnamed: 0,movieId,rating_count,average_rating,title,genres
0,356,330,4.151515,Forrest Gump (1994),Comedy|Drama|Romance|War
1,356,330,4.151515,,
2,356,330,4.151515,,
3,356,330,4.151515,,
4,356,330,4.151515,,
...,...,...,...,...,...
2685,527,221,4.205882,,
2686,527,221,4.205882,,
2687,527,221,4.205882,,
2688,527,221,4.205882,,


# the hybrid score

In [12]:
popularity_df['hybrid_score'] = popularity_df['average_rating'] * \
    popularity_df['rating_count']
popularity_df = popularity_df.sort_values(by='hybrid_score', ascending=False)
topp = 10
top_hybrid_movies = popularity_df.head(topp)
top_hybrid_movies = pd.merge(
    top_hybrid_movies, combine_df[['movieId', 'title', 'genres']], on='movieId', how='left')
top_hybrid_movies


Unnamed: 0,movieId,rating_count,average_rating,hybrid_score,title,genres
0,318,318,4.415094,1404.0,"Shawshank Redemption, The (1994)",Crime|Drama
1,318,318,4.415094,1404.0,,
2,318,318,4.415094,1404.0,,
3,318,318,4.415094,1404.0,,
4,318,318,4.415094,1404.0,,
...,...,...,...,...,...,...
2679,480,239,3.734310,892.5,,
2680,480,239,3.734310,892.5,,
2681,480,239,3.734310,892.5,,
2682,480,239,3.734310,892.5,,


In [13]:
combine_df['userId'].fillna(-1, inplace=True)  # Fill NaN userIds with -1
combine_df['rating'].fillna(0, inplace=True)   # Fill NaN ratings with 0

user_item_matrix = combine_df.pivot_table(
    index='userId', columns='movieId', values='rating', fill_value=0)


user_similarity = cosine_similarity(user_item_matrix)


user_similarity_df = pd.DataFrame(
    user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)




def get_similar_users(user_id, n=5):
    similar_users = user_similarity_df[user_id].sort_values(
        ascending=False).index[1:n+1]
    return similar_users




def get_user_recommendations(user_id, n=10):
    similar_users = get_similar_users(user_id)

   
    user_movies_rated = user_item_matrix.loc[user_id]
    similar_users_movies_rated = user_item_matrix.loc[similar_users]
    unrated_movies = similar_users_movies_rated.columns[(
        user_movies_rated == 0) & (similar_users_movies_rated.sum(axis=0) > 0)]

    
    movie_scores = []
    for movie_id in unrated_movies:
        similar_users_ratings = similar_users_movies_rated[movie_id]
        mean_rating = similar_users_ratings[similar_users_ratings > 0].mean()
        movie_scores.append((movie_id, mean_rating))

    
    movie_scores.sort(key=lambda x: x[1], reverse=True)

    
    top_n_movies = movie_scores[:n]

    return top_n_movies



target_user_id = 1
recommended_movies = get_user_recommendations(target_user_id)


recommended_movies_df = pd.DataFrame(
    recommended_movies, columns=['movieId', 'mean_rating'])
recommended_movies_df['title'] = recommended_movies_df['movieId'].apply(
    lambda movie_id: combine_df[combine_df['movieId'] == movie_id]['title'].values[0])
recommended_movies_df['genres'] = recommended_movies_df['movieId'].apply(
    lambda movie_id: combine_df[combine_df['movieId'] == movie_id]['genres'].values[0])


print(recommended_movies_df)


   movieId  mean_rating                                              title  \
0      514          5.0                                    Ref, The (1994)   
1      541          5.0                                Blade Runner (1982)   
2      720          5.0  Wallace & Gromit: The Best of Aardman Animatio...   
3      750          5.0  Dr. Strangelove or: How I Learned to Stop Worr...   
4      858          5.0                              Godfather, The (1972)   
5      899          5.0                         Singin' in the Rain (1952)   
6      913          5.0                         Maltese Falcon, The (1941)   
7      915          5.0                                     Sabrina (1954)   
8      955          5.0                            Bringing Up Baby (1938)   
9      968          5.0                    Night of the Living Dead (1968)   

                       genres  
0                      Comedy  
1      Action|Sci-Fi|Thriller  
2  Adventure|Animation|Comedy  
3            

In [14]:
import streamlit as st




def main():
    st.title('Number 1 Movie Recommender')
    st.write('Welcome to the Movie Recommender App!')

    user_id = st.number_input('Enter your user ID:', min_value=1, value=1)

    if st.button('Get Recommendations'):
        recommended_movies = get_user_recommendations(user_id)

        if recommended_movies:
            st.write('Recommended Movies:')
            for movie_id, mean_rating in recommended_movies:
                title = combine_df[combine_df['movieId']
                                   == movie_id]['title'].values[0]
                genres = combine_df[combine_df['movieId']
                                    == movie_id]['genres'].values[0]
                st.write(
                    f"- {title} (Genres: {genres}, Mean Rating: {mean_rating:.2f})")
        else:
            st.write('No recommendations available.')


if __name__ == "__main__":
    main()


2023-11-20 10:52:45.237 
  command:

    streamlit run /Users/leoking/anaconda3/envs/recommenders/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [15]:


def collaborative_filtering_recommender(user_id, n=10):
    
    combine_df['userId'].fillna(-1, inplace=True)
    combine_df['rating'].fillna(0, inplace=True)
    
    
    user_item_matrix = combine_df.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)
    
    
    user_similarity = cosine_similarity(user_item_matrix)
    
    
    user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)
    
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:n+1]
    
    
    user_movies_rated = user_item_matrix.loc[user_id]
    similar_users_movies_rated = user_item_matrix.loc[similar_users]
    unrated_movies = similar_users_movies_rated.columns[(user_movies_rated == 0) & (similar_users_movies_rated.sum(axis=0) > 0)]
    
    
    movie_scores = []
    for movie_id in unrated_movies:
        similar_users_ratings = similar_users_movies_rated[movie_id]
        mean_rating = similar_users_ratings[similar_users_ratings > 0].mean()
        movie_scores.append((movie_id, mean_rating))
    
    
    movie_scores.sort(key=lambda x: x[1], reverse=True)
    
  
    top_n_movies = movie_scores[:n]
    
    
    recommended_movies_df = pd.DataFrame(top_n_movies, columns=['movieId', 'mean_rating'])
    recommended_movies_df['title'] = recommended_movies_df['movieId'].apply(lambda movie_id: combine_df[combine_df['movieId'] == movie_id]['title'].values[0])
    recommended_movies_df['genres'] = recommended_movies_df['movieId'].apply(lambda movie_id: combine_df[combine_df['movieId'] == movie_id]['genres'].values[0])
    
    return recommended_movies_df




In [16]:
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)
user_similarity_df 

userId,-1.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,601.0,602.0,603.0,604.0,605.0,606.0,607.0,608.0,609.0,610.0
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1.0,0.0,1.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2.0,0.0,0.027283,1.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,...,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
3.0,0.0,0.059720,0.000000,1.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,...,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
4.0,0.0,0.194395,0.003726,0.002251,1.000000,0.128659,0.088491,0.115120,0.062969,0.011361,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606.0,0.0,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,...,0.178084,0.116534,0.300669,0.066032,0.148141,1.000000,0.153063,0.262558,0.069622,0.201104
607.0,0.0,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,...,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,1.000000,0.283081,0.149190,0.139114
608.0,0.0,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,...,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,1.000000,0.121993,0.322055
609.0,0.0,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,...,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,1.000000,0.053225


In [17]:

def main():
    st.title("Best Movie Recommender ")
    
    target_user_id = st.text_input("Enter User ID", value="1")
    target_user_id = int(target_user_id)
    
    num_recommendations = st.slider("Number of Recommendations", min_value=1, max_value=20, value=10)
    
    if st.button("Generate Recommendations"):
        recommendations = collaborative_filtering_recommender(target_user_id, num_recommendations)
        st.write(recommendations)

if __name__ == "__main__":
    main()

In [18]:


from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split 


df1 = pd.read_csv("movies.csv")
df2 = pd.read_csv("ratings.csv")
combine_df = pd.concat([df1, df2], ignore_index=True)


def collaborative_filtering_recommender(user_id, n=5):
    
    combine_df['userId'].fillna(-1, inplace=True)
    combine_df['rating'].fillna(0, inplace=True)
    
    
    user_item_matrix = combine_df.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)
    
    
    user_similarity = cosine_similarity(user_item_matrix)
    
    
    user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)
    
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:n+1]
    
    
    user_movies_rated = user_item_matrix.loc[user_id]
    similar_users_movies_rated = user_item_matrix.loc[similar_users]
    unrated_movies = similar_users_movies_rated.columns[(user_movies_rated == 0) & (similar_users_movies_rated.sum(axis=0) > 0)]
    
    
    movie_scores = []
    for movie_id in unrated_movies:
        similar_users_ratings = similar_users_movies_rated[movie_id]
        mean_rating = similar_users_ratings[similar_users_ratings > 0].mean()
        movie_scores.append((movie_id, mean_rating))
    
    
    movie_scores.sort(key=lambda x: x[1], reverse=True)
    
  
    top_n_movies = movie_scores[:n]
    
    
   
    recommended_movies_df = pd.DataFrame(columns=['title', 'genres', 'mean_rating'])

    for movie_id, mean_rating in top_n_movies:
        title = combine_df.loc[combine_df['movieId'] == movie_id, 'title'].values[0]
        genres = combine_df.loc[combine_df['movieId'] == movie_id, 'genres'].values[0]
        recommended_movies_df = recommended_movies_df.append({
            'title': title,
            'genres': genres,
            'mean_rating': mean_rating
        }, ignore_index=True)

    return recommended_movies_df




def main():
    st.title('Movie Recommender')
    st.write('Welcome to the Movie Recommender App!')

    user_id = st.number_input('Enter your user ID:', min_value=1, value=1)

    if st.button('Get Recommendations'):
        recommended_movies_df = collaborative_filtering_recommender(user_id, n=5)

        if not recommended_movies_df.empty:
            st.write('Recommended Movies:')
            st.dataframe(recommended_movies_df)
        else:
            st.write('No recommendations available.')

if __name__ == "__main__":
    main()



# Load data
df1 = pd.read_csv("movies.csv")
df2 = pd.read_csv("ratings.csv")

combine_df = pd.concat([df1, df2], ignore_index=True)
combine_df['userId'].fillna(-1, inplace=True)
combine_df['rating'].fillna(0, inplace=True)

popularity_df = combine_df.groupby('movieId').agg(
    {'userId': 'count', 'rating': 'mean'}).reset_index()
popularity_df.rename(
    columns={'userId': 'rating_count', 'rating': 'average_rating'}, inplace=True)

popularity_df = popularity_df.sort_values(by='rating_count', ascending=False)

# Define the movie recommender function based on popularity
def popular_movie_recommender(top_n=5):
    top_movies = popularity_df.head(top_n)
    top_movies = pd.merge(
        top_movies, combine_df[['movieId', 'title', 'genres']], on='movieId', how='left')
    return top_movies

# Streamlit UI
def main():
    st.title('Popular Movie Recommender')
    st.write('Welcome to the Popular Movie Recommender App!')

    top_n = st.slider('Select the number of top movies:', 1, 100, 10)

    if st.button('Get Recommendations'):
        recommended_movies = popular_movie_recommender(top_n)

        if not recommended_movies.empty:
            st.write(f'Top {top_n} Recommended Movies:')
            st.table(recommended_movies[['title', 'genres', 'rating_count', 'average_rating']])
        else:
            st.write('No recommendations available.')

if __name__ == "__main__":
    main()






# Prepare data for Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df2[['userId', 'movieId', 'rating']], reader)
trainset, _ = train_test_split(data, test_size=0.2)

# Build user-based collaborative filtering model
sim_options = {
    'name': 'cosine',
    'user_based': True
}
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)

# Streamlit UI
def main():
    st.title('User-Based Movie Recommender')
    st.write('User-Based Movie Recommender Section!')

    user_id = st.number_input('Enter your user ID:', min_value=1, value=1)
    n_recommendations = st.slider('Number of recommendations:', 1, 20, 10)

    if st.button('Get Recommendations'):
        # Get movie recommendations for the user
        movie_ids = [str(movie_id) for movie_id in range(1, df1['movieId'].max() + 1)]
        user_ratings = [(movie_id, model.predict(user_id, movie_id).est) for movie_id in movie_ids]
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n_movies = user_ratings[:n_recommendations]

        # Display recommended movies
        if top_n_movies:
            st.write('Recommended Movies:')
            for movie_id, rating in top_n_movies:
                title = df1[df1['movieId'] == int(movie_id)]['title'].values[0]
                genres = df1[df1['movieId'] == int(movie_id)]['genres'].values[0]
                st.write(
                    f"- {title} (Genres: {genres}, Predicted Rating: {rating:.2f})"
                )
        else:
            st.write('No recommendations available.')

if __name__ == "__main__":
    main()


Computing the cosine similarity matrix...
Done computing similarity matrix.
