# Item-Based Collaborative Filtering

* Import Dataset and Preprocessing

* Create User Movie DataFrame

* Create Item-Based Recommender

* Process Functionalization

# Import Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
# Display options for rendered frames: no column cap, no row cap, a 500-character
# wide console, and no wrapping of wide frames across several lines.
pd.set_option("display.max_columns",None)
# NOTE(review): with no row cap, displaying a large frame without .head() prints every row.
pd.set_option("display.max_rows",None)
pd.set_option("display.width",500)
pd.set_option("display.expand_frame_repr",False)

# Import Dataset

In [2]:
# Read the movie metadata and the ratings from CSV files in the working directory.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
# Left join on movieId: every row of `movies` is kept, and a movie without any
# matching row in `ratings` gets NaN in the columns that come from `ratings`.
df = movies.merge(ratings, how="left", on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,944919400.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,858275500.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,833981900.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,943497900.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,1230859000.0


# Create User Movie DataFrame

In [3]:
df.shape # roughly 20 million rows in the merged frame

(20000797, 6)

In [4]:
df["title"].nunique() # roughly 27 thousand distinct titles

27262

In [5]:
df["title"].value_counts().head() # number of rows per title, most frequent first

Pulp Fiction (1994)                 67310
Forrest Gump (1994)                 66172
Shawshank Redemption, The (1994)    63366
Silence of the Lambs, The (1991)    63299
Jurassic Park (1993)                59715
Name: title, dtype: int64

#### Dikkat: Bazı filmlerin çok az yorumu ve beğenisi vardır; bunları hem çalışma zamanı maliyetinden hem de çok fazla bilgi içermediklerinden dolayı çalışma dışı bırakabiliriz. Bu çalışmamızda 1000'in altında yorum ve beğeni alan filmleri çalışma dışı bırakıyoruz.

In [6]:
# Number of rows per title, as a one-column frame whose column is named "title".
# The column is named explicitly because pandas >= 2.0 names the value_counts()
# result "count" (and moves the source name onto the index); without this the
# `rating_counts["title"]` lookups in the following cells raise KeyError there.
# On pandas < 2.0 both renames are no-ops, so the frame is the same as before.
rating_counts = df["title"].value_counts().rename_axis(None).rename("title").to_frame()

In [7]:
rating_counts.head()

Unnamed: 0,title
Pulp Fiction (1994),67310
Forrest Gump (1994),66172
"Shawshank Redemption, The (1994)",63366
"Silence of the Lambs, The (1991)",63299
Jurassic Park (1993),59715


In [8]:
# Peek at the titles whose row count is below 1000.
rating_counts[rating_counts["title"] < 1000].head()

Unnamed: 0,title
"Bear, The (Ours, L') (1988)",999
Rosewood (1997),999
Ted (2012),999
One Night at McCool's (2001),999
Marked for Death (1990),998


In [9]:
rating_counts[rating_counts["title"] < 1000].shape # about 24 thousand titles have fewer than 1000 rows

(24103, 1)

In [10]:
# Titles with fewer than 1000 rows (the index of the filtered counts frame).
rare_movies = rating_counts[rating_counts["title"] < 1000].index

In [12]:
rare_movies[0:5]

Index(['Bear, The (Ours, L') (1988)', 'Rosewood (1997)', 'Ted (2012)', 'One Night at McCool's (2001)', 'Marked for Death (1990)'], dtype='object')

In [13]:
# Keep only the rows whose title is not listed in rare_movies.
common_movies = df[~df["title"].isin(rare_movies)]

In [14]:
common_movies.shape

(17766015, 6)

In [15]:
df.shape

(20000797, 6)

In [16]:
common_movies["title"].nunique()

3159

In [17]:
df["title"].nunique() 

27262

Yani 3159 filme yaklaşık 17,8 milyon rating verilmişken, geri kalan yaklaşık 24 bin filme yalnızca yaklaşık 2,2 milyon rating verilmiştir (20.000.797 − 17.766.015); bu da yaptığımız işlemin ne kadar önemli olduğunu gösteriyor. Açıkçası az rating alan filmlere odaklanıp maliyeti yükseltmektense daha fazla oy ve yorum alanlara odaklanmak mantıklı olacaktır.

In [18]:
# Pivot to a user-by-title matrix: one row per userId, one column per title, cells
# hold the rating. pivot_table's default aggregation (mean) applies when a
# (userId, title) pair has several rows; pairs with no row are NaN.
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

In [19]:
user_movie_df.shape

(138493, 3159)

# Create Item-Based Recommender

In [22]:
# Reference title for which correlated titles are looked up.
movie_name = "12 Angry Men (1957)"
# NOTE: movie_name is rebound here from the title string to that title's rating column (a Series).
movie_name = user_movie_df[movie_name]
# Correlate every title column with the reference column and show the five highest values.
user_movie_df.corrwith(movie_name).sort_values(ascending=False).head()

title
12 Angry Men (1957)                    1.000000
Witness for the Prosecution (1957)     0.503740
Inherit the Wind (1960)                0.440727
City Lights (1931)                     0.412992
Mr. Smith Goes to Washington (1939)    0.412903
dtype: float64

### Rastgele film seçerek öneride bulunmak

In [24]:
# Pick one title at random from the matrix columns.
# NOTE(review): no random_state is passed, so the pick is not reproducible unless
# NumPy's global seed is set elsewhere.
movie_name = pd.Series(user_movie_df.columns).sample(1).values[0]
# NOTE: movie_name is rebound here from the title string to that title's rating column (a Series).
movie_name = user_movie_df[movie_name]
# Correlate every title column with the reference column and show the five highest values.
user_movie_df.corrwith(movie_name).sort_values(ascending=False).head()

title
Naked Gun: From the Files of Police Squad!, The (1988)    1.000000
Naked Gun 2 1/2: The Smell of Fear, The (1991)            0.767789
Naked Gun 33 1/3: The Final Insult (1994)                 0.663927
Airplane! (1980)                                          0.602663
Hot Shots! (1991)                                         0.568975
dtype: float64

### Belli bir anahtara göre film seçmek ve öneride bulunmak

In [25]:
def check_film(keyword, user_movie_df):
    """Return the column labels of ``user_movie_df`` that contain ``keyword``.

    The match is a plain, case-sensitive substring test (``keyword in label``).

    Args:
        keyword: Text to look for inside each column label.
        user_movie_df: Frame whose column labels are searched.

    Returns:
        list: Matching column labels, in column order.
    """
    matches = []
    for title in user_movie_df.columns:
        if keyword in title:
            matches.append(title)
    return matches

In [26]:
check_film("Harry", user_movie_df)

['Deconstructing Harry (1997)',
 'Dirty Harry (1971)',
 'Dumb and Dumberer: When Harry Met Lloyd (2003)',
 'Harry Potter and the Chamber of Secrets (2002)',
 'Harry Potter and the Deathly Hallows: Part 1 (2010)',
 'Harry Potter and the Deathly Hallows: Part 2 (2011)',
 'Harry Potter and the Goblet of Fire (2005)',
 'Harry Potter and the Half-Blood Prince (2009)',
 'Harry Potter and the Order of the Phoenix (2007)',
 'Harry Potter and the Prisoner of Azkaban (2004)',
 "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
 'Harry and the Hendersons (1987)',
 'Trouble with Harry, The (1955)',
 'When Harry Met Sally... (1989)',
 "Who's Harry Crumb? (1989)"]

In [27]:
# Reference title chosen from the keyword search results.
movie_name = "Harry Potter and the Chamber of Secrets (2002)"
# NOTE: movie_name is rebound here from the title string to that title's rating column (a Series).
movie_name = user_movie_df[movie_name]
# Correlate every title column with the reference column and show the five highest values.
user_movie_df.corrwith(movie_name).sort_values(ascending=False).head()

title
Harry Potter and the Chamber of Secrets (2002)                                                    1.000000
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)    0.861691
Harry Potter and the Goblet of Fire (2005)                                                        0.777049
Harry Potter and the Prisoner of Azkaban (2004)                                                   0.768082
Harry Potter and the Order of the Phoenix (2007)                                                  0.748770
dtype: float64

# Process Functionalization

In [28]:
def data_prep(data_movie, data_rating, on="movieId"):
    """Read the movie and rating CSVs and left-join ratings onto movies.

    Args:
        data_movie: Path or file-like object passed to ``pd.read_csv`` for the movies table.
        data_rating: Path or file-like object passed to ``pd.read_csv`` for the ratings table.
        on: Join key present in both tables.

    Returns:
        DataFrame: Every row of the movies table, joined with its matching rating
        rows; movies without a match carry NaN in the rating columns.
    """
    # The movies table is read first, then the ratings table, exactly as before.
    return pd.read_csv(data_movie).merge(pd.read_csv(data_rating), how="left", on=on)

In [29]:
df = data_prep("movies.csv", "ratings.csv")

In [30]:
def create_user_movie(dataframe, min_ratings=1000):
    """Build the user-by-title rating matrix, dropping rarely rated titles.

    Args:
        dataframe: Frame with at least the columns ``userId``, ``title`` and ``rating``.
        min_ratings: Titles with fewer than this many rows in ``dataframe`` are
            excluded. Defaults to 1000, the threshold that used to be hard-coded.

    Returns:
        DataFrame: One row per ``userId`` and one column per remaining title;
        cells hold the rating (pivot_table's default mean aggregation) or NaN.
    """
    # Work on the counts Series directly. Wrapping it in a DataFrame and selecting
    # the "title" column breaks on pandas >= 2.0, where the value_counts() result
    # is named "count" rather than after the source column.
    title_counts = dataframe["title"].value_counts()
    rare_movies = title_counts[title_counts < min_ratings].index
    common_movies = dataframe[~dataframe["title"].isin(rare_movies)]
    return common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

In [31]:
user_movie_df = create_user_movie(df)

In [32]:
def item_based_recommender(movie_name, user_movie_df, head=10):
    """Rank the columns of ``user_movie_df`` by correlation with one column.

    Args:
        movie_name: Column label of the reference title in ``user_movie_df``.
        user_movie_df: Rating matrix with one column per title.
        head: Number of top-ranked entries to return.

    Returns:
        Series: Correlation values indexed by column label, sorted in descending
        order and cut to the first ``head`` entries. The reference column is not
        filtered out of the result.
    """
    reference_ratings = user_movie_df[movie_name]
    correlations = user_movie_df.corrwith(reference_ratings)
    ranked = correlations.sort_values(ascending=False)
    return ranked.head(head)

In [33]:
item_based_recommender("City Lights (1931)", user_movie_df)

title
City Lights (1931)                                                                                      1.000000
Modern Times (1936)                                                                                     0.732096
Gold Rush, The (1925)                                                                                   0.712192
General, The (1926)                                                                                     0.675491
Ikiru (1952)                                                                                            0.661034
Kid, The (1921)                                                                                         0.618361
Grand Illusion (La grande illusion) (1937)                                                              0.603178
Paths of Glory (1957)                                                                                   0.596833
Bicycle Thieves (a.k.a. The Bicycle Thief) (a.k.a. The Bicycle Thieves) (Ladri di biciclet