In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

<hr>

### Content Based Filtering

In [2]:
movie = pd.read_csv('movies.csv')
movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


### Cleaning Dataset

- merubah '|'
- merubah 'not genres listed'

In [3]:
movie['genres'] = movie['genres'].replace('(no genres listed)', '')

hsl = []
for i in range(len(movie)):
    hsl.append(movie['genres'][i].replace('|', ' '))
movie['genres'] = hsl

# lowercase the genre
movie['genres'] = movie['genres'].str.lower()
movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),adventure animation children comedy fantasy
1,2,Jumanji (1995),adventure children fantasy
2,3,Grumpier Old Men (1995),comedy romance
3,4,Waiting to Exhale (1995),comedy drama romance
4,5,Father of the Bride Part II (1995),comedy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),animation children comedy
10325,146878,Le Grand Restaurant (1966),comedy
10326,148238,A Very Murray Christmas (2015),comedy
10327,148626,The Big Short (2015),drama


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# CountVectorizer

cv = CountVectorizer()

c = cv.fit_transform(movie['genres'])
# total kata unik di col genres
print(cv.get_feature_names())

# matrix freq tiap kata di tiap data
mf = c.toarray()
print(mf[0])

['action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi', 'film', 'horror', 'imax', 'musical', 'mystery', 'noir', 'romance', 'sci', 'thriller', 'war', 'western']
[0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Cosine similarity dari tiap data di matrix freq
ss = cosine_similarity(mf)
ss

array([[1.        , 0.77459667, 0.31622777, ..., 0.4472136 , 0.        ,
        0.        ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.70710678, 0.        ,
        0.        ],
       ...,
       [0.4472136 , 0.        , 0.70710678, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

#### Joko sangat menyukai film bergenre animasi & action, terutama film Superman vs. The Elite (2012).

In [8]:
# Gunakan cos score sbg data rekomendasi
joko = movie[movie['title'] == 'Superman vs. The Elite (2012)'].index[0]

# # daftar seluruh mobil beserta cos score
similarMovie = list(enumerate(ss[joko]))

# # similarCars  # (index, %similarity)
# similarMovie

In [9]:
# sort berdasarkan %similarity
similarMovie = sorted(similarMovie, key=lambda x: x[1], reverse=True)
similarMovie[:5]

[(6260, 0.9999999999999998),
 (8637, 0.9999999999999998),
 (9370, 0.9999999999999998),
 (9570, 0.9999999999999998),
 (10167, 0.9999999999999998)]

In [10]:
# 5 movie yang mirip

dfSim = []
for i in similarMovie[:6]:
    dfSim.append(movie.iloc[i[0]])

dfSim = pd.DataFrame(dfSim)
dfSim = dfSim.drop(9370)
dfSim

Unnamed: 0,movieId,title,genres
6260,26913,Street Fighter II: The Animated Movie (Sutorît...,action animation
8637,79274,Batman: Under the Red Hood (2010),action animation
9570,99813,"Batman: The Dark Knight Returns, Part 2 (2013)",action animation
10167,124867,Justice League: Throne of Atlantis (2015),action animation
10277,138104,Justice League: Gods and Monsters (2015),action animation


<hr>

### Collaborative Filtering

In [11]:
ratings = pd.read_csv('ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898


In [12]:
ratings[ratings['movieId'] == 95816]

Unnamed: 0,userId,movieId,rating,timestamp
104869,668,95816,2.5,1351303077


In [13]:
ratings['rating'].max()

5.0

### Widodo sangat menyukai film drama komedi, salah satunya bertajuk Being Flynn (2012).

In [14]:
data = pd.merge(ratings, movie, on='movieId')
data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),crime drama
1,9,16,4.0,842686699,Casino (1995),crime drama
2,12,16,1.5,1144396284,Casino (1995),crime drama
3,24,16,4.0,963468757,Casino (1995),crime drama
4,29,16,3.0,836820223,Casino (1995),crime drama
...,...,...,...,...,...,...
105334,668,140098,2.5,1450415424,Runoff (2015),drama
105335,668,140816,2.5,1443288791,Tangerine (2015),comedy drama
105336,668,141472,2.5,1442679119,The 50 Year Argument (2014),
105337,668,142488,4.0,1451535844,Spotlight (2015),thriller


### Memuat Pivot Table

- Untuk mengetahui korelasi antar setiap data userId dengan rating

In [15]:
dfR = ratings[ratings.columns[:3]].pivot_table(
    index = ratings['userId'],
    columns = ratings['movieId']
)
dfR.head()

Unnamed: 0_level_0,movieId,movieId,movieId,movieId,movieId,movieId,movieId,movieId,movieId,movieId,...,userId,userId,userId,userId,userId,userId,userId,userId,userId,userId
movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,,,...,,,,,,,,,,
2,1.0,,3.0,,5.0,,,,,,...,,,,,,,,,,
3,,,,,5.0,,7.0,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.0,,,,,,,,,,...,,,,,,,,,,


In [None]:
dfR = dfR.replace(np.NaN, 0)
dfRcor = dfR.corr().loc['rating']
# dfRcor = dfRcor.fillna(0)
dfRcor

### Widodo sangat menyukai film drama komedi, salah satunya bertajuk Being Flynn (2012).

In [None]:
# Movie yang disukai Widodo
idxWidodo = int(data[data['title'] == 'Being Flynn (2012)']['movieId'])
idxWidodo
# Consider 0 sebagai menyukai tanpa memberi rating
widodo = [(idxWidodo, 5)]
widodo

In [None]:
# Similarity score berdasarkan correlation matrix

similarR = pd.DataFrame()
for judul, rating in widodo:
#     print(judul,rating)
    Rskor = dfRcor.iloc[judul] * rating
    Rskor = Rskor.sort_values(ascending=False)
#     print(Rskor)
    similarR = similarR.append(Rskor)
similarR

In [None]:
hslR = similarR.sum().sort_values(ascending=False)
hslR[:5]

In [None]:
hasil = []
for i in hslR[:5]:
    hasil.append(movie.iloc[i[0]])

hasil = pd.DataFrame(hasil)
hasil