In [1]:
from fastai.collab import *
from fastai.tabular.all import *

In [5]:
path =  Path('../storage/ml-25m')
ratingsDF = pd.read_csv(path/'ratings.csv')
moviesDF = pd.read_csv(path/'movies.csv')

In [6]:
ratingsDF.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [7]:
moviesDF.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
ratings = ratingsDF.merge(moviesDF)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,3,296,5.0,1439474476,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,4,296,4.0,1573938898,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,5,296,4.0,830786155,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,7,296,4.0,835444730,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller


In [14]:
#Create dls
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=512)
dls.show_batch()

Unnamed: 0,userId,title,rating
0,149957,"Great Dictator, The (1940)",5.0
1,138885,Get Shorty (1995),5.0
2,70051,The Disaster Artist (2017),4.0
3,100133,Harry Potter and the Chamber of Secrets (2002),3.0
4,85241,"Matrix, The (1999)",3.0
5,33451,Austin Powers: The Spy Who Shagged Me (1999),4.5
6,61104,Kill Bill: Vol. 2 (2004),3.5
7,96362,Spider-Man: Into the Spider-Verse (2018),4.0
8,53888,Hot Fuzz (2007),4.5
9,69080,Hellboy (2004),2.0


In [15]:
#Kaç tane unique user var:
n_users  = len(dls.classes['userId']) 

#Kaç tane unique movie var:
n_movies = len(dls.classes['title'])

n_users,n_movies

(162542, 58959)

In [16]:
from fastai.callback.fp16 import *
learn = collab_learner(dls, n_factors=100, y_range=(0, 5.5)).to_fp16()

In [17]:
learn.fit_one_cycle(4, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.809509,0.814184,14:11
1,0.786832,0.791488,14:41
2,0.722271,0.722743,14:37
3,0.668152,0.67627,14:59


In [18]:
#Export learner
learn.export("movieRecBig.pkl")

In [None]:
#Load learner from 
#path = Path('./movieRecBig.pkl') #0.6 civarı MSE ile bs=512 ile eğitildi.
#learn = load_learner(path)

In [19]:
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(162542, 100)
  (i_weight): Embedding(58959, 100)
  (u_bias): Embedding(162542, 1)
  (i_bias): Embedding(58959, 1)
)

### High and Low Biased Movies

In [21]:
#High biased movies:
movie_bias = learn.model.i_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

['Shawshank Redemption, The (1994)',
 "Schindler's List (1993)",
 'Usual Suspects, The (1995)',
 'Godfather, The (1972)',
 'Silence of the Lambs, The (1991)']

In [22]:
#Low bias movies
movie_bias = learn.model.i_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=False)[:5]
[dls.classes['title'][i] for i in idxs]

['Glitter (2001)',
 'Battlefield Earth (2000)',
 'Gigli (2003)',
 'Epic Movie (2007)',
 'Disaster Movie (2008)']

### Movie Matching

In [23]:
g = ratings.groupby('title')['rating'].count()
top_movies = g.sort_values(ascending=False).index.values[:1000]
top_idxs = tensor([learn.dls.classes['title'].o2i[m] for m in top_movies])

In [24]:
movie_factors = learn.model.i_weight.weight
movie_factors.shape

torch.Size([58959, 100])

In [158]:
A,B = 10,25
C = 30

In [163]:
#0. filmin embedding'ini elde edelim:
idx = top_idxs[A]
embdA = movie_factors[idx][None]
#embdA.shape
dls.classes['title'][idx]

'Terminator 2: Judgment Day (1991)'

In [164]:
#1. filmin embedding'ini elde edelim:
idx = top_idxs[B]
embdB = movie_factors[idx][None]
#embdB.shape
dls.classes['title'][idx]

'Apollo 13 (1995)'

In [165]:
#Benim istediğim 1 - 0 ilişkisinin  ? - 4  karşılığındaki ?'ni bulmak.
idx = top_idxs[C]
embdC = movie_factors[idx][None]
#embdC.shape
dls.classes['title'][idx]

'Gladiator (2000)'

In [166]:
#İki embedding arasındaki fark vektörü, ve aranan vektör:
differenceVect = torch.subtract(embdB,embdA)
wantedVec = torch.add(embdC,differenceVect)

In [167]:
#Şimdi bu wantedVec'e en yakın film hangisi onu bulalım:
distances = nn.CosineSimilarity(dim=1)(movie_factors, wantedVec)
idx = distances.argsort(descending=True)[0]
dls.classes['title'][idx]

'Beautiful Mind, A (2001)'

In [123]:
distances.sort(descending=True)

torch.return_types.sort(
values=tensor([ 0.7882,  0.7777,  0.7543,  ..., -0.7382, -0.7520, -0.7626],
       device='cuda:0', grad_fn=<SortBackward>),
indices=tensor([ 5405, 38933, 16880,  ..., 25898,  9989, 52178], device='cuda:0'))

In [125]:
idx = distances.argsort(descending=True)[1]
idx

tensor(38933, device='cuda:0')

### Movie Average

In [259]:
A,B = 3,15

idx1 = top_idxs[A]
embdA = movie_factors[idx1][None]

idx2 = top_idxs[B]
embdB = movie_factors[idx2][None]

dls.classes['title'][idx1], dls.classes['title'][idx2]

('Silence of the Lambs, The (1991)',
 'Star Wars: Episode VI - Return of the Jedi (1983)')

In [263]:
#İki embedding arasındaki fark vektörü, ve aranan vektör:
sum = torch.add(0.9*embdA,0.1*embdB)
wantedVec = sum/2

In [266]:
#Şimdi bu wantedVec'e en yakın film hangisi onu bulalım:
distances = nn.CosineSimilarity(dim=1)(movie_factors, wantedVec)
idx = distances.argsort(descending=True)[1]
dls.classes['title'][idx]

'Lethal Admirer (2018)'

In [265]:
distances.sort(descending=True)

torch.return_types.sort(
values=tensor([ 0.9922,  0.8469,  0.8289,  ..., -0.7014, -0.7064, -0.7095],
       device='cuda:0', grad_fn=<SortBackward>),
indices=tensor([42217, 27691,  6789,  ..., 44441, 41035, 42959], device='cuda:0'))

### Test Area

In [256]:
A = torch.tensor([[1., 2.]])
B = torch.tensor([[4., 3.]])
C = torch.tensor([[2., 5.]])
A,B,C

(tensor([[1., 2.]]), tensor([[4., 3.]]), tensor([[2., 5.]]))

In [198]:
diff = torch.subtract(B,A)
wantedVec = torch.add(C,diff)
diff,wantedVec

(tensor([[3., 1.]]), tensor([[5., 6.]]))

In [199]:
factors = torch.tensor([[1., 2.],[4., 3.],[2., 5.],[5., 6.],[6.,6.]])
factors.shape

torch.Size([5, 2])

In [195]:
distances = nn.CosineSimilarity(dim=1)(factors, wantedVec)

In [196]:
distances

tensor([0.9734, 0.9731, 0.9510, 1.0000, 0.9959])

In [207]:
sum = torch.add(A,B)
sum

tensor([[5., 5.]])

In [209]:
avgMovie = sum/2
avgMovie

tensor([[2.5000, 2.5000]])

In [211]:
distances = nn.CosineSimilarity(dim=1)(factors, wantedVec)
distances

tensor([0.9734, 0.9731, 0.9510, 1.0000, 0.9959])