In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from pandas.api.types import CategoricalDtype 
from implicit.als import AlternatingLeastSquares
from sklearn.model_selection import train_test_split, ParameterGrid


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder


import random

import torch
import torch.nn as nn
import torch.optim as optim

import joblib


In [2]:
ratings = pd.read_csv('ratings.csv')
books = pd.read_csv('books.csv')

# Data Preparation

In [3]:
ratings['binary_rating'] = (ratings.rating > 3).astype(int)
ratings.head(5)

Unnamed: 0,user_id,book_id,rating,binary_rating
0,1,258,5,1
1,2,4081,4,1
2,2,260,5,1
3,2,9296,5,1
4,2,2318,3,0


In [4]:
ratings['book_order'] = (ratings.groupby('user_id').cumcount() + 1) / ratings.groupby('user_id')['book_id'].transform('count')
ratings.head(5)

Unnamed: 0,user_id,book_id,rating,binary_rating,book_order
0,1,258,5,1,0.008547
1,2,4081,4,1,0.015385
2,2,260,5,1,0.030769
3,2,9296,5,1,0.046154
4,2,2318,3,0,0.061538


In [5]:
train_size = 0.7
train_data = ratings[ratings['book_order'] <= train_size]
test_data = ratings[ratings['book_order'] > train_size]

In [6]:
user_ind = train_data.user_id.unique()
book_ind = train_data.book_id.unique()

rows = train_data['user_id'].astype(CategoricalDtype(categories=user_ind)).cat.codes
cols = train_data['book_id'].astype(CategoricalDtype(categories=book_ind)).cat.codes

matrix = sparse.csr_matrix((train_data.binary_rating, (rows,cols)), shape=(len(user_ind), len(book_ind)))

# Base Model

In [7]:
def AP_k(rec, positive, k=10):
    rec = rec[:k]

    count = 0
    prec_sum = 0.0

    for i, book in enumerate(rec):
        if book in positive:
            count += 1
            prec_sum += count / (i+1)

    return prec_sum / min(len(positive), k) if positive else 0  

In [8]:
base_model = AlternatingLeastSquares()
base_model.fit(sparse.csr_matrix(matrix))

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [9]:
random_users = random.sample(list(user_ind), 500)

ap_scores_als = []
for user in random_users:
    positive = test_data.loc[(test_data['user_id'] == user) & (test_data['binary_rating'] == 1)].book_id.to_list()
    if len(positive) == 0:
            continue
    recs = list(base_model.recommend(user, sparse.csr_matrix(matrix[user]), N=15)[0])
    filt_recs = [book for book in recs if book not in train_data[train_data["user_id"]== user].book_id.values]
    
    ap_score = AP_k(filt_recs, positive, 10)
    ap_scores_als.append(ap_score)
print(f'AP@10 для ALS: {sum(ap_scores_als)/len(ap_scores_als)}')

AP@10 для ALS: 0.0024553968253968254


### Feature engeneering

#### Genres

In [10]:
genres = pd.read_json('goodreads_book_genres_initial.json', lines=True)
genres = genres[genres['book_id'].isin(books['goodreads_book_id'])]
genres.shape

(9996, 2)

In [11]:
genres_expanded = genres['genres'].apply(pd.Series)

In [12]:
genres_expanded = genres_expanded.fillna(0)
row_sums = genres_expanded.sum(axis=1)
genres_expanded = genres_expanded.div(row_sums, axis=0)

In [13]:
genres = pd.concat([genres.drop(columns=['genres']), genres_expanded], axis=1)
genres.head(5)

Unnamed: 0,book_id,fiction,romance,"mystery, thriller, crime",non-fiction,"history, historical fiction, biography","comics, graphic","fantasy, paranormal",young-adult,children,poetry
3,6066819,0.943878,0.039116,0.017007,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,89375,0.021798,0.0,0.0,0.72752,0.242507,0.008174,0.0,0.0,0.0,0.0
583,54270,0.0,0.0,0.0,0.332524,0.657767,0.009709,0.0,0.0,0.0,0.0
807,38568,0.022042,0.445871,0.0,0.0,0.0,0.0,0.532087,0.0,0.0,0.0
816,38562,0.021429,0.457143,0.010204,0.0,0.0,0.0,0.511224,0.0,0.0,0.0


In [14]:
genres = genres.fillna(0)

In [15]:
books = books.merge(genres, left_on='goodreads_book_id', right_on='book_id', how='left').fillna(0)

In [16]:
books = books.drop(columns=['goodreads_book_id', 'book_id_y','best_book_id', 'work_id', 'isbn', 'isbn13', 'image_url', 'small_image_url'])
books.head(5)

Unnamed: 0,book_id_x,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,...,fiction,romance,"mystery, thriller, crime",non-fiction,"history, historical fiction, biography","comics, graphic","fantasy, paranormal",young-adult,children,poetry
0,1,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,...,0.354124,0.045639,0.010811,0.0,0.0,0.0,0.183215,0.406212,0.0,0.0
1,2,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,...,0.153051,0.0,0.006809,0.0,0.0,0.0,0.551987,0.173864,0.114289,0.0
2,3,226,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,...,0.148521,0.043887,0.0,0.0,0.0,0.0,0.455153,0.352439,0.0,0.0
3,4,487,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,...,0.477807,0.0,0.030489,0.0,0.359405,0.0,0.0,0.132299,0.0,0.0
4,5,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,...,0.778033,0.079932,0.0,0.0,0.126989,0.0,0.0,0.015046,0.0,0.0


In [17]:
books['title'] = books['original_title'].fillna(books['title'])
books = books.drop(columns=['original_title'])

In [18]:
def preprocess_authors(authors):
    return authors.split(',')[0]  # take first author

books['authors'] = books['authors'].apply(preprocess_authors)

#### users

In [19]:
train_data = train_data.merge(books, left_on='book_id', right_on='book_id_x', how='left')

In [20]:
for genre in list(genres.columns[1:]):
    print(genre)

fiction
romance
mystery, thriller, crime
non-fiction
history, historical fiction, biography
comics, graphic
fantasy, paranormal
young-adult
children
poetry


In [21]:
def get_all_user_features():
    user_features = []

    for user_id in user_ind:
        user_data = train_data[train_data['user_id'] == user_id]
        
        if user_data.empty:
            continue

        total_books_read = user_data.shape[0]

        liked_books_count = user_data[user_data['binary_rating'] == 1].shape[0]

        average_rating_user = user_data['rating'].mean()

        genre_means = user_data[list(genres.columns[1:])].mean()

        liked_genres = user_data[user_data['binary_rating'] == 1][list(genres.columns[1:])].mean().fillna(0)

        favorite_authors = user_data[user_data['binary_rating'] == 1]['authors'].value_counts()
        favorite_author = favorite_authors.idxmax() if not favorite_authors.empty else 'unk'
        
        average_publication_year_all = user_data['original_publication_year'].mean()


        user_feature = {
            'user_id': user_id,
            'total_books_read': total_books_read,
            'liked_books_count': liked_books_count,
            'average_rating_user': average_rating_user,
            'favorite_author': favorite_author,
            'average_publication_year_all': average_publication_year_all
        }

        for genre in genre_means.index:
            user_feature[f'average_{genre}'] = genre_means[genre]
            user_feature[f'liked_{genre}'] = liked_genres[genre]

        user_features.append(user_feature)

    return pd.DataFrame(user_features)

all_user_features = get_all_user_features()

In [22]:
all_user_features.head(5)

Unnamed: 0,user_id,total_books_read,liked_books_count,average_rating_user,favorite_author,average_publication_year_all,average_fiction,liked_fiction,average_romance,liked_romance,...,"average_comics, graphic","liked_comics, graphic","average_fantasy, paranormal","liked_fantasy, paranormal",average_young-adult,liked_young-adult,average_children,liked_children,average_poetry,liked_poetry
0,1,81,36,3.444444,John Steinbeck,1940.135802,0.627707,0.599762,0.025698,0.026672,...,0.000573,0.0005,0.038389,0.053921,0.021887,0.026274,0.027876,0.037618,0.003087,0.004667
1,2,45,37,4.333333,Malcolm Gladwell,1975.466667,0.241727,0.25237,0.010829,0.007728,...,0.000957,0.001164,0.031298,0.038066,0.011065,0.013457,0.003346,0.004069,0.010758,0.013084
2,4,93,67,3.913978,J.K. Rowling,1968.870968,0.552889,0.50863,0.008221,0.008067,...,0.00504,0.006893,0.080346,0.105528,0.043641,0.057772,0.073061,0.101131,0.011223,0.015578
3,6,63,54,4.349206,Bill Bryson,1996.063492,0.187904,0.187118,0.009077,0.010431,...,0.021877,0.025031,0.01241,0.011584,0.004593,0.002014,0.00029,0.000215,0.000727,0.000849
4,8,68,49,3.911765,Fyodor Dostoyevsky,1780.602941,0.647859,0.671053,0.021754,0.020504,...,0.000259,0.00036,0.054887,0.050087,0.006405,0.00297,0.008041,0.011159,0.020372,0.027705


#### books

In [23]:
books['title'] = books['title'].astype(str)

In [24]:
books.loc[books['language_code'] == 0, 'language_code'] = 'unk'

In [25]:
tfidf_title = TfidfVectorizer(max_features=60)
title_vectors = tfidf_title.fit_transform(books['title']).toarray()

In [26]:
categorical_features = books[['language_code']]

In [27]:
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical = encoder.fit_transform(categorical_features)

In [28]:
all_features = np.hstack([title_vectors, encoded_categorical])
all_features_books = pd.DataFrame(all_features)

books_features = pd.concat([books.reset_index(drop=True), all_features_books.reset_index(drop=True)], axis=1)

books_features = books_features.drop(columns=['books_count', 'work_text_reviews_count', 'title', 'language_code', 'ratings_1', 'ratings_2','ratings_3', 'ratings_4', 'ratings_5'])

In [29]:
books_features.head(5)

Unnamed: 0,book_id_x,authors,original_publication_year,average_rating,ratings_count,work_ratings_count,fiction,romance,"mystery, thriller, crime",non-fiction,...,76,77,78,79,80,81,82,83,84,85
0,1,Suzanne Collins,2008.0,4.34,4780653,4942365,0.354124,0.045639,0.010811,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,J.K. Rowling,1997.0,4.44,4602479,4800065,0.153051,0.0,0.006809,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Stephenie Meyer,2005.0,3.57,3866839,3916824,0.148521,0.043887,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Harper Lee,1960.0,4.25,3198671,3340896,0.477807,0.0,0.030489,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,F. Scott Fitzgerald,1925.0,3.89,2683664,2773745,0.778033,0.079932,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Author_coding

In [30]:
missing_authors = set(all_user_features['favorite_author']) - set(books_features['authors'])
print("Неизвестные авторы:", missing_authors)

Неизвестные авторы: {'unk'}


In [31]:
author_enc = LabelEncoder()
books_features['authors'] = author_enc.fit_transform(books_features['authors'])
fav_author_enc = LabelEncoder()
all_user_features['favorite_author'] = fav_author_enc.fit_transform(all_user_features['favorite_author'])

#### Normalisation

In [32]:
scaler_books = StandardScaler()
columns_to_normalize = ['original_publication_year', 'average_rating', 'ratings_count', 'work_ratings_count']
books_features[columns_to_normalize] = scaler_books.fit_transform(books_features[columns_to_normalize])
books_features.head(5)

Unnamed: 0,book_id_x,authors,original_publication_year,average_rating,ratings_count,work_ratings_count,fiction,romance,"mystery, thriller, crime",non-fiction,...,76,77,78,79,80,81,82,83,84,85
0,1,3526,0.170121,1.327789,30.036789,29.098997,0.354124,0.045639,0.010811,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1529,0.108104,1.720847,28.904534,28.25094,0.153051,0.0,0.006809,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,3464,0.153207,-1.698765,24.22971,22.987143,0.148521,0.043887,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,1365,-0.100498,0.974035,19.983656,19.55482,0.477807,0.0,0.030489,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,1146,-0.297825,-0.440977,16.710904,16.174804,0.778033,0.079932,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
scaler_users = StandardScaler()
columns_to_normalize_u = ['total_books_read', 'liked_books_count', 'average_rating_user', 'average_publication_year_all']
all_user_features[columns_to_normalize_u] = scaler_users.fit_transform(all_user_features[columns_to_normalize_u])
all_user_features.head(5)

Unnamed: 0,user_id,total_books_read,liked_books_count,average_rating_user,favorite_author,average_publication_year_all,average_fiction,liked_fiction,average_romance,liked_romance,...,"average_comics, graphic","liked_comics, graphic","average_fantasy, paranormal","liked_fantasy, paranormal",average_young-adult,liked_young-adult,average_children,liked_children,average_poetry,liked_poetry
0,1,0.172076,-0.966444,-1.06932,541,-0.457629,0.627707,0.599762,0.025698,0.026672,...,0.000573,0.0005,0.038389,0.053921,0.021887,0.026274,0.027876,0.037618,0.003087,0.004667
1,2,-1.800303,-0.912834,0.839911,704,0.230944,0.241727,0.25237,0.010829,0.007728,...,0.000957,0.001164,0.031298,0.038066,0.011065,0.013457,0.003346,0.004069,0.010758,0.013084
2,4,0.829535,0.695445,-0.060815,431,0.102398,0.552889,0.50863,0.008221,0.008067,...,0.00504,0.006893,0.080346,0.105528,0.043641,0.057772,0.073061,0.101131,0.011223,0.015578
3,6,-0.814113,-0.001476,0.874004,117,0.632361,0.187904,0.187118,0.009077,0.010431,...,0.021877,0.025031,0.01241,0.011584,0.004593,0.002014,0.00029,0.000215,0.000727,0.000849
4,8,-0.540172,-0.269523,-0.06557,339,-3.566807,0.647859,0.671053,0.021754,0.020504,...,0.000259,0.00036,0.054887,0.050087,0.006405,0.00297,0.008041,0.011159,0.020372,0.027705


#### Final_merging

In [34]:
X_train = train_data.drop(columns=['binary_rating'])
y_train = train_data['binary_rating']

In [35]:
X_train = X_train[['user_id', 'book_id']].merge(all_user_features, on='user_id', how='left')

In [36]:
X_train = X_train.merge(books_features, left_on='book_id', right_on='book_id_x', how='left')

In [37]:
f_cols = (all_user_features.columns.tolist() + books_features.columns.tolist())
f_cols.remove('book_id_x')
f_cols.remove('user_id')
X_train = X_train[f_cols]

In [38]:
n_c = 0
for u in X_train.columns:
    n_c += 1
    print(n_c, u)

1 total_books_read
2 liked_books_count
3 average_rating_user
4 favorite_author
5 average_publication_year_all
6 average_fiction
7 liked_fiction
8 average_romance
9 liked_romance
10 average_mystery, thriller, crime
11 liked_mystery, thriller, crime
12 average_non-fiction
13 liked_non-fiction
14 average_history, historical fiction, biography
15 liked_history, historical fiction, biography
16 average_comics, graphic
17 liked_comics, graphic
18 average_fantasy, paranormal
19 liked_fantasy, paranormal
20 average_young-adult
21 liked_young-adult
22 average_children
23 liked_children
24 average_poetry
25 liked_poetry
26 authors
27 original_publication_year
28 average_rating
29 ratings_count
30 work_ratings_count
31 fiction
32 romance
33 mystery, thriller, crime
34 non-fiction
35 history, historical fiction, biography
36 comics, graphic
37 fantasy, paranormal
38 young-adult
39 children
40 poetry
41 0
42 1
43 2
44 3
45 4
46 5
47 6
48 7
49 8
50 9
51 10
52 11
53 12
54 13
55 14
56 15
57 16
58 17
5

In [39]:
X_train.head(5)

Unnamed: 0,total_books_read,liked_books_count,average_rating_user,favorite_author,average_publication_year_all,average_fiction,liked_fiction,average_romance,liked_romance,"average_mystery, thriller, crime",...,76,77,78,79,80,81,82,83,84,85
0,0.172076,-0.966444,-1.06932,541,-0.457629,0.627707,0.599762,0.025698,0.026672,0.046846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.800303,-0.912834,0.839911,704,0.230944,0.241727,0.25237,0.010829,0.007728,0.014168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.800303,-0.912834,0.839911,704,0.230944,0.241727,0.25237,0.010829,0.007728,0.014168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.800303,-0.912834,0.839911,704,0.230944,0.241727,0.25237,0.010829,0.007728,0.014168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.800303,-0.912834,0.839911,704,0.230944,0.241727,0.25237,0.010829,0.007728,0.014168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modeling

In [40]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

In [41]:
class BookClassifier(nn.Module):
    def __init__(self, num_authors, num_favorite_authors, num_other_features, embedding_dim=10):
        super(BookClassifier, self).__init__()
        self.author_embedding = nn.Embedding(num_authors, embedding_dim)
        self.favorite_author_embedding = nn.Embedding(num_favorite_authors, embedding_dim)

        self.fc1 = nn.Linear(embedding_dim * 2 + num_other_features, 64)
        self.bn1 = nn.BatchNorm1d(64)  
        self.dropout = nn.Dropout(0.5) 
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        author_embeds = self.author_embedding(x[:, 25].long())
        favorite_author_embeds = self.favorite_author_embedding(x[:, 3].long())
        
        combined = torch.cat((author_embeds, favorite_author_embeds, x[:, :3], x[:, 4:25], x[:, 26:]), dim=1)
        x = torch.relu(self.bn1(self.fc1(combined)))  
        x = self.dropout(x) 
        x = self.fc2(x)
        return self.sigmoid(x)


In [42]:
num_authors = len(X_train['authors'].unique())
num_favorite_authors = len(X_train['favorite_author'].unique())
num_other_features = X_train.shape[1] - 2
print(f'In column authors {num_authors} unique values')
print(f'In column favorite_authors {num_favorite_authors} unique values')
print(f'Another features: {num_other_features}')

In column authors 3888 unique values
In column favorite_authors 1148 unique values
Another features: 124


In [43]:
model = BookClassifier(num_authors, num_favorite_authors, num_other_features)

In [44]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [45]:
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor).squeeze()
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/50], Loss: 0.7028
Epoch [2/50], Loss: 0.6928
Epoch [3/50], Loss: 0.6832
Epoch [4/50], Loss: 0.6747
Epoch [5/50], Loss: 0.6664
Epoch [6/50], Loss: 0.6585
Epoch [7/50], Loss: 0.6512
Epoch [8/50], Loss: 0.6443
Epoch [9/50], Loss: 0.6375
Epoch [10/50], Loss: 0.6314
Epoch [11/50], Loss: 0.6253
Epoch [12/50], Loss: 0.6197
Epoch [13/50], Loss: 0.6142
Epoch [14/50], Loss: 0.6091
Epoch [15/50], Loss: 0.6040
Epoch [16/50], Loss: 0.5997
Epoch [17/50], Loss: 0.5951
Epoch [18/50], Loss: 0.5907
Epoch [19/50], Loss: 0.5869
Epoch [20/50], Loss: 0.5831
Epoch [21/50], Loss: 0.5797
Epoch [22/50], Loss: 0.5763
Epoch [23/50], Loss: 0.5728
Epoch [24/50], Loss: 0.5698
Epoch [25/50], Loss: 0.5668
Epoch [26/50], Loss: 0.5639
Epoch [27/50], Loss: 0.5613
Epoch [28/50], Loss: 0.5586
Epoch [29/50], Loss: 0.5562
Epoch [30/50], Loss: 0.5538
Epoch [31/50], Loss: 0.5516
Epoch [32/50], Loss: 0.5496
Epoch [33/50], Loss: 0.5476
Epoch [34/50], Loss: 0.5455
Epoch [35/50], Loss: 0.5439
Epoch [36/50], Loss: 0.5420
E

### Classifier Evaluation

In [46]:
X_test = test_data.drop(columns=['binary_rating'])
y_test = test_data['binary_rating']

In [47]:
X_test = X_test[['user_id', 'book_id']].merge(all_user_features, on='user_id', how='left')
X_test = X_test.merge(books_features, left_on='book_id', right_on='book_id_x', how='left')

In [48]:
X_test = X_test[f_cols]
X_test.head(5)

Unnamed: 0,total_books_read,liked_books_count,average_rating_user,favorite_author,average_publication_year_all,average_fiction,liked_fiction,average_romance,liked_romance,"average_mystery, thriller, crime",...,76,77,78,79,80,81,82,83,84,85
0,-0.26623,0.26657,0.653565,431,-0.297687,0.635689,0.615566,0.043596,0.050119,0.031475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.26623,0.26657,0.653565,431,-0.297687,0.635689,0.615566,0.043596,0.050119,0.031475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.26623,0.26657,0.653565,431,-0.297687,0.635689,0.615566,0.043596,0.050119,0.031475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.26623,0.26657,0.653565,431,-0.297687,0.635689,0.615566,0.043596,0.050119,0.031475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.26623,0.26657,0.653565,431,-0.297687,0.635689,0.615566,0.043596,0.050119,0.031475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [50]:
model.eval()
with torch.inference_mode():
    test_outputs = model(X_test_tensor).squeeze()
    predicted = (test_outputs >= 0.5).float()
    accuracy = (predicted == y_test_tensor).float().mean()
    print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.7175


### Combine

In [51]:
ap_scores_comb = []
for user in random_users:
    positive = test_data.loc[(test_data['user_id'] == user) & (test_data['binary_rating'] == 1)].book_id.to_list()
    if len(positive) == 0:
            continue
    recs = list(base_model.recommend(user, sparse.csr_matrix(matrix[user]), N=30)[0])
    filt_recs = [book for book in recs if book not in train_data[train_data["user_id"]== user].book_id.values]
    recom = pd.DataFrame({'user_id': user, 'book_id': filt_recs})
    recom = recom.merge(all_user_features, on='user_id', how='left')
    recom = recom.merge(books_features, left_on='book_id', right_on='book_id_x', how='left')
    recom = recom[f_cols]
    recom_tensor = torch.tensor(recom.values, dtype=torch.float32)
    with torch.inference_mode():
        recom_outputs = model(recom_tensor).squeeze()
    
    combined = list(zip(filt_recs, recom_outputs.tolist()))
    combined = sorted(combined, key=lambda x: x[1], reverse=True)
    sorted_filt_recs = [book_id for book_id, _ in combined]

    ap_score = AP_k(sorted_filt_recs, positive, 10)
    ap_scores_comb.append(ap_score)
print(f'AP@10 для combination method: {sum(ap_scores_comb)/len(ap_scores_comb)}')

AP@10 для combination method: 0.003999920634920634


In [52]:
print(f'AP@10 для ALS: {sum(ap_scores_als)/len(ap_scores_als)}')

AP@10 для ALS: 0.0024553968253968254


### Сохранение признаков и классификатора

In [113]:
torch.save(model.state_dict(), 'classifier.pth')

In [110]:
joblib.dump(all_user_features, 'user_features.joblib')
joblib.dump(books_features, 'book_features.joblib')
joblib.dump(matrix, 'user_item_matrix.joblib')
joblib.dump(base_model, 'als_model.joblib')


['als_model.joblib']

In [56]:
top_books = ratings['book_id'].value_counts().nlargest(40).index.tolist()
#joblib.dump(top_books, 'top_books.joblib')

In [112]:
joblib.dump(f_cols, 'f_cols.joblib')

['f_cols.joblib']

### Conversion for baseline model(1000 users)

In [53]:
random_users = train_data['user_id'].drop_duplicates().sample(1000, random_state=42).tolist()

In [54]:
def calculate_conversion(user_ids):
    count_pos = 0
    for user in user_ids:
        readed = train_data[train_data['user_id'] == user].book_id.unique()
        recs = [book for book in top_books if book not in readed][:10] 
        count_pos += test_data[(test_data['user_id'] == user) & (test_data['binary_rating'] == 1)]['book_id'].isin(recs).sum()
        
    
    return count_pos/(10*len(user_ids))


In [57]:
conversion_rate = calculate_conversion(random_users)

In [58]:
print(f'Conversion Rate for Baseline Model: {conversion_rate}')

Conversion Rate for Baseline Model: 0.0704


### Предположите, какой прирост метрики вы можете ожидать от своей модели машинного обучения. Определитесь с уровнем значимости и мощностью. Посчитайте, сколько пользователей вам необходимо для проведения A/B-теста, с помощью калькулятора.

Допустим, хотим увидеть улучшение на 4%(abs). Тогда мин.размер выборок = 744

### A-A test

In [59]:
all_users = train_data['user_id'].drop_duplicates().sample(n=744*2, random_state=42).tolist()
group_a = all_users[:744]
group_b = all_users[744:]

# Подсчет конверсий в группах
conversion_a = calculate_conversion(group_a)
conversion_b = calculate_conversion(group_b)

print(f'Conversion Rate for Group A: {conversion_a}')
print(f'Conversion Rate for Group B: {conversion_b}')

Conversion Rate for Group A: 0.0668010752688172
Conversion Rate for Group B: 0.07231182795698925


In [60]:
from statsmodels.stats.proportion import test_proportions_2indep

In [69]:
pos1 = int(conversion_a*744)
pos2 = int(conversion_b*744)

In [74]:
test_proportions_2indep(count1=pos1, nobs1=744, count2=pos2, nobs2=744)

<class 'statsmodels.stats.base.HolderTuple'>
statistic = -0.4066839847225056
pvalue = 0.6842401046014359
compare = 'diff'
method = 'agresti-caffo'
diff = -0.0053763440860215145
ratio = 0.9245283018867924
odds_ratio = 0.9192072756888827
variance = 0.00017383138378896035
alternative = 'two-sided'
value = 0
tuple = (-0.4066839847225056, 0.6842401046014359)

p-value>0.05

We do not reject the null hypothesis. There is not enough evidence to conclude that the conversion rates between the two groups are significantly different.


## A/B тест

In [75]:
np.random.shuffle(all_users)
baseline_users = all_users[:744]
model_users = all_users[744:]

In [76]:
baseline_conversion = calculate_conversion(baseline_users)
print(f"baseline_conversion: {baseline_conversion}")

baseline_conversion: 0.0614247311827957


In [90]:
def calculate_my_model_conversion(user_ids):
    count_pos = 0
    for user in user_ids:        
        positive = test_data.loc[(test_data['user_id'] == user) & (test_data['binary_rating'] == 1)].book_id.to_list()
        if len(positive) == 0:
            continue
            
        # Генерируем рекомендации
        recs = list(base_model.recommend(user, sparse.csr_matrix(matrix[user]), N=30)[0])
        filt_recs = [book for book in recs if book not in train_data[train_data["user_id"]== user].book_id.values]

        if len(filt_recs) == 0:
            continue

        recom = pd.DataFrame({'user_id': user, 'book_id': filt_recs})
        recom = recom.merge(all_user_features, on='user_id', how='left')
        recom = recom.merge(books_features, left_on='book_id', right_on='book_id_x', how='left')
        recom = recom[f_cols]
        recom_tensor = torch.tensor(recom.values, dtype=torch.float32)
        with torch.inference_mode():
            recom_outputs = model(recom_tensor).squeeze()

        combined = list(zip(filt_recs, recom_outputs.tolist()))
        combined = sorted(combined, key=lambda x: x[1], reverse=True)
        sorted_filt_recs = [book_id for book_id, _ in combined][:10]

        # Подсчет положительных конверсий
        count_pos += len(set(positive) & set(sorted_filt_recs))
    return count_pos / (10 * len(user_ids))  # 10 рекомендаций на пользователя
    

In [92]:
model_conv = calculate_my_model_conversion(model_users)

In [93]:
print(f'model conversion: {model_conv}')

baseline_conversion: 0.013844086021505376


In [94]:
test_proportions_2indep(count1=int(744*baseline_conversion), nobs1=744, count2=int(744*model_conv), nobs2=744)

<class 'statsmodels.stats.base.HolderTuple'>
statistic = 4.762837606696996
pvalue = 1.9088945751073313e-06
compare = 'diff'
method = 'agresti-caffo'
diff = 0.04704301075268817
ratio = 4.5
odds_ratio = 4.725321888412018
variance = 9.703465934954921e-05
alternative = 'two-sided'
value = 0
tuple = (4.762837606696996, 1.9088945751073313e-06)

The p-value is very low, which leads us to reject the null hypothesis. There is a significant difference in conversion rates between the two models. It's important to note that the chance of reading a book from the highly-rated recommendations for the user is initially higher.