In [1]:
#Loading files

# train.csv has one row per record with a 'buyer' id and an 'items' column:
# a space-separated string of the item ids associated with that buyer.

import random
import pandas as pd
import numpy as np

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
import implicit

from tqdm import tqdm

# Each row of train.csv holds a 'buyer' id and a space-separated string of
# item ids in 'items'.
train = pd.read_csv('task2_data/train.csv')

# Flatten every row into one [buyer, product, rating] record per item;
# every listed item gets an implicit rating of 1.
table = []
for user, basket in tqdm(zip(train['buyer'], train['items']), total=len(train.index)):
    table.extend([int(user), int(item), 1] for item in basket.split(' '))

raw_data = pd.DataFrame.from_records(table, columns=['buyer', 'product', 'rating'])
raw_data.head()

100%|██████████| 65950/65950 [00:04<00:00, 15094.92it/s]


Unnamed: 0,buyer,product,rating
0,4,101,1
1,4,1933,1
2,4,1828,1
3,4,1135,1
4,4,1367,1


In [2]:
#Preparing matrix
# .copy() makes `data` an independent frame, so adding a column below cannot
# raise SettingWithCopyWarning or write into a view of raw_data.
data = raw_data.dropna().copy()

# Dense integer code per distinct buyer; this code is the buyer's index
# in the sparse matrices built below.
data['buyer_id'] = data["buyer"].astype("category").cat.codes

# NOTE: despite its name, item_lookup maps buyer_id (string index) back to the
# original buyer id. The name is kept because later cells refer to it.
# Built as a single chain so no intermediate slice is mutated in place.
item_lookup = (
    data[['buyer_id', 'buyer']]
    .drop_duplicates()
    .assign(buyer_id=lambda frame: frame['buyer_id'].astype(str))
    .set_index("buyer_id")
)

data = data.drop(["buyer"], axis=1)

# Item x user matrix (rows: product ids, columns: buyer_id codes). Duplicate
# (product, buyer) pairs are summed by the COO-style constructor.
sparse_item_user = sparse.csr_matrix(
    (data['rating'].astype(float).values,
     (data['product'].values, data['buyer_id'].values))
)
# Derive the user x item matrix as the transpose so the two views can never
# disagree in shape or content.
sparse_user_item = sparse_item_user.T.tocsr()

item_lookup

Unnamed: 0_level_0,buyer
buyer_id,Unnamed: 1_level_1
0,4
1,11
2,34
3,37
4,44
5,95
6,119
7,121
8,134
9,159


In [3]:
# user_interactions = sparse_user_item[0,:].toarray()
# print(user_interactions[0][:120])

# user_interactions = user_interactions.reshape(-1) + 1 #Reshape to turn into 1D array
# user_interactions[user_interactions > 1] = 1
# print(user_interactions[:120])

In [4]:
# Set up the ALS model: 20 latent factors, regularization 0.1, 200 iterations.
model = implicit.als.AlternatingLeastSquares(
    factors=20,
    regularization=0.1,
    iterations=200,
)

# Turn the raw interaction values into confidence weights by scaling with alpha.
alpha_val = 15
data_conf = (alpha_val * sparse_item_user).astype('double')

# Train on the confidence-weighted item x user matrix.
model.fit(data_conf)

100%|██████████| 200.0/200 [00:57<00:00,  3.57it/s]


In [29]:
def recommend(user_id, data_sparse, user_vecs, item_vecs, num_items=100, consumed_boost=3.5):
    """Rank items for one user from trained latent factors.

    The score of every item is the dot product of the user's factor vector
    with the item's factor vector, min-max scaled to [0, 1]. Items the user
    has already interacted with are NOT filtered out: their scaled score is
    multiplied by ``consumed_boost``, which pushes them up the ranking.

    Args:
        user_id (int): Row index of the user in ``data_sparse`` and ``user_vecs``.

        data_sparse (csr_matrix): User x item interaction matrix (training data).

        user_vecs (csr_matrix): The trained user x features vectors.

        item_vecs (csr_matrix): The trained item x features vectors.

        num_items (int): How many recommendations to return.

        consumed_boost (float): Multiplier applied to the scaled score of items
            with a positive interaction value. The default of 3.5 reproduces
            the previously hard-coded behaviour.

    Returns:
        recommendations (pandas.DataFrame): Columns ``product`` (item column
            index in ``data_sparse``) and ``score``, ordered by descending
            score, with at most ``num_items`` rows.

    """
    # Per-item weights: 1 for items without interactions, consumed_boost for
    # items the user interacted with. The explicit float cast prevents the
    # boost from being truncated if data_sparse has an integer dtype.
    weights = data_sparse[user_id, :].toarray().reshape(-1).astype(float) + 1
    weights[weights > 1] = consumed_boost

    # Raw preference scores: user factors dotted with every item's factors.
    rec_vector = user_vecs[user_id, :].dot(item_vecs.T).toarray()

    # Scale the scores to [0, 1] so the boost acts on a comparable range.
    rec_vector_scaled = MinMaxScaler().fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    recommend_vector = weights * rec_vector_scaled

    # Item indices ordered by descending weighted score, truncated to num_items.
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    # Fancy indexing replaces the former element-by-element list building.
    recommendations = pd.DataFrame({'product': item_idx,
                                    'score': recommend_vector[item_idx]})

    return recommendations



# recommend() works on CSR matrices, so wrap the factor matrices exposed by
# the trained model accordingly.
user_vecs, item_vecs = (
    sparse.csr_matrix(factors)
    for factors in (model.user_factors, model.item_factors)
)

# Sanity check: top recommendations for the buyer with internal id 0.
user_id = 0
recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs)
recommendations.head()

Unnamed: 0,product,score
0,101,3.5
1,1135,2.957321
2,429,2.53303
3,1874,2.501538
4,1640,2.381557


In [30]:
# Build the submission: one row per buyer with a space-separated, ranked list
# of recommended product ids.
submition_buyers = []
submition_products = []

# Iterate over the internal buyer ids (rows of the user x item matrix).
# train.index only coincides with those ids when train has exactly one row
# per distinct buyer; otherwise the lookups below would go out of range.
for person in tqdm(range(sparse_user_item.shape[0])):
    recommendations = list(recommend(person, sparse_user_item, user_vecs, item_vecs, 200)['product'])

    # Explicit label-based lookup of the original buyer id; avoids the
    # positional fallback of Series[0] on a label-indexed Series.
    buyer = item_lookup.loc[str(person), 'buyer']
    predicted_products = ' '.join(map(str, recommendations))

    submition_buyers.append(buyer)
    submition_products.append(predicted_products)


submition = pd.DataFrame({"buyer": submition_buyers,
                          "items": submition_products})
submition.head()
    

100%|██████████| 65950/65950 [02:49<00:00, 388.44it/s]


Unnamed: 0,buyer,items
0,4,101 1135 429 1874 1640 836 1533 1775 1629 1360...
1,11,1527 194 1475 1795 77 2017 1385 1391 2067 534 ...
2,34,269 962 1596 185 351 1458 2036 220 1671 801 38...
3,37,1151 87 1706 1708 1819 453 680 1234 1178 1214 ...
4,44,249 1149 437 657 975 824 1234 986 1743 261 676...


In [25]:
# Inspect the full recommendation string of the third submission row.
submition["items"].iloc[2]

'269 962 1596 185 351 1458 2036 220 1671 801 389 31 658 296 90 139 292 160 109 58 1723 308 81 23 625 841 71 563 22 162 137 593 437 138 87 671 1407 1011 136 29 1206 265 657 377 1355 403 1708 1482 644 673 249 674 1182 1706 695 1149 733 243 395 1705 20 824 1284 1126 298 1959 1636 1208 1659 1755 222 1234 1613 680 1483 1095 1761 1391 387 223 450 1615 522 1270 1639 672 297 253 1256 350 1990 1732 545 614 898 843 1255 866 1235 129 244 852 92 429 1883 646 443 1151 33 572 1707 929 228 1789 221 1347 257 1579 2002 1346 1927 1471 28 623 533 832 1640 842 503 2038 1832 1859 116 766 364 1129 498 365 1082 64 501 869 512 574 1534 1200 975 1072 83 85 273 612 1420 1099 1644 55 1743 1279 1338 1725 930 1051 1846 307 1342 745 47 183 2 675 818 1573 1904 1958 846 1874 702 1071 1580 1517 684 589 54 50 577 467 134 1604 587 472 1437 790 121 682 1588 640 1877 867 847 1634'

In [31]:
from datetime import datetime
from zipfile import ZipFile, ZIP_DEFLATED
import os

# Timestamped file name without spaces or colons, so it is valid on every
# platform (the default str(datetime) contains both).
name = 'submission-%s.csv' % datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')

submition.to_csv(name, index=False)

# The context manager closes (and thereby finalizes) the archive even if
# write() raises.
with ZipFile("%s.zip" % name, 'w', ZIP_DEFLATED) as zip_obj:
    zip_obj.write(name, os.path.basename(name))