In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np
import time
import heapq

### Load original Data, sort by timestamp and save it

In [2]:
# df = pd.concat((pd.read_csv('events_train.csv'), pd.read_csv('purchases_train.csv')), ignore_index=True)
# df.sort_values(by=['timestamp'], inplace=True)
# df.to_csv('unified_df.csv')

In [3]:
df = pd.read_csv('unified_df.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def get_unique_counts(df, columns):
    print('Unique value counts per column:')
    for column in columns:
        print('{:<25}{:>}'.format('Unique ' + column + 's:', len(df[column].unique())))

In [5]:
get_unique_counts(df, ['event_type', 'customer_id', 'product_id', 'title', 'category_name'])

Unique value counts per column:
Unique event_types:      2
Unique customer_ids:     2211386
Unique product_ids:      123837
Unique titles:           71441
Unique category_names:   714


##### Extract gender, clothes types?

In [6]:
# df.category_name.unique()[:10]

In [7]:
# df.category_name.str.count('>')

In [8]:
# df.category_name.apply(lambda x: str(x).split('>')[0] if len(str(x).split('>')) > 0 else 'empty').value_counts()

In [9]:
# df.price.describe().apply(lambda x: format(x, 'f'))

In [10]:
# df_purchases.price.plot.hist(bins=30)

In [11]:
# df_purchases.price.plot.box()

## Filling missing data

In [4]:
def get_missing_count(df, columns):
    print('Missing counts per column:')
    for column in columns:
        print('{:<25}{:>}'.format('Missing ' + column + 's:', df[column].isna().sum()))

In [5]:
get_missing_count(df, ['customer_id', 'event_type', 'product_id', 'title', 'category_name', 'price'])

Missing counts per column:
Missing customer_ids:    0
Missing event_types:     0
Missing product_ids:     0
Missing titles:          1813
Missing category_names:  3435732
Missing prices:          44886


Fill missing titles based on equal product_ids.

In [6]:
df.loc[df.title.isnull(), 'title'] = df.groupby('product_id')['title'].transform('first')
get_missing_count(df, ['title'])

Missing counts per column:
Missing titles:          30


Fill missing category_names based on equal product_ids.

In [7]:
df.loc[df.category_name.isnull(), 'category_name'] = df.groupby('product_id')['category_name'].transform('first')
get_missing_count(df, ['category_name'])

Missing counts per column:
Missing category_names:  25034


Fill missing titles based on equal category_names.

In [8]:
df.loc[df.title.isnull(), 'title'] = df.groupby('category_name')['title'].transform('first')
get_missing_count(df, ['title'])

Missing counts per column:
Missing titles:          26


Fill missing category_names based on equal titles.

In [9]:
df.loc[df.category_name.isnull(), 'category_name'] = df.groupby('title')['category_name'].transform('first')
get_missing_count(df, ['category_name'])

Missing counts per column:
Missing category_names:  8583


Fill missing price based on equal titles and category_names

In [10]:
df.loc[df.price.isnull(), 'price'] = df.groupby('title')['price'].transform('first')
df.loc[df.price.isnull(), 'price'] = df.groupby('category_name')['price'].transform('first')
get_missing_count(df, ['price'])

Missing counts per column:
Missing prices:          0


In [30]:
train_size = round(df.shape[0]*0.85)
print('Train len for 85-15% split:', train_size)

NameError: name 'df' is not defined

#### Map categorical columns to their codes for sparse format

In [12]:
customer_id_categorical = df[:train_size]['customer_id'].astype("category")
product_id_categorical = df[:train_size]['product_id'].astype("category")

#### Remember category codes for later lookups

In [13]:
customer_id_lookup = customer_id_categorical.cat.codes
product_id_lookup = product_id_categorical.cat.codes

#### Map customers to categories and items for later lookups

In [14]:
customer_map = dict((y, x) for x, y in enumerate(customer_id_categorical.cat.categories ))
item_map = dict((x, y) for x, y in enumerate(product_id_categorical.cat.categories ))

In [29]:
customer_id_categorical[:3]

0    204685f9-00f0-4a36-bab1-e0b74673c3b1
1    552445a5-f41d-4b25-b479-4e776d9bee61
2    fac01f48-a03a-41f2-acd5-b5f4e407d9c2
Name: customer_id, dtype: category
Categories (1882180, object): [0000130d-9652-4cfb-8c9c-1b1d97b988ab, 000014d9-5e0c-4a15-8697-20ac41f9a388, 000027e5-4dbd-4b9f-9a56-e4414f325358, 0000461a-6b42-456b-a47c-bc34749e62ce, ..., ffffe78e-28e6-467e-98b4-b6ebe83ca14b, fffff354-dd1f-451b-8c41-f1c039560280, fffffdc5-d457-470f-9939-6bb5df9593ba, fffffed2-4e10-445e-af87-f9b67c84438a]

In [31]:
customer_id_lookup[:3]

0     238211
1     626251
2    1843948
dtype: int32

In [32]:
customer_map

{'0000130d-9652-4cfb-8c9c-1b1d97b988ab': 0,
 '000014d9-5e0c-4a15-8697-20ac41f9a388': 1,
 '000027e5-4dbd-4b9f-9a56-e4414f325358': 2,
 '0000461a-6b42-456b-a47c-bc34749e62ce': 3,
 '00005327-02a5-417d-8d62-b7a306498644': 4,
 '00005472-9e83-499a-9626-f60be9f0c66e': 5,
 '00005526-a81b-48ae-9a47-0386c9c8e746': 6,
 '00007581-8778-4364-b947-7392acd1f0f3': 7,
 '00008024-003f-4d1b-ac05-57bb88463abf': 8,
 '00008767-6459-4951-9efe-c551126e6887': 9,
 '00008cb0-f4a9-4a46-9dfc-713b7998b2b2': 10,
 '00008fe6-f3f6-4d64-a79b-84c9257b6b7a': 11,
 '00009144-f7e7-41a3-8911-f63bf6c1239e': 12,
 '0000a346-26cc-4e02-a510-b9d2d493b5e5': 13,
 '0000ab30-a26b-4ccd-9959-4d29f7382edc': 14,
 '0000b13b-0b6e-47ce-8421-aa0eed110952': 15,
 '0000bfe2-1277-4593-8c59-f6c37cd30d81': 16,
 '0000d0a1-4f8f-4aef-b216-d0c4ee51fa8e': 17,
 '0000d44b-010f-4763-be3b-d41c1bd65e12': 18,
 '0000d98c-2871-4c15-b910-73eded02ec4a': 19,
 '0000dcd1-ce01-413c-acc2-c9536980bb00': 20,
 '0000de0a-6bc1-4175-88ee-f1311c484ee8': 21,
 '0000e081-cb9c-4743

In [15]:
test_df = df[train_size:]

In [24]:
df[df['customer_id'] == '204685f9-00f0-4a36-bab1-e0b74673c3b1'].product_id.unique()

array(['f130c542-d793-41d0-88f6-d1c52b4fe8f0',
       '752a7f86-3b60-42b4-945c-9d25a83a9632',
       'da087c55-9ccf-4914-a23e-dc2984582c54',
       '941e630e-93bc-4326-b57b-c2f2cb42edbc',
       'bdd146bd-eba9-4db0-98cf-2a35d8ea3dcd',
       '79df83fc-7c78-4417-89a9-6dd4784a2dda',
       '23dc13de-88ac-4124-837b-e4016a289d6f',
       'a588416e-597c-4ef9-8afa-f315411811a9',
       '2809a461-277e-40a1-afdb-9d5aead6dfbc',
       '2e7079b4-9773-49c4-966d-c876fc9f3919',
       'b5d190c7-32b2-4210-9af7-03d71ba81fc8',
       '28c1ba9b-96ed-41ee-b5ab-c05de22ed6b7',
       '34957903-0b34-49f7-a243-b6b3fc074fdf',
       '046fd932-24cd-45c7-8bc9-899af5204e89',
       '035a620f-9789-4be9-921a-2dfdc3dec371',
       '7805a29f-6076-4802-a5ed-687ae2680abe',
       '75046ab2-a978-44a4-afbc-5ae30ce7f449',
       'a6eb2400-c46b-4e1f-9ce0-aa469d95fc8b',
       '51d65dad-a8be-4553-8034-2c50c5ccb67b',
       '3072b612-5b82-4d74-8fa2-35d4de1256a8',
       'e2eeb1d4-87ab-4981-b553-14f62710f022',
       '591db

## Create occurence sparse matrix

In [16]:
cooDf = sparse.coo_matrix((np.ones(train_size), 
    (customer_id_lookup, product_id_lookup)))

In [15]:
cooDf.shape

(1882180, 118612)

# Popularity

#### Transform coo_matrix to transposed lil_matrix for counting column occurences fast 

In [17]:
lilDf = cooDf.tolil().transpose()

In [18]:
lilDf.shape

(118612, 1882180)

In [18]:
column_counts = [i.nnz for i in lilDf]

In [19]:
del lilDf

#### Get 10 most popular

In [20]:
most_popular_idxs = heapq.nlargest(10, range(len(column_counts)), column_counts.__getitem__)

In [21]:
print(most_popular_idxs)

[78118, 105143, 7219, 44905, 23061, 95360, 50429, 48582, 60728, 82888]


#### Print total user interactions with popular items

In [44]:
print('Sparse density - number of unique user interactions with items:')
list(map(lambda x: 'Item id: {} - Interactions: {}'.format(x, len(cooDf.getcol(x).nonzero()[0])), most_popular_idxs))

Sparse density - number of unique user interactions with items:


['Item id: 78118 - Interactions: 27500',
 'Item id: 105143 - Interactions: 27314',
 'Item id: 7219 - Interactions: 24698',
 'Item id: 44905 - Interactions: 24655',
 'Item id: 23061 - Interactions: 23024',
 'Item id: 95360 - Interactions: 21887',
 'Item id: 50429 - Interactions: 18969',
 'Item id: 48582 - Interactions: 16833',
 'Item id: 60728 - Interactions: 16397',
 'Item id: 82888 - Interactions: 15351']

In [22]:
most_popular_item_ids = list(map(lambda x: item_map[x], most_popular_idxs))

In [46]:
most_popular_item_ids

['a8f0292d-5fca-42b9-b0d3-b38e7efa416b',
 'e33cea18-ee3f-4e37-83a7-1d88c54d83b8',
 '0f943312-7141-4606-abfa-81fd63a5498f',
 '608f32cd-210c-4af5-99d8-6182a9678cf5',
 '31ae211d-57a9-4bfa-b95f-40776af03c3f',
 'ce1bc0cc-5e6c-4587-9d7a-82a6e32648a5',
 '6cee8dc2-0528-49f4-891a-f89a8114cfd6',
 '68d71059-0dd3-4817-b266-ebeb0d555c38',
 '8369aebc-fba5-4957-b3fb-7da05f327dff',
 'b345fd30-281d-4b32-8f88-76fc84bea664']

In [39]:
tmp = df[:train_size]
for id in most_popular_item_ids:
    print(id, tmp[tmp.product_id == id].shape)
del tmp

NameError: name 'df' is not defined

In [None]:
del df

# Model

In [None]:
cooDf.data = np.log1p(cooDf.data)

In [76]:
class KNN_model:
    def __init__(self,
                 n_neighbors, 
                 metric, 
                 coo, 
                 customer_map, 
                 product_map, 
                 N_popular_ids, 
                 discard_already_seen=False,
                 cutoff_low_similarity=False,
                 cutoff_threshold=0.2,
                 normalize_coo=False
                ):
        self.n_neighbors = n_neighbors
        self.model = NearestNeighbors(n_neighbors, metric=metric, n_jobs=-1)
        self.coo = coo
        self.customer_map = customer_map
        self.product_map = product_map
        self.N_popular_ids = N_popular_ids
        self.discard_already_seen = discard_already_seen
        self.cutoff_low_similarity = cutoff_low_similarity
        self.cutoff_threshold = cutoff_threshold
        if normalize_coo == True:
            self.coo.data = np.log1p(self.coo.data)
        
            
        
    def fit(self):
        self.model.fit(self.coo)
        
    def translate_relevances_to_product_ids(self, relevances):
        return list(map(lambda x: self.product_map[x[0]], relevances))
    
    def fill_with_popularity(self, recommendations):
        rec_len = len(recommendations)
        return recommendations + self.N_popular_ids[:10-rec_len]
    
    def find_first_smaller_index(self, relevances):
        try:
            idx_first_smaller = next(idx for idx, value in enumerate(relevances) 
                                     if value[1] < self.cutoff_threshold)
            return idx_first_smaller
        except StopIteration:
            return len(relevances)
            
            
    def predict_10(self, customer_id):
        if customer_id in self.customer_map:
            customer_index = self.customer_map[customer_id]
            customer_row = self.coo.getrow(customer_index)

            # get neighbors
            neighbors = self.model.kneighbors(customer_row, n_neighbors=self.n_neighbors)
            
            # sum their items based on distance            
            item_relevances = {}
            for dist, idx in zip(neighbors[0][0], neighbors[1][0]):           
                nb_items = self.coo.getrow(idx).nonzero()[1]
                relevance = 1 - dist
                for nb_item in nb_items:
                    item_relevances[nb_item] = item_relevances.get(nb_item, 0) + relevance
                    
            if self.discard_already_seen == True:
                # remove items already interacted with
                own_items = customer_row.nonzero()[1]
                for i in own_items:
                    item_relevances.pop(i, None)
                
            
            # Sorted by relevance
            sorted_relevances = sorted(item_relevances.items(), key=lambda kv: kv[1], reverse=True)
            
            # Cutoff items with low relevance and thus fill missing spots with popularity
            if self.cutoff_low_similarity == True:
                first_smaller_idx = self.find_first_smaller_index(sorted_relevances)
                sorted_relevances = sorted_relevances[:first_smaller_idx]
            
            product_recommendations = self.translate_relevances_to_product_ids(sorted_relevances[:10])
            product_recommendations = self.fill_with_popularity(product_recommendations)
    
            return product_recommendations
            # return top 10
        else:
            return self.fill_with_popularity([])

In [68]:
neigh = KNN_model(10,
                  metric='cosine',
                  coo=cooDf, 
                  customer_map=customer_map, 
                  product_map=item_map,
                  N_popular_ids=most_popular_item_ids,
                  discard_already_seen=True
                  cutoff_low_similarity=False,
                  cutoff_threshold=0.2,
                  normalize_coo=False
)
neigh.fit()

In [117]:
model_1_settings = {
    'n_neighbors': 10,
    'metric': 'cosine',
    'coo': cooDf, 
    'customer_map': customer_map, 
    'product_map': item_map,
    'N_popular_ids': most_popular_item_ids,
    'discard_already_seen': True,
    'cutoff_low_similarity': False,
    'cutoff_threshold': 0.2,
    'normalize_coo': False
}

model_2_settings = {
    'n_neighbors': 20,
    'metric': 'cosine',
    'coo': cooDf, 
    'customer_map': customer_map, 
    'product_map': item_map,
    'N_popular_ids': most_popular_item_ids,
    'discard_already_seen': True,
    'cutoff_low_similarity': True,
    'cutoff_threshold': 0.4,
    'normalize_coo': False
}

model_3_settings = {
    'n_neighbors': 5,
    'metric': 'cosine',
    'coo': cooDf, 
    'customer_map': customer_map, 
    'product_map': item_map,
    'N_popular_ids': most_popular_item_ids,
    'discard_already_seen': True,
    'cutoff_low_similarity': True,
    'cutoff_threshold': 0.15,
    'normalize_coo': True
}

model_4_settings = {
    'n_neighbors': 10,
    'metric': 'cosine',
    'coo': cooDf, 
    'customer_map': customer_map, 
    'product_map': item_map,
    'N_popular_ids': most_popular_item_ids,
    'discard_already_seen': False,
    'cutoff_low_similarity': True,
    'cutoff_threshold': 0.4,
    'normalize_coo': False
}

model_5_settings = {
    'n_neighbors': 20,
    'metric': 'cosine',
    'coo': cooDf, 
    'customer_map': customer_map, 
    'product_map': item_map,
    'N_popular_ids': most_popular_item_ids,
    'discard_already_seen': False,
    'cutoff_low_similarity': False,
    'cutoff_threshold': 0.2,
    'normalize_coo': True
}

settings = [model_1_settings, model_2_settings, model_3_settings, model_4_settings, model_5_settings]

In [118]:
neigh = KNN_model(**settings[0])

In [119]:
neigh.fit()

# Evaluation

In [26]:
def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.

def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
#             print('Relevances:\n', sorted_relevances[:10])
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

In [87]:
first_N_users = test_df[:1235].customer_id.unique()
print('Testing on: {} users'.format(len(first_N_users)))



Testing on: 450 users


In [None]:
first_N = test_df[5:30].customer_id.unique()


for customer in first_N:
    nbs = neigh.predict_10(customer)
    checked_products = set(test_df[test_df.customer_id == customer].product_id.unique())
    
    relevances = list(map(lambda x: 1 if x in checked_products else 0, nbs))
    print(relevances)
    print(ndcg_at_k(relevances, 10))
#     print(dcg_score(checked_products, nbs, len(checked_products)) / dcg_score(checked_products, checked_products, len(checked_products)))
#     print(label_ranking_average_precision_score(checked_products, nbs))
    
    shared = len(set(set(checked_products) & set(nbs)))
    hits += shared
    total += len(checked_products)
#     print(
#         len(nbs), 
#         len(checked_products), 
#         shared
#     )

print('Total:', total)
print('Hits:', hits)

In [42]:
unique_customers = test_df.customer_id.unique()

In [86]:
len(test_df[:1235].customer_id.unique())

450

#### Notes:

##### 6.11.2019
- kategoriu rozparsovat na viacero stlpcov
- TF-IDF na title
- titlov je menej ako ID, casto je rovnaky nazov ale ine id, kedze sa mozu lisit vo farbe
- tam kde chybaju data sa pozerat, ci nahodou rovnaky produkt niekde inde v datach nema doplnenu price/category_name aby som doplnil chybajuce
- ak nemam aj tak najdene chybajuce hodnoty, skusat z rovnakej kategorie doplnit priemer a pod.
- povedal som ze sa chcem zamerat na hladiny cien, kategorie, TF-IDF titles
- on povedal aby som sa pozrel na vztahmi medzi kategoriami, tj ked niekto kupoval hento tak potom casto kupil toto
- data spojit, su rovnake ale proste prisli v rozdielnych suboroch

##### 20.11.2019
- lematizacia, vectorizer, vyratat TF-IDF a ratat podobnosti, da sa to dat do elasticu ???
- CF maticu pre produkty a userov, neudrziavat stlpce, ktore maju malo (1) views/buys + taktiez mozu na kazdy produkt byt pripojene vektory s nejakym kontextom, tj. TF-IDF title/category
- knn?
- ak nepojde - content-based TF-IDF nad titles/categories