In [2]:
import pandas as pd
from scipy.spatial.distance import cosine
import sklearn.preprocessing as pp
import scipy.sparse as sp
import numpy as np
from scipy import io
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split

In [3]:
# import dataframes containing e-commerce data

df_clicks = pd.read_csv('yoochoose-clicks.dat', header=None, names=['SessionID', 'Timestamp', 'ItemID', 'Category'])
df_buys = pd.read_csv('yoochoose-buys.dat', header=None, names=['SessionID', 'Timestamp', 'ItemID', 'Price', 'Quantity'])

In [3]:
df_clicks.head()

Unnamed: 0,SessionID,Timestamp,ItemID,Category
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0


In [4]:
df_buys.head()

Unnamed: 0,SessionID,Timestamp,ItemID,Price,Quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1


In [5]:
len(df_clicks)

33003944

In [6]:
df_clicks.SessionID.nunique()

9249729

In [7]:
# average number of ItemIDs viewed per session:
33003944 / 9249729

3.568098481587947

In [8]:
len(df_buys)

1150753

In [9]:
df_clicks.ItemID.nunique()

52739

In [4]:
# using the following definitions, the goal is to determine similarity scores between items

def cosine_similarities(mat):
    col_normed_mat = pp.normalize(mat.tocsc(), axis=0)
    return col_normed_mat.T * col_normed_mat

def jaccard_similarities(mat):
    cols_sum = mat.getnnz(axis=0)
    ab = mat.T * mat

    # for rows
    aa = np.repeat(cols_sum, ab.getnnz(axis=0))
    # for columns
    bb = cols_sum[ab.indices]

    similarities = ab.copy()
    # similarities.data /= (aa + bb - ab.data)
    similarities.data = ((aa + bb - ab.data) / similarities.data)

    return similarities

In [5]:
# trying to create a pivot table with the following format:
#            ItemID
# SessionID   1  2  3  4  5
# 1           0  0  1  1  1
# 2           1  1  1  0  0
# 3           0  0  0  1  1
# with ones indicating items that were viewed in each session
# to make the pivot table function less memory intensive, drop unnecessary columns

df_clicks = df_clicks.drop('Timestamp', 1)
df_clicks = df_clicks.drop('Category', 1)

df_buys = df_buys.drop('Timestamp', 1)
df_buys = df_buys.drop('Price', 1)
df_buys = df_buys.drop('Quantity', 1)

In [12]:
df_clicks.head()

Unnamed: 0,SessionID,ItemID
0,1,214536502
1,1,214536500
2,1,214536506
3,1,214577561
4,2,214662742


In [13]:
df_buys.head()

Unnamed: 0,SessionID,ItemID
0,420374,214537888
1,420374,214537850
2,281626,214535653
3,420368,214530572
4,420368,214835025


In [None]:
# this is where I began to encounter memory errors:
df_clicks_pivot = df_clicks.pivot_table(index='SessionID', columns='ItemID', aggfunc=len, fill_value=0) # memory error
df_buys_pivot = df_buys.pivot_table(index='SessionID', columns='ItemID', aggfunc=len, fill_value=0) # memory error

In [14]:
# using an EC2 with 8 GB of RAM, I had to subset the data into 50,000 rows each (and only the 'buy' data) using this form:

# df_buys1 = df_buys[0:50000]
# df_buys2 = df_buys[50000:100000]
# ...
# df_buys20 = df_buys[950000:1000000]

# was able to create .csv files for each 50,000 rows in buy dataframe (1st 1M rows) but the resulting matrix was too large

# df_buys_pivot1 = df_buys1.pivot_table(index='SessionID', columns='ItemID', aggfunc=len, fill_value=0)
# del df_buys1
# df_buys_pivot1.to_csv('df_buys_pivot1.csv')
# del df_buys_pivot1
# df_buys_pivot2 = df_buys2.pivot_table(index='SessionID', columns='ItemID', aggfunc=len, fill_value=0)
# del df_buys2
# df_buys_pivot2.to_csv('df_buys_pivot2.csv')
# del df_buys_pivot2
# ...
# df_buys_pivot20 = df_buys20.pivot_table(index='SessionID', columns='ItemID', aggfunc=len, fill_value=0)
# del df_buys20
# df_buys_pivot20.to_csv('df_buys_pivot20.csv')
# del df_buys_pivot20

In [15]:
# when using an EC2 with 256 GB of RAM, the largest pivot table I was able to create was one that contained the first...
# ...250,000 rows of the buys table; this showed only buys and dataframe seemed to be too sparse, similarity ratings were...
# ...very low

In [6]:
# to illustrate techniques used, we will utilize only the first 50,000 rows of the buy dataframe
df_buys1 = df_buys[0:50000]

In [7]:
# delete df_clicks and df_buys to free up memory
del df_clicks
del df_buys

In [8]:
%%time
df_buys_pivot = df_buys1.pivot_table(index='SessionID', columns='ItemID', aggfunc=len, fill_value=0)

CPU times: user 9.76 s, sys: 2.06 s, total: 11.8 s
Wall time: 11.8 s


In [8]:
df_buys_pivot.head()

ItemID,214507331,214507365,214507385,214507387,214507408,214507415,214507447,214507492,214507556,214507610,...,214844297,214844304,214844306,214844308,214844310,214844312,214844315,214844317,214844357,214844375
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
len(df_buys_pivot)

26196

In [10]:
# 26,196 buy sessions contain only 50,000 buys, so very few items are bought together
# over half of the sessions contain only 1 buy
df_buys_pivot.values.sum()

50000

In [23]:
sum_of_buys = df_buys_pivot.sum(axis=1)
sum_of_buys = sum_of_buys.to_frame()
(sum_of_buys.loc[sum_of_buys[0] == 1]).count()

0    14355
dtype: int64

In [24]:
14355 / 26196

0.5479844251030692

In [8]:
# despite the sparseness of the dataframe, we will run both cosine_similarities and jaccard_similarities to show results
# examine the sessions that contain buys of the first item in the dataframe: 214507331
item1 = df_buys_pivot.loc[df_buys_pivot[214507331] > 0]

In [9]:
item1 = item1.T

In [40]:
item1.head()

SessionID,21046,44739,47766,53331,107777,133862,148252,159578,207202,225172,...,291086,326103,328351,338632,348089,384209,419236,436751,474647,841057
ItemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
214507331,1,1,1,1,1,1,1,1,1,1,...,1,3,1,3,1,2,2,1,1,1
214507365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
214507385,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
214507387,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
214507408,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# remove 1st row to see only items with buys along with 1st item
item1_buys = item1[1:]

In [11]:
item1_buys_valid = item1_buys.loc[(item1_buys!=0).any(axis=1)]

In [44]:
# below shows all 16 items that were bought with item 214507331
item1_buys_valid

SessionID,21046,44739,47766,53331,107777,133862,148252,159578,207202,225172,...,291086,326103,328351,338632,348089,384209,419236,436751,474647,841057
ItemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
214517880,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
214575665,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
214603138,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
214648247,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
214648250,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
214710150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
214716120,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
214718203,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
214718396,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
214743821,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [45]:
# running both cosine_similarities and jaccard_similarities functions should show these items

In [12]:
# convert pivot dataframe to matrix
df_buys_mat = df_buys_pivot.as_matrix()
df_buys_mat2 = sp.csr_matrix(df_buys_mat)

In [13]:
df_buys_mat_cos = cosine_similarities(df_buys_mat2)



In [14]:
# convert back to dataframe
io.mmwrite("buys.mtx", df_buys_mat_cos)
cos_results = io.mmread("buys.mtx")
cos_results = cos_results.toarray()

In [15]:
cos_item1 = pd.DataFrame(data=cos_results[0])

In [18]:
cos_item1.head()

Unnamed: 0,0
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0


In [20]:
cols = df_buys_pivot.columns.values.tolist()

In [21]:
cos_item1.loc[cos_item1[0] > 0]

Unnamed: 0,0
0,1.0
181,0.016411
1228,0.055132
1826,0.145865
2300,0.145865
2301,0.103142
3528,0.072932
3732,0.048622
3947,0.01139
3974,0.022507


In [22]:
cols[0]

214507331

In [23]:
cols[181]

214517880

In [24]:
cols[4663]

214832557

In [25]:
# items in cos_item1 list that are greater than zero correspond to the rows of item1_buys_valid list

In [28]:
df_buys_mat_jac = jaccard_similarities(df_buys_mat2)

In [29]:
io.mmwrite("buys_j.mtx", df_buys_mat_jac)
jac_results = io.mmread("buys_j.mtx")
jac_results = jac_results.toarray()

In [30]:
jac_item1 = pd.DataFrame(data=jac_results[0])

In [31]:
jac_item1.loc[jac_item1[0] > 0]

Unnamed: 0,0
181,64.0
1228,28.0
1826,22.0
2300,22.0
2301,23.0
3528,25.0
3732,27.0
3947,158.0
3974,60.0
4027,25.0


In [32]:
# same list, different values (using different method to find similarities)

In [8]:
# nDCG can also be used; assuming the entire dataset could be made into a matrix, the rows in the buy dataset could be...
# ...added any number of times, and this would change the total values

# as an example, we will replicate session ID 107777 five times; item 214718203 was purchased in this session, so cosine...
# ...value should go up, while Jaccard value should go down

In [9]:
df_buys_pivot.head()

ItemID,214507331,214507365,214507385,214507387,214507408,214507415,214507447,214507492,214507556,214507610,...,214844297,214844304,214844306,214844308,214844310,214844312,214844315,214844317,214844357,214844375
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df_buys_pivot_toadd = df_buys_pivot.loc[[107777]]

In [23]:
df_buys_pivot = df_buys_pivot.append([df_buys_pivot_toadd]*5, ignore_index=True)

In [27]:
df_buys_mat = df_buys_pivot.as_matrix()
df_buys_mat2 = sp.csr_matrix(df_buys_mat)

In [28]:
df_buys_mat_cos = cosine_similarities(df_buys_mat2)



In [29]:
io.mmwrite("buys.mtx", df_buys_mat_cos)
cos_results = io.mmread("buys.mtx")
cos_results = cos_results.toarray()

In [30]:
cos_item1 = pd.DataFrame(data=cos_results[0])

In [31]:
cos_item1.loc[cos_item1[0] > 0]

Unnamed: 0,0
0,1.0
181,0.012906
1228,0.043355
1826,0.114708
2300,0.114708
2301,0.081111
3528,0.057354
3732,0.038236
3947,0.247706
3974,0.0177


In [32]:
cols = df_buys_pivot.columns.values.tolist()

In [33]:
cols[3947]

214718203

In [34]:
# item above now has the highest cosine similarity score, due to multiplying the row where this item was purchased with 214507331

In [9]:
# in order to evaluate the effectiveness of the recommender system, train / test and nDCG was utilized, although due to...
# ...the sparseness of the dataset, the nDCG function gave unexpected results

In [10]:
# implementation of ndcg scores

def ranking_precision_score(y_true, y_score, k=10):
    """Precision at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    Returns
    -------
    precision @k : float
    """
    unique_y = np.unique(y_true)

    if len(unique_y) > 2:
        raise ValueError("Only supported for two relevance levels.")

    pos_label = unique_y[1]
    n_pos = np.sum(y_true == pos_label)

    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    n_relevant = np.sum(y_true == pos_label)

    # Divide by min(n_pos, k) such that the best achievable score is always 1.0.
    return float(n_relevant) / min(n_pos, k)


def average_precision_score(y_true, y_score, k=10):
    """Average precision at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    Returns
    -------
    average precision @k : float
    """
    unique_y = np.unique(y_true)

    if len(unique_y) > 2:
        raise ValueError("Only supported for two relevance levels.")

    pos_label = unique_y[1]
    n_pos = np.sum(y_true == pos_label)

    order = np.argsort(y_score)[::-1][:min(n_pos, k)]
    y_true = np.asarray(y_true)[order]

    score = 0
    for i in range(len(y_true)):
        if y_true[i] == pos_label:
            # Compute precision up to document i
            # i.e, percentage of relevant documents up to document i.
            prec = 0
            for j in range(0, i + 1):
                if y_true[j] == pos_label:
                    prec += 1.0
            prec /= (i + 1.0)
            score += prec

    if n_pos == 0:
        return 0

    return score / n_pos


def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Discounted cumulative gain (DCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    DCG @k : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")

    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    """Normalized discounted cumulative gain (NDCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    NDCG @k : float
    """
    best = dcg_score(y_true, y_true, k, gains)
    actual = dcg_score(y_true, y_score, k, gains)
    return actual / best

In [11]:
# create train test from the first 50,000 rows of buys dataset:
train, test = train_test_split(df_buys_pivot, test_size=0.2)

In [12]:
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [20]:
len(train)

20956

In [21]:
len(test)

5240

In [13]:
train_mat = train.as_matrix()
test_mat = test.as_matrix()

In [14]:
train_mat = sp.csr_matrix(train_mat)
test_mat = sp.csr_matrix(test_mat)

In [15]:
train_cos = cosine_similarities(train_mat)



In [16]:
io.mmwrite("train.mtx", train_cos)
io.mmwrite("test.mtx", test_mat)
train_cos2 = io.mmread("train.mtx")
test_mat2 = io.mmread("test.mtx")
train_arr = train_cos2.toarray()
test_arr = test_mat2.toarray()

In [18]:
# dataset is very sparse; may not be any items purchased together in either small train or testing arrays
ndcg_score(test_arr[0], train_arr[0], k=100)

0.0

In [19]:
# increasing k does show a very small ndcg score; this presumably would be larger if we had more data points
ndcg_score(test_arr[0], train_arr[0], k=100000)

0.086036552005183717