In [40]:
import pandas as pd

## Dataset
Dataset available from [kaggle](https://www.kaggle.com/carrie1/ecommerce-data)
* download the zipped file and copy unzipped file to data folder and rename file to 'ecommerce.csv'
* detect the ecoding using `file -I`. You may need to change the encoding to read the file but `ISO-8859-1` should work

In [41]:
df = pd.read_csv('./data/ecommerce.csv', encoding="ISO-8859-1", engine='python')

In [42]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [43]:
list(df.columns)

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [44]:
#drop non descriptive fields
df = df.drop(['InvoiceNo', 'StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'], axis=1)
df.head()

Unnamed: 0,Description
0,WHITE HANGING HEART T-LIGHT HOLDER
1,WHITE METAL LANTERN
2,CREAM CUPID HEARTS COAT HANGER
3,KNITTED UNION FLAG HOT WATER BOTTLE
4,RED WOOLLY HOTTIE WHITE HEART.


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 1 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Description  540455 non-null  object
dtypes: object(1)
memory usage: 4.1+ MB


In [46]:
df.describe()

Unnamed: 0,Description
count,540455
unique,4223
top,WHITE HANGING HEART T-LIGHT HOLDER
freq,2369


In [47]:
df.dtypes

Description    object
dtype: object

## Preprocess

* drop unnecessary columns
* convert to lower case
* remove stop words
* lemmetize 
* remove non-unicode charaters
* drop duplicates

In [48]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from unidecode import unidecode
import string

In [49]:
STOP_WORDS = stopwords.words('english') + list(string.punctuation)
lemmetizr = WordNetLemmatizer()

def pre_process(text:str):
    text = text.lower()
    text = unidecode(text)
    text = word_tokenize(text)
    text = [word for word in text if word not in STOP_WORDS]
    for index, word in enumerate(text):
        text[index] = lemmetizr.lemmatize(word)
        
    return " ".join(text)
    

In [50]:
# test preprocessing
print(pre_process("I am a ñino. I am a runner and I love and my shoes and trails"))

nino runner love shoe trail


In [51]:
# Description is read as an object, read it as type string before applying preprocessing
df['Description'] = df['Description'].astype('str').apply(pre_process)

In [52]:
df.head()

Unnamed: 0,Description
0,white hanging heart t-light holder
1,white metal lantern
2,cream cupid heart coat hanger
3,knitted union flag hot water bottle
4,red woolly hottie white heart


In [53]:
df.describe()

Unnamed: 0,Description
count,541909
unique,4134
top,white hanging heart t-light holder
freq,2369


In [55]:
df['Description'].value_counts()

white hanging heart t-light holder    2369
regency cakestand 3 tier              2200
jumbo bag red retrospot               2159
party bunting                         1727
lunch bag red retrospot               1638
                                      ... 
20713 wrongly marked                     1
wrongly marked 23343                     1
pink large jeweled photoframe            1
pink beads+hand phone charm              1
mummy mouse red gingham ribbon           1
Name: Description, Length: 4134, dtype: int64

In [56]:
# Drop duplicates
df.drop_duplicates(subset=['Description'], inplace=True)

In [59]:
df.describe()

Unnamed: 0,Description
count,4134
unique,4134
top,snack tray red vintage doily
freq,1


In [73]:
df['Description'].value_counts()

snack tray red vintage doily        1
cute cat tape                       1
s/3 pink square planter rose        1
glass jar peacock bath salt         1
set 10 card christmas tree 16955    1
                                   ..
kitty pencil eraser                 1
treasure island book box            1
childrens apron apple design        1
easter craft 4 chick                1
                                    1
Name: Description, Length: 4134, dtype: int64

## Feature extraction

Both TF-IDF and CountVectorizer will be used to generate word embeddings. 

TfidfVectorizer = CountVectorizer + TfidfTransform (row wise euclidian normalization)

In [100]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

tfidf_description = tfidf_vectorizer.fit_transform((df['Description']))

In [75]:
tfidf_description

<4134x2032 sparse matrix of type '<class 'numpy.float64'>'
	with 16827 stored elements in Compressed Sparse Row format>

In [76]:
tfidf_vectorizer.get_feature_names()

['00',
 '10',
 '11',
 '12',
 '120cm',
 '125g',
 '15',
 '15c',
 '15cm',
 '16',
 '16954',
 '16955',
 '16956',
 '16957',
 '16960',
 '16961',
 '16963',
 '16965',
 '16x16cm',
 '17058',
 '17059',
 '17065',
 '17067',
 '17068',
 '17070',
 '17071',
 '17074',
 '17076',
 '17077',
 '17080',
 '17081',
 '17084',
 '17086',
 '17087',
 '17089',
 '17090',
 '17093',
 '17095',
 '17096',
 '17097',
 '17099',
 '17100',
 '17104',
 '17107',
 '17112',
 '17116',
 '17211',
 '17214',
 '17216',
 '17217',
 '17218',
 '17219',
 '17220',
 '17259',
 '17262',
 '1800',
 '18pc',
 '20',
 '200',
 '2010',
 '20713',
 '20light',
 '21',
 '22467',
 '22719',
 '22804',
 '23',
 '23343',
 '24',
 '250g',
 '25w',
 '25x24x12cm',
 '30',
 '30cm',
 '30cmx30cm',
 '34x20cm',
 '35',
 '36',
 '3d',
 '40',
 '40cm',
 '40x40cm',
 '42',
 '45cm',
 '45x30cm',
 '45x45cm',
 '47',
 '50',
 '500g',
 '50cm',
 '5g',
 '60',
 '60cm',
 '60x40cm',
 '65cmx65cm',
 '6pc',
 '70',
 '72',
 '75',
 '78',
 '84930',
 '85123a',
 '8m',
 'a4',
 'a5',
 'a6',
 'a7',
 'abc',
 

In [77]:
tfidf_description.shape

(4134, 2032)

## Recommendations
We will evaluate recommender systems built using on a few different techniques and packages
### Cosine similarity based recommenders
Map the product descriptions to vector space and use cosine similarity (angle between two vectors) to find most similar products. Results closer to `1` imply similarity. We'll be using the cosine similarity module from `sklearn`

* `tf-idf` vectorizer with `cosine_similarity` 
* `count_vector` with `cosine_similarity`

### Spacy
This implementation will use a pre-trained models from spacy to compute similarity between text. 

### KNN Recommender
The KNN algorithm can be used with `tf-idf` as features to find similar neighbours

## 1) Cosine similarity based recommenders
### `TF-IDF` with `Cosine similarity`

To calculate cosine similarity, we need to calculate the dot product of a document vectory with every other document vector. 


In [78]:
from sklearn.metrics.pairwise import cosine_similarity

In [121]:
# test input
index = 239
print(df.iloc[index])

Description    lady gentleman metal sign
Name: 337, dtype: object


In [122]:
tfidf_description[0:index]

<239x2032 sparse matrix of type '<class 'numpy.float64'>'
	with 977 stored elements in Compressed Sparse Row format>

In [123]:
# check similarity between two vectors
similarities = cosine_similarity(tfidf_description)

In [143]:
type(similarities)

numpy.ndarray

In [142]:
similarities

array([[1.        , 0.21775039, 0.10198416, ..., 0.        , 0.76593101,
        0.        ],
       [0.21775039, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.10198416, 0.        , 1.        , ..., 0.        , 0.29493202,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.76593101, 0.        , 0.29493202, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [139]:
# top 10 indices, skip the first match - itself
top_matches = np.argsort(similarities[index])[::-1][1:11]

In [140]:
df.iloc[index]

Description    lady gentleman metal sign
Name: 337, dtype: object

In [141]:
df.iloc[top_matches]

Unnamed: 0,Description
1469,garden metal sign
3169,party metal sign
342,kitchen metal sign
341,bathroom metal sign
58404,mirrored wall art lady
345,'m holiday metal sign
1303,singing metal sign
343,toilet metal sign
6565,butter metal sign


In [145]:
def get_top_matches(index:int, top=10):
    '''
    Calculates the cosine similarity between an item and every other item,
    returning the top matches
    '''
    return np.argsort(similarities[index])[::-1][1:11]
    

In [152]:
df.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            504104, 507867, 510117, 512588, 514649, 527065, 532724, 535329,
            537621, 540421],
           dtype='int64', length=4134)

In [156]:
test_indices = np.random.randint(0, df.size, 10)

for test_index in test_indices:
    print(f'input: {df.iloc[test_index]}')
    matches = get_top_matches(test_index)
    print(f'top matches: {df.iloc[matches]}')
    print('========' * 10)

input: Description    glass bead hoop necklace amethyst
Name: 95681, dtype: object
top matches:                                 Description
38302      glass bead hoop earring amethyst
95691        glass bead hoop necklace green
224923       glass bead hoop necklace black
95692      glass bead hoop necklace montana
65402         glass bead hoop earring green
16755         glass bead hoop earring black
36150       glass bead hoop earring montana
6149       5 strand glass necklace amethyst
3031    amethyst glass/shell/pearl necklace
4112      amethyst hoop earring floral leaf
input: Description    le grand tray chic set
Name: 184939, dtype: object
top matches:                               Description
184776                    petit tray chic
1378                grand chocolatecandle
6127      black grand baroque photo frame
29859       hippy chic decorative parasol
345377  le jardin botanique cushion cover
193     fridge magnet le enfants assorted
97681               cream sweetheart tra