In [22]:
import pandas as pd

## Dataset
Dataset available from [kaggle](https://www.kaggle.com/carrie1/ecommerce-data)
* download the zipped file and copy unzipped file to data folder and rename file to 'ecommerce.csv'
* detect the ecoding using `file -I`. You may need to change the encoding to read the file but `ISO-8859-1` should work

In [31]:
df = pd.read_csv('./data/ecommerce.csv', encoding="ISO-8859-1", engine='python')

In [32]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [36]:
list(df.columns)

['Description']

In [35]:
#drop non descriptive fields
df = df.drop(['InvoiceNo', 'StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'], axis=1)
df.head()

Unnamed: 0,Description
0,WHITE HANGING HEART T-LIGHT HOLDER
1,WHITE METAL LANTERN
2,CREAM CUPID HEARTS COAT HANGER
3,KNITTED UNION FLAG HOT WATER BOTTLE
4,RED WOOLLY HOTTIE WHITE HEART.


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 1 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Description  540455 non-null  object
dtypes: object(1)
memory usage: 4.1+ MB


In [42]:
df.describe()

Unnamed: 0,Description
count,540455
unique,4223
top,WHITE HANGING HEART T-LIGHT HOLDER
freq,2369


In [44]:
df.dtypes

Description    object
dtype: object

## Preprocess

* drop unnecessary columns
* convert to lower case
* remove stop words
* lemmetize 
* remove non-unicode charaters

In [37]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from unidecode import unidecode
import string

In [38]:
STOP_WORDS = stopwords.words('english') + list(string.punctuation)
lemmetizr = WordNetLemmatizer()

def pre_process(text:str):
    text = text.lower()
    text = unidecode(text)
    text = word_tokenize(text)
    text = [word for word in text if word not in STOP_WORDS]
    for index, word in enumerate(text):
        text[index] = lemmetizr.lemmatize(word)
        
    return " ".join(text)
    

In [39]:
print(pre_process("I am a ñino. I am a runner and I love and my shoes and trails"))

nino runner love shoe trail


In [45]:
# Description is read as an object, read it as type string before applying preprocessing
df['Description'] = df['Description'].astype('str').apply(pre_process)

In [46]:
df.head()

Unnamed: 0,Description
0,white hanging heart t-light holder
1,white metal lantern
2,cream cupid heart coat hanger
3,knitted union flag hot water bottle
4,red woolly hottie white heart


## Feature extraction

Both TF-IDF and CountVectorizer will be used to generate word embeddings

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

tfidf_description = tfidf_vectorizer.fit_transform((df['Description']))

In [70]:
tfidf_description

<541909x2032 sparse matrix of type '<class 'numpy.float64'>'
	with 2199400 stored elements in Compressed Sparse Row format>

In [52]:
print(tfid_vectorizer.get_feature_names())

['00', '10', '11', '12', '120cm', '125g', '15', '15c', '15cm', '16', '16954', '16955', '16956', '16957', '16960', '16961', '16963', '16965', '16x16cm', '17058', '17059', '17065', '17067', '17068', '17070', '17071', '17074', '17076', '17077', '17080', '17081', '17084', '17086', '17087', '17089', '17090', '17093', '17095', '17096', '17097', '17099', '17100', '17104', '17107', '17112', '17116', '17211', '17214', '17216', '17217', '17218', '17219', '17220', '17259', '17262', '1800', '18pc', '20', '200', '2010', '20713', '20light', '21', '22467', '22719', '22804', '23', '23343', '24', '250g', '25w', '25x24x12cm', '30', '30cm', '30cmx30cm', '34x20cm', '35', '36', '3d', '40', '40cm', '40x40cm', '42', '45cm', '45x30cm', '45x45cm', '47', '50', '500g', '50cm', '5g', '60', '60cm', '60x40cm', '65cmx65cm', '6pc', '70', '72', '75', '78', '84930', '85123a', '8m', 'a4', 'a5', 'a6', 'a7', 'abc', 'abstract', 'acapulco', 'account', 'acrylic', 'add', 'address', 'ade', 'adjust', 'adjustment', 'adult', 'adv

In [71]:
tfidf_description.shape

(541909, 2032)

## Recommendations
We will evaluate recommender systems built using on a few different techniques and packages
### Cosine similarity based recommenders
Map the product descriptions to vector space and use cosine similarity (angle between two vectors) to find most similar products. Results closer to `1` imply similarity. We'll be using the cosine similarity module from `sklearn`

* `tf-idf` vectorizer with `cosine_similarity` 
* `count_vector` with `cosine_similarity`

### Spacy
This implementation will use a pre-trained models from spacy to compute similarity between text. 

### KNN Recommender
The KNN algorithm can be used with `tf-idf` as features to find similar neighbours

## Cosine similarity based recommenders
### `TF-IDF` with `Cosine similarity`



In [78]:
from sklearn.metrics.pairwise import cosine_similarity

description_transform = tfid_vectorizer.transform((df['Description']))
# cosine_similarity_tfidf = map(lambda x: cosine_similarity(description_transform, x), tf_description)

In [79]:
description_transform

<541909x2032 sparse matrix of type '<class 'numpy.float64'>'
	with 2199400 stored elements in Compressed Sparse Row format>

In [81]:
df.shape

(541909, 3)

In [83]:
# check similarity between two vectors
print(df.iloc[0])
print(df.iloc[1])

Description                   white hanging heart t-light holder
vectors          (0, 1992)\t0.4516249418581932\n  (0, 1074)\t...
tfidf            (0, 1992)\t0.4516249418581932\n  (0, 1074)\t...
Name: 0, dtype: object
Description                                  white metal lantern
vectors          (0, 1992)\t0.4516249418581932\n  (0, 1074)\t...
tfidf            (0, 1992)\t0.4516249418581932\n  (0, 1074)\t...
Name: 1, dtype: object


In [84]:
df.iloc[0]['tfidf'].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
cosine_similarity(df.iloc[0]['tfidf'].todense(), df.iloc[1]['tfidf'].todense())