# Intro to Recommender Systems Lab

Complete the exercises below to solidify your knowledge and understanding of recommender systems.

For this lab, we are going to be putting together a user similarity based recommender system in a step-by-step fashion. Our data set contains customer grocery purchases, and we will use similar purchase behavior to inform our recommender system. Our recommender system will generate 5 recommendations for each customer based on the purchases they have made.

In [None]:
#Libraries
#Dataframe and arrays
import pandas as pd
import numpy as np


from scipy.spatial.distance import pdist, squareform

In [None]:
df = pd.read_excel('../data/online_fashion.xlsx')

In [None]:
df.isna().sum()

In [None]:
df.shape

In [None]:
df.Quantity.unique()

In [None]:
df.sort_values(by='Quantity')

In [None]:
df.isnull().sum() / df.shape[0] * 100.00

In [None]:
df.info()

In [None]:
df.InvoiceDate.min()

In [None]:
df.InvoiceDate.max()

## Decide what I want to drop

In [None]:
#Country = Unspecified 
#CustomerID
#Description

#####Price
#Unit price of 0 (zero)

#####Quantity
#Massive negative values

#####Items
#POSTAGE
#DOTCOM POSTAGE

In [None]:
df['Rev'] = df['Quantity']*df['UnitPrice']

In [None]:
df.head()

In [None]:
#Top 10 Countries
df_top_countries_stg = df.groupby(['Country'])['Rev'].agg('sum')
df_top_countries = df_top_countries_stg.sort_values(ascending=False).head(10).to_frame()
df_top_countries

In [None]:
df_top_countries = df_top_countries.reset_index()


In [None]:
top_country_rev = []
for i in df_top_countries['Country'][:3]:
    top_country_rev.append(i)
top_country_rev    

In [None]:
#Top 10 Description by Revenue - All Countries
df_top_items_rev = df.groupby(['Description'])['Rev'].agg('sum')
df_top_items_rev.sort_values(ascending=False).head(10)

In [None]:
#Top 10 Description by Quantity - All Countries
df_top_items_vol = df.groupby(['Description'])['Quantity'].agg('sum')
df_top_items_vol.sort_values(ascending=False).head(10)

In [None]:



for i in top_country_rev:
    print(df.loc[df['Country'] == i].groupby(['Description'])['Quantity'].agg('sum').sort_values(ascending=False).head(3))
    

In [None]:
df_days_week = df

In [None]:
df_days_week.info()

In [None]:
df_days_week['Weekday'] = df_days_week['InvoiceDate'].dt.day_name()
df_days_week.head()

In [None]:
#customers with more than one country
df_cust_country = df.groupby(['CustomerID','Country']).count()
#df_cust_country.to_frame
df_cust_country

In [None]:
#Revenue by Day of Week - All Countries
df_day_of_week_rev = df_days_week.groupby(['Weekday'])['Rev'].agg('sum')
df_day_of_week_rev.sort_values(ascending=False)#.head(10)

In [None]:
df.head()

In [106]:
#Customers with Negative Qty
#Identify Customer/ Product mix that are negative - i.e. potential returns from outside reporting period

df_cust_neg_qty = df.groupby(['CustomerID', 'StockCode'])['Quantity'].agg('sum').to_frame()
df_cust_neg_qty.reset_index(inplace=True)
df_cust_neg_qty = df_cust_neg_qty.loc[(df_cust_neg_qty['Quantity'] <= 0)]
df_cust_neg_qty

df_new = df[~(df['StockCode'].isin(df_cust_neg_qty.StockCode) & df['CustomerID'].isin(df_cust_neg_qty.CustomerID))]

df_new.shape

(219610, 10)

In [None]:
#Top Returning Customers
df_top_returners = df.groupby(['CustomerID'])['Quantity'].agg('sum').to_frame()
df_top_returners.reset_index(inplace=True)
df_top_returners = df_top_returners.loc[(df_top_returners['Quantity'] < 0)].sort_values(by='Quantity', ascending=True)
df_top_returners

In [None]:
#Min Max Mode Median for Each StockCode
#HELP!!!!
df_product_price_variance = df.groupby(['StockCode'])['UnitPrice'].agg('min')
df_product_price_variance

In [None]:
#Top Invoice Values by Rev
df_top_invoices_rev = df.groupby(['InvoiceNo', 'CustomerID'])['Rev'].agg('sum').to_frame()
df_top_invoices_rev.reset_index(inplace=True)
df_top_invoices_rev = df_top_invoices_rev.sort_values(by='Rev', ascending=False)
df_top_invoices_rev

In [None]:
#Top Invoice Values by Qty
df_top_invoices_qty = df.groupby(['InvoiceNo', 'CustomerID'])['Quantity'].agg('sum').to_frame()
df_top_invoices_qty.reset_index(inplace=True)
df_top_invoices_qty = df_top_invoices_qty.sort_values(by='Quantity', ascending=False)
df_top_invoices_qty

In [None]:
#Top Customer by Rev
df_top_customer_rev = df.groupby(['CustomerID'])['Rev'].agg('sum').to_frame()
df_top_customer_rev.reset_index(inplace=True)
df_top_customer_rev = df_top_customer_rev.sort_values(by='Rev', ascending=False)
df_top_customer_rev['CustomerID'] = df_top_customer_rev['CustomerID'].astype(int)
df_top_customer_rev

In [None]:
#Find Min, Max, Mean of StockCodes
df_item_prices = df[['StockCode', 'UnitPrice']]
df_item_prices.groupby(by='StockCode').agg([min, max, 'mean'])

In [None]:
#Find Strange StockCodes
df_unique_stock_codes = df['StockCode'].unique()
df_unique_stock_codes = pd.DataFrame(df_unique_stock_codes)
df_unique_stock_codes['len'] = df_unique_stock_codes[0].astype(str).str.len()
df_unique_stock_codes.rename(columns={0:'StockCode'}, inplace=True)
df_unique_stock_codes.sort_values(by='len')
df_unique_stock_codes['len'] = df_unique_stock_codes['len'].astype(int)
df_unique_stock_codes.loc[(df_unique_stock_codes['len'] >= 8) | (df_unique_stock_codes['len'] <= 4)].sort_values(by='len')

In [None]:

df.loc[df['StockCode'] == 'C2']

In [None]:
#stockcodes to drop
df_stock_to_drop_stg = df_unique_stock_codes.loc[(df_unique_stock_codes['len'] >= 8) | (df_unique_stock_codes['len'] <= 4)].sort_values(by='len')
df_stock_to_drop_stg.reset_index(drop=True)
df_stock_to_drop = pd.concat([df_stock_to_drop_stg.iloc[0:10], df_stock_to_drop_stg.iloc[-6:], df_stock_to_drop_stg.iloc[-9:-8] ])
df_stock_to_drop

In [None]:
#Find Strange Descriptions
df_unique_descriptions = df['Description'].unique()
df_unique_descriptions = pd.DataFrame(df_unique_descriptions)
df_unique_descriptions['len'] = df_unique_descriptions[0].astype(str).str.len()
df_unique_descriptions.rename(columns={0:'Description'}, inplace=True)
df_unique_descriptions.sort_values(by='len')
df_unique_descriptions['len'] = df_unique_descriptions['len'].astype(int)
df_unique_descriptions.loc[(df_unique_descriptions['len'] >= 36) | (df_unique_descriptions['len'] <= 11)].sort_values(by='len', ascending=False)

In [None]:
#Find descriptions with negative works like 'damaged' 'fees' 'mouldy', 'faulty', 'wet', 'lost', 'missing','wrong', 'broke'

In [None]:
#Get word frequency to assist with categorisation
#errors out as some descriptons are rubbish
#search lookup string to find counterpart part II: qwerty1wd

from collections import defaultdict

import pandas as pd

text_list = []

#need to get unique descriptions

df_uniq_descs = df['Description'].unique()

for i in df_uniq_descs[:395]:
    text_list.append(i)

word_freq = defaultdict(int)

for text in text_list:
    for word in text.split():
        word_freq[word] += 1

pd.DataFrame.from_dict(word_freq, orient='index') \
.sort_values(0, ascending=False) \
.rename(columns={0: 'abs_freq'})
    
    
    

In [None]:
#same as above - this works too!
#wrod count
#search lookup string to find counterpart part I: qwerty1wd
text_list = []

#need to get unique descriptions

word_freq = defaultdict(int)

for text in df['Description'][:395].unique():
    for word in text.split():
        word_freq[word] += 1

pd.DataFrame.from_dict(word_freq, orient='index') \
.sort_values(0, ascending=False) \
.rename(columns={0: 'abs_freq'})
    

In [None]:
len(df['StockCode'].unique())

In [None]:
len(df['CustomerID'].unique())

In [None]:
#Finding Customers with more than 1 Country listed
df_customer_stg = df.groupby('CustomerID')['Country'].unique()
df_customer_stg.loc[df_customer_stg.apply(lambda x:len(x)>1)]

In [None]:
pd.DataFrame(df_customer_stg)

In [None]:
#Finding StockCodes with  more than 1 Description listed
df_inventory_stg = df.groupby('StockCode')['Description'].unique()
df_inventory_stg.loc[df_inventory_stg.apply(lambda x:len(x)>1)]

#greater than 1 = 1324
#greater than 2 = 255
#greater than 3 = 70
#greater than 4 = 20
#greater than 5 = 6

In [None]:
#Find Cusomters who have returned items that weren't purchased in this reporting period
df_customers_returning_non_purchased_stg = df.groupby(['CustomerID', 'StockCode'])['Quantity'].agg('sum')
df_customers_returning_non_purchased = df_customers_returning_non_purchased_stg[lambda x: x<=0]
df_customers_returning_non_purchased = pd.DataFrame(df_customers_returning_non_purchased)
df_customers_returning_non_purchased

In [None]:
#Calculate Weighted Average Price (AWP) Part I
df_awp_trial = df[['StockCode','Quantity','Rev']]
df_awp_trial.groupby('StockCode').agg({'Quantity': ['sum'], 'Rev': ['sum']})
##need to divide SUM by QTY to give AWP


In [None]:
#Calculate Weighted Average Price (AWP) PArt II
#df_AWP_stg = df.groupby(['StockCode'])['Quantity','UnitPrice'].agg('sum',count)
#df_AWP_stg


df.groupby('StockCode').agg({'Quantity': ['sum','count'], 'UnitPrice': ['sum','count']})

# sum(qty*unit_price - by row) / sum(total Qty)


#df_AWP = df_AWP_stg[lambda x: x<=0]
#df_AWP = pd.DataFrame(df_AWP)
#df_AWP

In [None]:
#Calculate Weighted Average Price (AWP) PArt III
grouped = df.groupby('Date')

def wavg(group):
    d = group['value']
    w = group['wt']
    return (d * w).sum() / w.sum()

grouped.apply(wavg)

In [51]:
df.groupby(['StockCode','UnitPrice'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002786386B8D0>


In [None]:
def tagforremoval_negqty(row):
    '''
    Input: Row (which will be slicked into columns)
    Output: 1 or 0 depening on if row is to be deleted
    
    The purpose of this function is to tag rows for deletion.
    '''
    if row['CustomerID'].isin(df_cust_neg_qty.CustomerID) & row['StockCode'].isin(df_cust_neg_qty.StockCode):
        return 1
    else:
        return 0
