# Recommender System | Revenue Potential 

Complete the exercises below to solidify your knowledge and understanding of recommender systems.

For this lab, we are going to be putting together a user similarity based recommender system in a step-by-step fashion. Our data set contains customer grocery purchases, and we will use similar purchase behavior to inform our recommender system. Our recommender system will generate 5 recommendations for each customer based on the purchases they have made.

In [49]:
##Libraries
#Dataframe & Arrays
import pandas as pd
import numpy as np

#SCiPy for Cluster Distance Analysis
from scipy.spatial.distance import pdist, squareform

In [2]:
#df = pd.read_excel('../data/online_fashion.xlsx')

In [26]:
# Load the cleaned transaction data; later cells read its CustomerID, StockCode and Quantity columns.
df = pd.read_csv('../data/cleaned_df2.csv')

In [27]:
# Count missing CustomerID values (the column is cast to int in the next cell).
df['CustomerID'].isna().sum()

0

In [28]:
# Store CustomerID as integers; astype(int) raises if the column still contains missing values.
df['CustomerID'] = df['CustomerID'].astype(int)


In [29]:
# (rows, columns) of the loaded transaction data.
df.shape

(298407, 11)

In [51]:
##Just weighted average price

In [7]:
#data.rename(columns={'Description':'ProductName'}, inplace=True)

## Step 1: Create a data frame that contains the total quantity of each product purchased by each customer.

You will need to group by CustomerID and ProductName and then sum the Quantity field.

In [37]:
# Total quantity of each product (StockCode) bought by each customer,
# as a one-column frame indexed by (CustomerID, StockCode).
grouped = (
    df.groupby(['CustomerID', 'StockCode'])['Quantity']
      .sum()
      .to_frame()
)
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity
CustomerID,StockCode,Unnamed: 2_level_1
12347,16008,24
12347,17021,36
12347,20665,6
12347,20719,40
12347,20780,12


## Step 2: Use the `pivot_table` method to create a product by customer matrix.

The rows of the matrix should represent the products, the columns should represent the customers, and the values should be the quantities of each product purchased by each customer. You will also need to replace nulls with zeros, which you can do using the `fillna` method.

In [38]:
# Product-by-customer matrix: one row per StockCode, one column per CustomerID,
# holding the summed Quantity (0 where the customer never bought the product).
matrix = grouped.pivot_table(
    values='Quantity',
    index='StockCode',
    columns='CustomerID',
    aggfunc='sum',
    fill_value=0,
)
matrix

CustomerID,12347,12348,12349,12350,12352,12353,12354,12355,12356,12357,...,18272,18273,18276,18277,18278,18280,18281,18282,18283,18287
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10120,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10123C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10124A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10124G,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10125,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Step 3: Create a customer similarity matrix using `squareform` and `pdist`. For the distance metric, choose "euclidean."

In [39]:
# pdist compares the ROWS of its input, so the product-by-customer matrix is transposed
# to put one customer on each row (otherwise the distances would be between products).
# pdist returns a condensed 1-D array of pairwise distances; squareform expands it into
# the full square matrix, which is then labelled with the CustomerIDs on both axes.
dist_matrix = pd.DataFrame(
    data=squareform(pdist(matrix.T, metric='euclidean')),
    index=matrix.columns,
    columns=matrix.columns,
)
dist_matrix

CustomerID,12347,12348,12349,12350,12352,12353,12354,12355,12356,12357,...,18272,18273,18276,18277,18278,18280,18281,18282,18283,18287
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12347,0.000000,635.655567,401.766101,394.970885,394.313327,392.573560,401.880579,347.997126,447.187880,490.658741,...,474.527133,400.492197,378.457395,392.593938,392.802749,392.149206,393.146283,394.717621,405.086411,451.182890
12348,635.655567,0.000000,569.817515,564.191457,563.220206,562.003559,569.092260,540.118506,574.054875,637.457450,...,617.243874,567.563212,559.489053,562.017793,562.419772,562.105862,562.403770,563.897154,554.340148,595.815408
12349,401.766101,569.817515,0.000000,106.113147,102.239914,95.310020,134.773885,150.943698,300.444670,306.065352,...,274.779912,123.951603,98.590060,95.393920,97.365292,95.911417,97.642204,105.280578,158.874164,283.591255
12350,394.970885,564.191457,106.113147,0.000000,63.820060,52.000000,108.967885,128.093716,304.018092,301.502902,...,271.834508,94.783965,57.792733,52.153619,56.320511,53.094256,56.160484,69.541355,142.313035,274.218891
12352,394.313327,563.220206,102.239914,63.820060,0.000000,40.112342,103.474635,123.745707,301.734983,303.501236,...,270.279485,88.820043,47.381431,40.311289,45.574115,41.521079,45.376205,61.163715,139.168962,272.214989
12353,392.573560,562.003559,95.310020,52.000000,40.112342,0.000000,97.005155,118.016948,299.578037,300.599401,...,268.875436,80.746517,29.597297,16.000000,26.608269,18.841444,26.267851,48.703183,134.361453,269.688709
12354,401.880579,569.092260,134.773885,108.967885,103.474635,97.005155,0.000000,152.032891,314.017515,311.765938,...,285.306852,125.259730,100.229736,97.087589,99.388128,97.555113,99.287461,107.433700,156.572667,286.185255
12355,347.997126,540.118506,150.943698,128.093716,123.745707,118.016948,152.032891,0.000000,321.090330,321.272470,...,293.254156,142.211111,99.859902,118.152444,120.049990,118.570654,119.974997,126.475294,173.726797,271.418496
12356,447.187880,574.054875,300.444670,304.018092,301.734983,299.578037,314.017515,321.090330,0.000000,414.003623,...,392.184905,310.230559,300.996678,299.964998,300.717475,300.129972,300.687545,303.247424,320.365416,403.057068
12357,490.658741,637.457450,306.065352,301.502902,303.501236,300.599401,311.765938,321.272470,414.003623,0.000000,...,374.440917,311.486757,302.291250,301.264004,301.589124,301.083045,300.069992,304.703134,325.534944,395.378300


In [40]:
# Raw distances are hard to interpret, so turn each one into a similarity score
# with 1 / (1 + distance): a distance of 0 maps to 1.0 and the score shrinks
# towards 0 as the distance grows. The closer to 1, the more similar the customers.
dist_norm = 1 / (dist_matrix + 1)
dist_norm

CustomerID,12347,12348,12349,12350,12352,12353,12354,12355,12356,12357,...,18272,18273,18276,18277,18278,18280,18281,18282,18283,18287
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12347,1.000000,0.001571,0.002483,0.002525,0.002530,0.002541,0.002482,0.002865,0.002231,0.002034,...,0.002103,0.002491,0.002635,0.002541,0.002539,0.002544,0.002537,0.002527,0.002463,0.002211
12348,0.001571,1.000000,0.001752,0.001769,0.001772,0.001776,0.001754,0.001848,0.001739,0.001566,...,0.001617,0.001759,0.001784,0.001776,0.001775,0.001776,0.001775,0.001770,0.001801,0.001676
12349,0.002483,0.001752,1.000000,0.009336,0.009686,0.010383,0.007365,0.006581,0.003317,0.003257,...,0.003626,0.008003,0.010041,0.010374,0.010166,0.010319,0.010138,0.009409,0.006255,0.003514
12350,0.002525,0.001769,0.009336,1.000000,0.015427,0.018868,0.009094,0.007746,0.003278,0.003306,...,0.003665,0.010440,0.017009,0.018813,0.017446,0.018486,0.017495,0.014176,0.006978,0.003633
12352,0.002530,0.001772,0.009686,0.015427,1.000000,0.024324,0.009572,0.008016,0.003303,0.003284,...,0.003686,0.011133,0.020669,0.024206,0.021471,0.023518,0.021563,0.016087,0.007134,0.003660
12353,0.002541,0.001776,0.010383,0.018868,0.024324,1.000000,0.010204,0.008402,0.003327,0.003316,...,0.003705,0.012233,0.032683,0.058824,0.036221,0.050400,0.036673,0.020119,0.007388,0.003694
12354,0.002482,0.001754,0.007365,0.009094,0.009572,0.010204,1.000000,0.006535,0.003174,0.003197,...,0.003493,0.007920,0.009879,0.010195,0.009961,0.010147,0.009971,0.009222,0.006346,0.003482
12355,0.002865,0.001848,0.006581,0.007746,0.008016,0.008402,0.006535,1.000000,0.003105,0.003103,...,0.003398,0.006983,0.009915,0.008393,0.008261,0.008363,0.008266,0.007845,0.005723,0.003671
12356,0.002231,0.001739,0.003317,0.003278,0.003303,0.003327,0.003174,0.003105,1.000000,0.002410,...,0.002543,0.003213,0.003311,0.003323,0.003314,0.003321,0.003315,0.003287,0.003112,0.002475
12357,0.002034,0.001566,0.003257,0.003306,0.003284,0.003316,0.003197,0.003103,0.002410,1.000000,...,0.002664,0.003200,0.003297,0.003308,0.003305,0.003310,0.003321,0.003271,0.003062,0.002523


## Step 4: Check your results by generating a list of the top 5 most similar customers for a specific CustomerID.

In [41]:
# Rank all customers by similarity to customer 12350 and keep the six highest scores.
# Six rather than five because the customer's own entry (similarity 1.0) is in the list.
Top5_cust200 = dist_norm.loc[:, 12350].sort_values(ascending=False).iloc[:6]
Top5_cust200

CustomerID
12350    1.000000
15180    0.020469
15422    0.020460
15435    0.019740
16484    0.019289
17956    0.019289
Name: 12350, dtype: float64

## Step 5: From the data frame you created in Step 1, select the records for the list of similar CustomerIDs you obtained in Step 4.

In [42]:
# Skip position 0 of the ranked list: it is expected to be the chosen customer itself
# (similarity 1.0), and only the neighbours' purchases are wanted here.
# The remaining CustomerIDs select rows on the first level of grouped's
# (CustomerID, StockCode) index.

similar = grouped.loc[(Top5_cust200.index[1:],)]
similar

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity
CustomerID,StockCode,Unnamed: 2_level_1
15180,22112,3
15180,22113,4
15180,22114,4
15180,22348,12
15180,22835,4
15180,72741,9
15422,21218,6
15422,21531,6
15422,21844,6
15422,22199,4


## Step 6: Aggregate those customer purchase records by ProductName, sum the Quantity field, and then rank them in descending order by quantity.

This will give you the total number of each product purchased by the 5 most similar customers to the customer you selected in order from most purchased to least.

In [43]:
# Total quantity per product across the similar customers, most-purchased first.
agg_similar = (
    similar
    .groupby('StockCode')[['Quantity']]
    .sum()
    .sort_values(by='Quantity', ascending=False)
)
agg_similar

Unnamed: 0_level_0,Quantity
StockCode,Unnamed: 1_level_1
22348,28
72741,9
21531,8
21218,7
21844,6
22417,6
22964,4
22835,4
22113,4
22114,4


## Step 7: Filter the list for products that the chosen customer has not yet purchased and then recommend the top 5 products with the highest quantities that are left.

- Merge the ranked products data frame with the customer product matrix on the ProductName field.
- Filter for records where the chosen customer has not purchased the product.
- Show the top 5 results.

In [44]:
# Put the neighbours' ranked totals next to customer 12350's own purchase quantities.
# The customer's column is named 'Cust_200' up front instead of being renamed in place.
products = pd.concat(
    [agg_similar, matrix[12350].rename('Cust_200')],
    axis=1,
    sort=False,
)
products

Unnamed: 0,Quantity,Cust_200
22348,28.0,24
72741,9.0,0
21531,8.0,0
21218,7.0,0
21844,6.0,0
22417,6.0,0
22964,4.0,0
22835,4.0,0
22113,4.0,0
22114,4.0,0


In [45]:
# Keep products the neighbours bought (Quantity > 0) that the customer has not (Cust_200 == 0);
# the frame is already ordered by neighbour quantity, so the first five are the recommendations.
Top5rec = products.loc[(products['Quantity'] > 0) & (products['Cust_200'] == 0)].head(5)
Top5rec

Unnamed: 0,Quantity,Cust_200
72741,9.0,0
21531,8.0,0
21218,7.0,0
21844,6.0,0
22417,6.0,0


## Step 8: Now that we have generated product recommendations for a single user, put the pieces together and iterate over a list of all CustomerIDs.

- Create an empty dictionary that will hold the recommendations for all customers.
- Create a list of unique CustomerIDs to iterate over.
- Iterate over the customer list performing steps 4 through 7 for each and appending the results of each iteration to the dictionary you created.

In [46]:
# CustomerID -> list of recommended StockCodes, filled in by the loop in the next cell.
recommendations = dict()
# Every CustomerID that has a column in the similarity matrix.
unique_ID = dist_norm.columns.unique()

In [47]:
# Repeat Steps 4-7 for every customer and store up to five recommended StockCodes each.
for customer in unique_ID:
    # Step 4: the five most similar OTHER customers. The customer's own entry is removed
    # by label rather than by position: a neighbour with an identical purchase history
    # also scores 1.0 and may sort ahead of the customer, in which case skipping
    # "the first row" would drop that neighbour and keep the customer itself.
    head = dist_norm[customer].drop(customer).sort_values(ascending=False).head(5)

    # Step 5: purchase records of those neighbours.
    similar = grouped.loc[(head.index,)]

    # Step 6: total quantity per product across the neighbours, most-purchased first.
    agg_similar = (
        similar.groupby('StockCode')[['Quantity']].sum()
        .sort_values(by='Quantity', ascending=False)
    )

    # Step 7: keep products the neighbours bought that this customer has not, top five.
    products = pd.concat(
        [agg_similar, matrix[customer].rename('customer')], axis=1, sort=False
    )
    recommendations[customer] = list(
        products.query('Quantity > 0 and customer == 0').head(5).index
    )
    

In [19]:
# Show the CustomerID -> recommended StockCodes mapping built by the loop above.
recommendations

{12347: ['23077', '22418', '22614', '22029', '84375'],
 12348: ['15056N', '22693', '21829', '22384', '20727'],
 12349: ['85194S', '22265', '22851', '22322', '72741'],
 12350: ['72741', '21531', '21218', '21844', '22417'],
 12352: ['22915', '21733', '23321', '23322', '22469'],
 12353: ['22485', '22802', '22803', '22982', '23073'],
 12354: ['20979', '21245', '20674', '22993', '22962'],
 12355: ['71477', '21167', '21381', '21380', '20829'],
 12356: ['16161P', '21210', '22961', '22952', '22986'],
 12357: ['72351B', '85034B', '21108', '72349B', '72225C'],
 12358: ['16008', '85015', '84946', '15044D', '85048'],
 12359: ['16156S', '23170', '22915', '84947', '22921'],
 12360: ['22631', '22993', '22962', '22966', '22659'],
 12361: ['20979', '23371', '22352', '22138', '22617'],
 12362: ['79190B', '85040A', '84569D', '82552', '22973'],
 12363: ['22961', '22909', '21975', '21880', '47591D'],
 12364: ['23077', '23080', '21231', '23076', '21232'],
 12365: ['22485', '22802', '22803', '22982', '23073'

## Step 9: Store the results in a Pandas data frame. The data frame should have a column for Customer ID and then a column for each of the 5 product recommendations for each customer.

In [48]:
# One row per CustomerID (the dict keys become the index) and one column per recommendation slot.
rec_columns = [f'rec{rank}' for rank in range(1, 6)]
recommendations_df = pd.DataFrame.from_dict(
    recommendations,
    orient='index',
    columns=rec_columns,
)
recommendations_df

Unnamed: 0,rec1,rec2,rec3,rec4,rec5
12347,23077,22418,22614,22029,84375
12348,15056N,22693,21829,22384,20727
12349,85194S,22265,22851,22322,72741
12350,72741,21531,21218,21844,22417
12352,22915,21733,23321,23322,22469
12353,22485,22802,22803,22982,23073
12354,20979,21245,20674,22993,22962
12355,71477,21167,21381,21380,20829
12356,16161P,21210,22961,22952,22986
12357,72351B,85034B,21108,72349B,72225C


## Step 10: Change the distance metric used in Step 3 to something other than euclidean (correlation, cityblock, cosine, jaccard, etc.). Regenerate the recommendations for all customers and note the differences.

In [22]:
# Distance metrics (scipy pdist names) to compare against the euclidean baseline.
metrics = [
    'cityblock',
    'correlation',
    'cosine',
    'dice',
    'euclidean',
    'hamming',
    'jaccard',
]

In [25]:
# One customer-by-customer distance matrix per metric, stored under the key 'df_<metric>'.
dfname = 'df_{}'
df_dict_diff_metrics = {}
for metric_name in metrics:
    # Step 3 with the metric swapped in: pairwise distances between customers.
    distances = squareform(pdist(matrix.T, metric=metric_name))
    df_dict_diff_metrics[dfname.format(metric_name)] = pd.DataFrame(
        distances,
        index=matrix.columns,
        columns=matrix.columns,
    )
    # TODO: for each metric, repeat Steps 4-8 (recommendations per customer),
    # Step 9a (store them in a data frame) and Step 9b (attach a monetary value).

print("All distance metrics loaded you may proceed")

All distance metrics loaded you may proceed


In [67]:
#Create a price lookup table
df_price_lookup = pd.DataFrame(df['StockCode'].unique())
df_price_lookup['ModePrice'] = 1.5
df_price_lookup.columns=['StockCode','ModePrice']
df_price_lookup

Unnamed: 0,StockCode,ModePrice
0,22749,1.5
1,22310,1.5
2,84969,1.5
3,22913,1.5
4,22912,1.5
5,22914,1.5
6,21756,1.5
7,21724,1.5
8,21883,1.5
9,10002,1.5


In [58]:
# Step 10: towards calculating the value of these recommendations.
# Print the first two customers' recommendation rows to inspect what iterrows() yields.
for index, row in recommendations_df.iloc[:2].iterrows():
    print(row)


rec1    23077
rec2    22418
rec3    22614
rec4    22029
rec5    84375
Name: 12347, dtype: object
rec1    15056N
rec2     22693
rec3     21829
rec4     22384
rec5     20727
Name: 12348, dtype: object


In [112]:
# Look up the price of each customer's first recommendation.
# NOTE(review): pricelookup is only defined in a later cell, so on a fresh kernel this cell
# fails with NameError when the notebook is run top to bottom.
recommendations_df['rec1value'] = recommendations_df['rec1'].apply(pricelookup)

In [128]:
# Preview the first rows, including the rec1value column added in the previous cell.
recommendations_df.head()

Unnamed: 0,rec1,rec2,rec3,rec4,rec5,rec1value
12347,23077,22418,22614,22029,84375,1.5
12348,15056N,22693,21829,22384,20727,1.5
12349,85194S,22265,22851,22322,72741,1.5
12350,72741,21531,21218,21844,22417,1.5
12352,22915,21733,23321,23322,22469,1.5


In [135]:
def pricelookup(row):
    """Return the ModePrice recorded for one stock code in ``df_price_lookup``.

    Parameters
    ----------
    row : str
        The StockCode to look up. (The name is historical: the function receives a
        single cell value from ``Series.apply``, not a whole row.)

    Returns
    -------
    float or None
        The price as a Python float when exactly one row of ``df_price_lookup``
        matches the stock code. Otherwise a "Failed on row ..." message is printed
        and ``None`` is returned; this covers an unknown code, a code that matches
        several rows, and a price that cannot be converted to float.

    Notes
    -----
    Only the expected lookup failures are handled. Programming errors such as a
    missing ``df_price_lookup`` or a missing column are left to propagate instead
    of being hidden behind a bare ``except``.
    """
    matches = df_price_lookup.loc[df_price_lookup['StockCode'] == row, 'ModePrice']

    # Zero or several matches means there is no single price to return.
    if len(matches) != 1:
        print(f"Failed on row {row}")
        return None

    try:
        # Take the scalar explicitly; calling float() on a one-element Series is deprecated.
        return float(matches.iloc[0])
    except (TypeError, ValueError):
        print(f"Failed on row {row}")
        return None

In [140]:
# Total possible revenue per customer = sum of the five recommendation prices.
# The price columns are selected by name rather than "the last five columns": a positional
# slice would include TotalPossCustRev itself (and drop rec1value) when this cell is re-run.
# Requires the rec1value ... rec5value columns to exist already.
value_columns = [f'rec{rank}value' for rank in range(1, 6)]
recommendations_df['TotalPossCustRev'] = recommendations_df[value_columns].sum(axis=1)

### Next step: start monitoring how often recommendations are accepted; the acceptance (success) rate is a valuable metric.

In [127]:
# Count rows of the price lookup table that are exact duplicates of an earlier row.
df_price_lookup.duplicated().sum()

0

In [95]:
# Preview the first rows of the price lookup table.
df_price_lookup.head()

Unnamed: 0,StockCode,ModePrice
0,22749,1.5
1,22310,1.5
2,84969,1.5
3,22913,1.5
4,22912,1.5


In [None]:
# Add a '<rec>value' price column next to each of the five recommendation columns.
for rec_col in ['rec1', 'rec2', 'rec3', 'rec4', 'rec5']:
    recommendations_df[rec_col + 'value'] = recommendations_df[rec_col].apply(pricelookup)

In [133]:
# Show the recommendations together with the looked-up price of each one.
recommendations_df

Unnamed: 0,rec1,rec2,rec3,rec4,rec5,rec1value,rec2value,rec3value,rec4value,rec5value
12347,23077,22418,22614,22029,84375,1.5,1.5,1.5,1.5,1.5
12348,15056N,22693,21829,22384,20727,1.5,1.5,1.5,1.5,1.5
12349,85194S,22265,22851,22322,72741,1.5,1.5,1.5,1.5,1.5
12350,72741,21531,21218,21844,22417,1.5,1.5,1.5,1.5,1.5
12352,22915,21733,23321,23322,22469,1.5,1.5,1.5,1.5,1.5
12353,22485,22802,22803,22982,23073,1.5,1.5,1.5,1.5,1.5
12354,20979,21245,20674,22993,22962,1.5,1.5,1.5,1.5,1.5
12355,71477,21167,21381,21380,20829,1.5,1.5,1.5,1.5,1.5
12356,16161P,21210,22961,22952,22986,1.5,1.5,1.5,1.5,1.5
12357,72351B,85034B,21108,72349B,72225C,1.5,1.5,1.5,1.5,1.5
