<a href="https://colab.research.google.com/github/tnewtont/ModCloth_Recommendation_System/blob/main/rsp_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import pickle
import sqlite3

In [4]:
# Making the utility matrix
def make_um(df):
    um = df.pivot_table(index = 'user_id', columns = 'item_id', values = 'rating')
    um.fillna(0, inplace = True)
    return um

In [14]:
def recommend_items(UM, user_LC, CS, model, kay):
    rec_items = np.zeros((len(UM.index), kay))

    user_dict = {}
    for u in range(len(user_LC.index)):
        user_dict[u] = user_LC.index[u]
    user_dict = dict((v,k) for k,v in user_dict.items())

    for user in user_LC.index:
        user_reshaped = np.array(user_LC.loc[user]).reshape((1,len(user_LC.loc[user])))
        items = model.kneighbors(user_reshaped, return_distance = False)
        rec_items[user_dict[user],:] = np.array(CS.columns)[items]

    rec_items_df = pd.DataFrame(rec_items, index = user_dict.keys())
    return rec_items_df

In [17]:
def obtain_all_user_recs(user_df, rec_model, generic_rec):

    # Obtain a series that contains the avg product rating from each user in the test dataset
    # By grouping, we can then automatically extract the unique user ID's
    grouped = user_df.groupby('user_id')['rating'].mean()

    # Store a list of each specific user
    user_list = grouped.index


    PI_mean_adj_dict = dict(zip(generic_rec.index, generic_rec['weighted_vals'])) # Item to weight
    PI_mean_adj_dict2 = dict((v,k) for k,v in PI_mean_adj_dict.items()) # Weight to item

    all_user_recs = {}

    for user in user_list:
        # Extract avg product rating from a specific user
        specific_user_mean = grouped[user]
        # Extract items reviewed by user from test dataset
        specific_user_actual_items = user_df.loc[user_df['user_id'] == user, 'item_id']

        # If a specific user's avg product rating is > 3 (i.e. 4 or 5), use the SVD model.
        # Otherwise, if a specified user's avg product rating is <= 3 (i.e. 3 or below) give generic recommendations instead

        if specific_user_mean > 3:
                # Extract the products recommended by the model
                items_pred = rec_model.loc[user]

                # Exclude products already purchased by the user
                final_items = list(set(items_pred) - set(specific_user_actual_items).intersection(set(items_pred)))
                final_items2 = [PI_mean_adj_dict[f] for f in final_items]
                final_items2.sort
                final_items3 = [PI_mean_adj_dict2[f] for f in final_items2][0:3]
                all_user_recs[user] = final_items3
        else:
                _generic_rec = set(generic_rec.index[0:5])
                final_generic_recs = list(_generic_rec - set(specific_user_actual_items).intersection(_generic_rec))[0:3]
                all_user_recs[user] = final_generic_recs
    return all_user_recs

In [19]:
def convert_recs_dict_to_df(users_recs_dict):
    df = pd.DataFrame(users_recs_dict.items(), columns = ['username', 'items_recommended'])
    df2 = pd.concat([df['username'], pd.DataFrame(df['items_recommended'].to_list()).add_prefix('product')], axis = 1)
    df2 = df2.rename(columns = {'product0': 'product_1', 'product1': 'product_2', 'product2': 'product_3'})
    return df2

In [5]:
# Load filtered dataframe
df = pd.read_csv('/content/df_modcloth_filtered.csv')
df

Unnamed: 0,item_id,user_id,rating,category
0,7443,Alex,4,Dresses
1,7443,carolyn.agan,3,Dresses
2,7443,Robyn,4,Dresses
3,7443,De,4,Dresses
4,7443,tasha,4,Dresses
...,...,...,...,...
93910,154797,BernMarie,5,Dresses
93911,77949,Sam,4,Bottoms
93912,67194,Janice,5,Dresses
93913,71607,amy,3,Outerwear


In [16]:
# Load dataframe containing generic recommendations
pop = pd.read_csv('/content/pop_items.csv')
pop.set_index('item_id', inplace = True)
pop

Unnamed: 0_level_0,mean,num_reviews,cat,weighted_vals
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34935,4.482247,1887,Tops,4228767.0
21296,4.171760,1636,Bottoms,3412696.0
32405,4.325829,1599,Dresses,3458637.0
32406,4.328648,1494,Dresses,3233016.0
32403,4.367199,1378,Dresses,3009552.0
...,...,...,...,...
153801,4.083333,24,Bottoms,49008.0
138414,4.000000,24,Tops,48000.0
153397,3.791667,24,Bottoms,0.0
153470,4.416667,24,Outerwear,52992.0


In [9]:
# Use make_um to create the utility matrix
um = make_um(df)
um

item_id,6454,7443,11960,16411,21296,22563,24853,27439,27590,28252,...,154505,154540,154543,154567,154661,154665,154794,154797,155293,155317
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Ferrari"")",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1dad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Chelle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Tree',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zurajohnson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zuzu_zoom,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
üá¶üá∫,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
üêª,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Since the ratings are ordinal/rank-based, we will use Spearman's correlation.

In [10]:
cs = um.corr(method = 'spearman')
cs

item_id,6454,7443,11960,16411,21296,22563,24853,27439,27590,28252,...,154505,154540,154543,154567,154661,154665,154794,154797,155293,155317
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6454,1.000000,0.068324,0.053686,0.039297,0.048203,0.039648,0.049449,0.061552,0.037637,0.040417,...,0.036821,0.019727,-0.000972,0.091684,0.061388,0.035897,0.046448,0.033179,0.100182,0.089478
7443,0.068324,1.000000,0.114484,0.112813,0.120469,0.102194,0.095090,0.086744,0.100983,0.106817,...,0.034368,0.027694,0.026820,0.028728,0.027750,0.033541,0.026664,0.039383,0.047621,0.052852
11960,0.053686,0.114484,1.000000,0.113169,0.108872,0.117086,0.116585,0.093671,0.088930,0.106381,...,0.033073,0.019365,0.029695,0.057957,0.042516,0.052956,0.016499,0.029208,0.047048,0.036908
16411,0.039297,0.112813,0.113169,1.000000,0.123298,0.095751,0.088756,0.098161,0.087320,0.089824,...,0.025691,0.014570,0.034105,0.037542,0.038708,0.033576,0.017944,0.034430,0.023444,0.028778
21296,0.048203,0.120469,0.108872,0.123298,1.000000,0.102865,0.104940,0.094092,0.106626,0.114525,...,0.025251,0.019916,0.028853,0.032007,0.028967,0.032007,0.009642,0.035231,0.043449,0.035802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154665,0.035897,0.033541,0.052956,0.033576,0.032007,0.041016,0.042434,0.055937,0.040290,0.051249,...,0.068886,0.050126,0.028265,0.067266,0.101110,1.000000,0.057316,0.062338,0.048699,0.073074
154794,0.046448,0.026664,0.016499,0.017944,0.009642,0.015646,0.016232,0.003680,0.016026,0.004859,...,0.028962,0.031861,-0.000622,0.028268,0.064432,0.057316,1.000000,0.053074,0.030922,0.022759
154797,0.033179,0.039383,0.029208,0.034430,0.035231,0.032642,0.038279,0.015623,0.010041,0.024459,...,0.042212,0.070115,0.026100,0.041222,0.022738,0.062338,0.053074,1.000000,0.068144,0.050468
155293,0.100182,0.047621,0.047048,0.023444,0.043449,0.045265,0.041067,0.043812,0.036580,0.043344,...,0.024475,0.054741,-0.000728,0.073508,0.026972,0.048699,0.030922,0.068144,1.000000,0.079871


We then calculate the linear combinations by utilizing matrix multiplication for each user that we will then use for our model to evaluate.

In [11]:
user_LC = um@cs
user_LC

item_id,6454,7443,11960,16411,21296,22563,24853,27439,27590,28252,...,154505,154540,154543,154567,154661,154665,154794,154797,155293,155317
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Ferrari"")",0.523910,0.660719,0.680375,0.527445,0.556202,0.637333,0.573613,0.596824,0.584918,0.592128,...,0.206865,0.192513,0.190472,0.287193,0.507716,0.594803,0.308654,0.246852,0.222885,0.343305
#,0.257196,0.469893,0.580747,0.502325,0.562046,0.495646,0.487415,0.441805,0.443010,0.528802,...,0.130669,0.147500,0.173448,0.199775,0.118633,0.202114,0.076976,0.277215,0.250661,0.215905
#1dad,0.081977,0.329556,0.233603,0.218380,0.208539,0.236790,0.246058,0.179568,0.244533,0.304895,...,0.049156,0.055773,0.065799,0.163360,0.120738,0.221442,-0.007833,0.043055,0.180251,-0.012253
'Chelle,0.180786,0.205820,0.219967,0.168732,0.223808,0.160789,0.208197,0.134193,0.138987,0.271256,...,0.087793,0.097335,0.113008,0.131992,0.149911,0.038773,0.112918,0.121311,0.144919,0.295192
'Tree',0.579958,0.552420,0.823034,0.629798,0.620359,0.759280,0.614793,0.606357,0.489773,0.675222,...,0.329849,0.165081,0.114989,0.271390,0.178396,0.364062,0.248744,0.433012,0.400893,0.312069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zurajohnson,0.594719,1.204901,1.332723,1.120946,1.102677,1.215751,1.057991,1.141124,1.119589,1.256869,...,0.372017,0.312016,0.494757,0.464318,0.264436,0.645407,0.350011,0.416707,0.707983,0.643968
zuzu_zoom,0.203460,0.254603,0.278410,0.218151,0.276479,0.230031,0.253305,0.208595,0.240862,0.262624,...,0.091326,0.139021,0.204057,0.121933,0.176589,0.188611,0.118634,0.235188,0.279690,0.230834
üá¶üá∫,0.292026,0.289367,0.319362,0.252313,0.282708,0.238571,0.301009,0.264456,0.212086,0.252274,...,0.303305,0.130404,0.070579,0.420195,0.203092,0.360832,0.308647,0.221147,0.394678,0.241177
üêª,0.166095,0.196127,0.265622,0.190461,0.197406,0.115139,0.192608,0.306783,0.117376,0.291602,...,-0.005329,0.231981,-0.004272,0.205942,0.113489,0.311588,0.265572,-0.005873,0.110286,0.079605


We want to make sure that our model's scope is wide enough to capture similar products. We build a k Nearest Neighbors model where the number of neighors, n, is 10.

In [12]:
nn = NearestNeighbors(n_neighbors = 10)
nn.fit(cs)

In [15]:
recommended_items = recommend_items(um, user_LC, cs, nn, 10)
recommended_items

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
"""Ferrari"")",44893.0,36172.0,77949.0,80427.0,34931.0,11960.0,114371.0,40899.0,106758.0,62482.0
#,59176.0,34931.0,77949.0,40899.0,11960.0,36172.0,80427.0,64745.0,114371.0,50405.0
#1dad,126098.0,36172.0,77949.0,11960.0,7443.0,114371.0,28252.0,40899.0,34931.0,113512.0
'Chelle,141880.0,146475.0,34931.0,80427.0,77949.0,140525.0,114371.0,36172.0,128359.0,67194.0
'Tree',84931.0,107794.0,77949.0,80427.0,34931.0,11960.0,114371.0,36172.0,40899.0,80841.0
...,...,...,...,...,...,...,...,...,...,...
zurajohnson,106758.0,77949.0,36172.0,80427.0,34931.0,11960.0,89044.0,89042.0,114371.0,50405.0
zuzu_zoom,126114.0,36172.0,77949.0,34931.0,140525.0,114371.0,11960.0,106758.0,40899.0,80427.0
üá¶üá∫,86074.0,144572.0,77949.0,36172.0,114371.0,140525.0,67194.0,114770.0,34931.0,40899.0
üêª,151870.0,140525.0,34931.0,113512.0,114770.0,77949.0,114371.0,36172.0,80427.0,137731.0


In [18]:
all_user_recs_dict = obtain_all_user_recs(df, recommended_items, pop)
all_user_recs_dict

{'"Ferrari")': [114371, 40899, 106758],
 '#': [40899, 114371, 50405],
 '#1dad': [114371, 40899, 113512],
 "'Chelle": [114371, 153397, 80427],
 "'Tree'": [114371, 40899, 80841],
 '(usually)': [21296, 32403, 32405],
 '-L': [114371, 106758, 153397],
 '.': [114371, 153397, 146475],
 '..': [114371, 106758, 36172],
 '01085': [148257, 106758, 36172],
 '0123annac': [21296, 32403, 32405],
 '02ambersmith': [21296, 32403, 32405],
 '02draper': [40899, 114371, 64745],
 '0311': [114371, 80427, 36172],
 '044de0c8': [50818, 40899, 50405],
 '0846': [114371, 80841, 80427],
 '0bazooka0': [40899, 50405, 64745],
 '0heatherstone0': [114371, 40899, 50405],
 '0k4sh1m0m0': [40899, 114371, 50405],
 '10097685jk': [114371, 50405, 59176],
 '100daysofrain': [50818, 114371, 40899],
 '10227sk1': [114371, 40899, 50405],
 '12.basch.09': [40899, 114371, 50405],
 '1234babysitter': [40899, 114371, 80841],
 '123jane123': [21296, 32403, 32405],
 '12sarahyoung': [114371, 50405, 80841],
 '13brokenroses': [114371, 40899, 59176

In [20]:
# Store our result as its own dataframe
all_users_recs_df = convert_recs_dict_to_df(all_user_recs_dict)
all_users_recs_df

Unnamed: 0,username,product_1,product_2,product_3
0,"""Ferrari"")",114371.0,40899.0,106758.0
1,#,40899.0,114371.0,50405.0
2,#1dad,114371.0,40899.0,113512.0
3,'Chelle,114371.0,153397.0,80427.0
4,'Tree',114371.0,40899.0,80841.0
...,...,...,...,...
43465,zurajohnson,114371.0,50405.0,80427.0
43466,zuzu_zoom,114371.0,40899.0,106758.0
43467,üá¶üá∫,21296.0,32403.0,32405.0
43468,üêª,114371.0,137731.0,113512.0


In [21]:
# We will also store our result as a SQL database
conn = sqlite3.connect('user_recs.sqlite')
all_users_recs_df.to_sql('recommendations', conn, if_exists = 'replace', index = False)

43470

In [22]:
pd.read_sql('SELECT * FROM recommendations', conn)

Unnamed: 0,username,product_1,product_2,product_3
0,"""Ferrari"")",114371.0,40899.0,106758.0
1,#,40899.0,114371.0,50405.0
2,#1dad,114371.0,40899.0,113512.0
3,'Chelle,114371.0,153397.0,80427.0
4,'Tree',114371.0,40899.0,80841.0
...,...,...,...,...
43465,zurajohnson,114371.0,50405.0,80427.0
43466,zuzu_zoom,114371.0,40899.0,106758.0
43467,üá¶üá∫,21296.0,32403.0,32405.0
43468,üêª,114371.0,137731.0,113512.0
