In [1]:
import pandas as pd
import json 
import numpy as np
import pathlib

# Read in test data from json format

In [2]:
test_data_dir = pathlib.Path('test_data/')
test_data_files = list(test_data_dir.glob('*.json'))

all_test_data = {}
for test_data_file in test_data_files: 
    with open(test_data_file) as fin:
        all_test_data[test_data_file.name.replace('.json', '')] = json.load(fin)
        
all_test_data['test_1'].keys()

dict_keys(['order_history', 'current_order'])

## Convert To Dataframe

In [3]:
table_values = []
user = 12345678912345
order_id = 98765432198765
order_num = 0
for test in all_test_data:
    test_order_history = all_test_data[test]['order_history']
    for order in test_order_history:
        order_num += 1 
        product_list = test_order_history[order]['products']
        dow = test_order_history[order]['order_dow']
        hour= test_order_history[order]['order_hour_of_day']
        days = test_order_history[order]['days_since_prior_order']
        for product in product_list:
            table_values.append([user, order_id, order, product, dow, hour, days])
        order_id+=1
    user +=1
   
test_data = pd.DataFrame(table_values, columns=['user_id', 'order_id', 'order', 'product_id', 'order_dow', 'order_hour_of_day', 'days_since_prior_order'])

In [4]:
test_data.shape

(16335, 7)

In [5]:
products_csv = pd.read_csv('products.csv')

In [6]:
test_data = pd.merge(test_data, products_csv, how = 'inner', on = 'product_id')

In [7]:
test_data.columns

Index(['user_id', 'order_id', 'order', 'product_id', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order', 'product_name',
       'aisle_id', 'department_id'],
      dtype='object')

In [8]:
test_data.drop(['product_id', 'aisle_id', 'department_id'], axis = 1, inplace = True)

In [9]:
test_data.shape

(16335, 7)

### Create User mean profile for clustering purposes 

In [10]:
test_user_mean = test_data.groupby('user_id')['order_dow','order_hour_of_day', 'days_since_prior_order'].agg(np.nanmean)
total_orders = test_data.groupby('user_id').order_id.nunique() # <- series, will need to be added to larger dataframe 
# Average number of products in each order 
num_products = test_data.groupby('user_id').product_name.nunique()

# add average number of products in each order to user_mean df
test_user_mean['num_orders'] = total_orders
test_user_mean['num_products'] = num_products
test_user_mean['avg_num_products_per_order'] = num_products / total_orders

# Add every product ever ordered for each user 
# Combine users products into a list specific to that user   
product_list = []
for product in test_data.groupby('user_id')['product_name']:
    product_list.append(' '.join(product[1]))

test_user_mean['Products'] = product_list   
    

  test_user_mean = test_data.groupby('user_id')['order_dow','order_hour_of_day', 'days_since_prior_order'].agg(np.nanmean)


In [11]:
# 
test_user_mean.to_csv('test_user_profile.csv')

In [12]:
# test_data.to_csv('project_order_history_test_data.csv')

### Get the current order products into a dataframe with user id and order id and list of products in one row

In [13]:
current_table_values = []
user = 12345678912345
order_id = 98765432198765
for test in all_test_data:
    test_current_order = all_test_data[test]['current_order']
    for order in test_current_order:
        product_list = test_current_order[order]['products']
        dow = test_current_order[order]['order_dow']
        hour= test_current_order[order]['order_hour_of_day']
        days = test_current_order[order]['days_since_prior_order']
        for product in product_list:
            current_table_values.append([user, order_id, order, product, dow, hour, days])
    user +=1
    order_id+=1
current_test_data = pd.DataFrame(current_table_values, columns=['user_id', 'order_id', 'order', 'product_id', 'order_dow', 'order_hour_of_day', 'days_since_prior_order'])

In [14]:
current_test_data = pd.merge(current_test_data, products_csv, how = 'inner', on = 'product_id')

In [15]:
current_test_data.drop(['product_id', 'aisle_id', 'department_id'], axis = 1, inplace = True)

## Get a list of the products in one row of the dataframe

In [16]:
products = current_test_data['product_name']

product_no_space = []
for product in products:
    product = product.replace(" ", "_")
    product_no_space.append(product)

# drop original column, replace it with one with no space
current_test_data.drop(['product_name'], axis=1)
current_test_data['product_name'] = product_no_space

# add product name to each user
name_list = []
for p_name in current_test_data.groupby('order_id')['product_name']:
    name_list.append(' '.join(p_name[1]))
    
order_id = current_test_data.groupby('order_id')['product_name'].agg('count').index
# user_id = current_test_data.groupby(['user_id'])['product_name'].agg('count').index
user_id = current_test_data['user_id'].drop_duplicates()
order_id = current_test_data['order_id'].drop_duplicates()
current_test_order_products = pd.DataFrame({'user_id': user_id, 'order_id':order_id, 'products':name_list})


In [17]:
current_test_order_products.head()

Unnamed: 0,user_id,order_id,products
0,12345678912345,98765432198765,Clementines Sweet_&_Salty_Nut_Granola_Bars_Peanut
2,12345678912346,98765432198766,Organic_Butternut_Squash Organic_Grape_Tomatoes
4,12345678912347,98765432198767,Vanilla_Unsweetened_Almond_Milk Total_2%_Greek...
6,12345678912348,98765432198768,Grapefruit_Sparkling_Water Sliced_Sourdough_Bread
8,12345678912349,98765432198769,Organic_Gluten_Free_Chia_Plus_with_Quinoa_&_Am...


###### Will need to get the xy coordinates of each new test user. calculate the euclidiean distance and use that to assign to each cluster

### Create a dictionary of current order information 

In [18]:
current_order_products_list = []
for row in current_test_order_products['products']:
    productsName = row.split(' ')
    current_order_products_list.append(productsName)
#     current_test_order_products['products']

In [19]:
current_test_order_products.head()

Unnamed: 0,user_id,order_id,products
0,12345678912345,98765432198765,Clementines Sweet_&_Salty_Nut_Granola_Bars_Peanut
2,12345678912346,98765432198766,Organic_Butternut_Squash Organic_Grape_Tomatoes
4,12345678912347,98765432198767,Vanilla_Unsweetened_Almond_Milk Total_2%_Greek...
6,12345678912348,98765432198768,Grapefruit_Sparkling_Water Sliced_Sourdough_Bread
8,12345678912349,98765432198769,Organic_Gluten_Free_Chia_Plus_with_Quinoa_&_Am...


In [20]:
current_test_order_products['product_list'] = current_order_products_list

In [24]:
userid_cluster = pd.DataFrame(current_test_order_products['user_id'])

In [30]:
cluster_list = ['Cluster_6','Cluster_8','Cluster_5','Cluster_8','Cluster_1','Cluster_6','Cluster_8','Cluster_11','Cluster_10',
                'Cluster_12','Cluster_6','Cluster_12','Cluster_5','Cluster_4','Cluster_6','Cluster_1','Cluster_6','Cluster_8',
                'Cluster_1','Cluster_5','Cluster_13','Cluster_6','Cluster_4','Cluster_6','Cluster_14']

In [31]:
current_test_order_products['Clusters'] = cluster_list

In [32]:
current_test_order_products

Unnamed: 0,user_id,order_id,products,product_list,Clusters
0,12345678912345,98765432198765,Clementines Sweet_&_Salty_Nut_Granola_Bars_Peanut,"[Clementines, Sweet_&_Salty_Nut_Granola_Bars_P...",Cluster_6
2,12345678912346,98765432198766,Organic_Butternut_Squash Organic_Grape_Tomatoes,"[Organic_Butternut_Squash, Organic_Grape_Tomat...",Cluster_8
4,12345678912347,98765432198767,Vanilla_Unsweetened_Almond_Milk Total_2%_Greek...,"[Vanilla_Unsweetened_Almond_Milk, Total_2%_Gre...",Cluster_5
6,12345678912348,98765432198768,Grapefruit_Sparkling_Water Sliced_Sourdough_Bread,"[Grapefruit_Sparkling_Water, Sliced_Sourdough_...",Cluster_8
8,12345678912349,98765432198769,Organic_Gluten_Free_Chia_Plus_with_Quinoa_&_Am...,[Organic_Gluten_Free_Chia_Plus_with_Quinoa_&_A...,Cluster_1
10,12345678912350,98765432198770,Organic_Lemon Organic_Gunpowder_Green_Tea,"[Organic_Lemon, Organic_Gunpowder_Green_Tea]",Cluster_6
12,12345678912351,98765432198771,Organic_Vanilla_Chip_Chewy_Granola,[Organic_Vanilla_Chip_Chewy_Granola],Cluster_8
13,12345678912352,98765432198772,Organic_Lime,[Organic_Lime],Cluster_11
14,12345678912353,98765432198773,Organic_Poppy_Seed,[Organic_Poppy_Seed],Cluster_10
15,12345678912354,98765432198774,Original_Fruit_Candy,[Original_Fruit_Candy],Cluster_12


In [35]:
current_order_dict= {} # products, product_list 
for index, content, in current_test_order_products.iterrows():
    user_id = content[0]
    order_id = content[1]
    product_list = content[3]
    cluster = content[4]
    if len(product_list) > 1:
        product_one = product_list[0]
        product_two = product_list[1]
        current_order_dict[user_id] = {'order_id': order_id, 'product_one': product_one, 'product_two': product_two, 'Cluster': cluster}
    else:
        product_one = product_list[0]
        current_order_dict[user_id] = {'order_id': order_id, 'product_one': product_one, 'Cluster': cluster}
    

In [36]:
with open('current_order_test_data.json', 'w') as file:
    json.dump(current_order_dict, file)