In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import nltk
import os

import matplotlib.pyplot as plt 
from functools import partial # to reduce df memory consumption by applying to_numeric

import warnings
warnings.filterwarnings('ignore') 

In [2]:
cwd = os.getcwd()
cwd

'C:\\Users\\yangyq\\Google Drive\\Grad school project\\ucberkeley_mids\\academics\\W207_ML\\finalprojdata'

In [3]:
aisles = pd.read_csv('C:\\Temp\\w207finalprojdata\\aisles.csv', engine='c')
print('Total aisles: {}'.format(aisles.shape[0]))
aisles.head()

Total aisles: 134


Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [4]:
#departments
departments = pd.read_csv('C:\\Temp\\w207finalprojdata\\departments.csv', engine='c')

print('Total departments: {}'.format(departments.shape[0]))
departments.head()

Total departments: 21


Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [5]:
# products
products = pd.read_csv('C:\\Temp\\w207finalprojdata\\products.csv', engine='c')
print('Total products: {}'.format(products.shape[0]))
products.head(5)

Total products: 49688


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
# combine aisles, departments and products (left joined to products)
goods = pd.merge(left=pd.merge(left=products, right=departments, how='left'), right=aisles, how='left')
# to retain '-' and make product names more "standard"
goods.product_name = goods.product_name.str.lower() 
print(goods.info())

goods.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49688 entries, 0 to 49687
Data columns (total 6 columns):
product_id       49688 non-null int64
product_name     49688 non-null object
aisle_id         49688 non-null int64
department_id    49688 non-null int64
department       49688 non-null object
aisle            49688 non-null object
dtypes: int64(3), object(3)
memory usage: 2.7+ MB
None


Unnamed: 0,product_id,product_name,aisle_id,department_id,department,aisle
0,1,chocolate sandwich cookies,61,19,snacks,cookies cakes
1,2,all-seasons salt,104,13,pantry,spices seasonings
2,3,robust golden unsweetened oolong tea,94,7,beverages,tea
3,4,smart ones classic favorites mini rigatoni wit...,38,1,frozen,frozen meals
4,5,green chile anytime sauce,5,13,pantry,marinades meat preparation


###### Train Dataset

In [7]:
# train dataset
op_train = pd.read_csv('C:\\Temp\\w207finalprojdata\\order_products__train.csv', engine='c', 
                       dtype={'order_id': np.int32, 'product_id': np.int32, 
                              'add_to_cart_order': np.int16, 'reordered': np.int8})    
print('Total ordered products(train): {}'.format(op_train.shape[0]))
op_train.head()

Total ordered products(train): 1384617


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


###### Prior Dataset

In [8]:
op_prior = pd.read_csv('C:\\Temp\\w207finalprojdata\\order_products__prior.csv', engine='c', 
                       dtype={'order_id': np.int32, 
                              'product_id': np.int32, 
                              'add_to_cart_order': np.int16, 
                             'reordered': np.int8})

print('Total ordered products(prior): {}'.format(op_prior.shape[0]))


Total ordered products(prior): 32434489


In [9]:
op_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


Some 

In [10]:
# Curious how many unique product have been purchased
print(op_prior.product_id.unique().shape[0])

49677


###### Test Dataset

In [11]:
# test dataset (submission)
test = pd.read_csv('C:\\Temp\\w207finalprojdata\\sample_submission.csv', engine='c')

print('Total orders(test): {}'.format(test.shape[0]))
test.head()

Total orders(test): 75000


Unnamed: 0,order_id,products
0,17,39276 29259
1,34,39276 29259
2,137,39276 29259
3,182,39276 29259
4,257,39276 29259


In [12]:
# orders
orders = pd.read_csv('C:\\Temp\\w207finalprojdata\\orders.csv', engine='c', dtype={'order_id': np.int32, 
                                                           'user_id': np.int32, 
                                                           'order_number': np.int32, 
                                                           'order_dow': np.int8, 
                                                           'order_hour_of_day': np.int8, 
                                                           'days_since_prior_order': np.float16})


print('Total orders: {}'.format(orders.shape[0]))
print(orders.info())
orders.head()

Total orders: 3421083
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
order_id                  int32
user_id                   int32
eval_set                  object
order_number              int32
order_dow                 int8
order_hour_of_day         int8
days_since_prior_order    float16
dtypes: float16(1), int32(3), int8(2), object(1)
memory usage: 78.3+ MB
None


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [13]:
# merge train and prior together iteratively, to fit into 8GB kernel RAM
# split df indexes into parts
indexes = np.linspace(0, len(op_prior), num=10, dtype=np.int32)

# initialize it with train dataset
order_details = pd.merge(
                left=op_train,
                 right=orders, 
                 how='left', 
                 on='order_id'
        ).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))

# add order hierarchy
order_details = pd.merge(
                left=order_details,
                right=goods[['product_id',
                             'product_name',
                             'aisle_id', 
                             'department_id']].apply(partial(pd.to_numeric, 
                                                             errors='ignore', 
                                                             downcast='integer')),
                how='left',
                on='product_id'
)

print(order_details.shape, op_train.shape)


# delete (redundant now) dataframes
#del op_train

order_details.head()

(1384617, 13) (1384617, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
0,1,49302,1,1,112108,train,4,4,10,9,bulgarian yogurt,120,16
1,1,11109,2,1,112108,train,4,4,10,9,organic 4% milk fat whole milk cottage cheese,108,16
2,1,10246,3,0,112108,train,4,4,10,9,organic celery hearts,83,4
3,1,49683,4,0,112108,train,4,4,10,9,cucumber kirby,83,4
4,1,43633,5,1,112108,train,4,4,10,9,lightly smoked sardines in olive oil,95,15


In [14]:
# Josh's order_details that preserves the product name as a python object that points to a string
indexes = np.linspace(0, len(op_prior), num=10, dtype=np.int32)

# initialize it with train dataset
order_details = pd.merge(
                left=op_train,
                 right=orders, 
                 how='left', 
                 on='order_id'
        ).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))

order_details_jw = pd.merge(
                left=order_details,
                right=goods[['product_id',
                             'product_name']],
                how='left',
                on='product_id'
)

order_details_jw.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name
0,1,49302,1,1,112108,train,4,4,10,9,bulgarian yogurt
1,1,11109,2,1,112108,train,4,4,10,9,organic 4% milk fat whole milk cottage cheese
2,1,10246,3,0,112108,train,4,4,10,9,organic celery hearts
3,1,49683,4,0,112108,train,4,4,10,9,cucumber kirby
4,1,43633,5,1,112108,train,4,4,10,9,lightly smoked sardines in olive oil


In [15]:
# Formatting for count vectorizer
order_details_jw["product_name"].str.split()

0                                        [bulgarian, yogurt]
1          [organic, 4%, milk, fat, whole, milk, cottage,...
2                                  [organic, celery, hearts]
3                                          [cucumber, kirby]
4                [lightly, smoked, sardines, in, olive, oil]
5                                [bag, of, organic, bananas]
6                                   [organic, hass, avocado]
7                           [organic, whole, string, cheese]
8                         [grated, pecorino, romano, cheese]
9                                            [spring, water]
10                                  [organic, half, &, half]
11                                    [super, greens, salad]
12               [cage, free, extra, large, grade, aa, eggs]
13                                  [prosciutto,, americano]
14                   [organic, garnet, sweet, potato, (yam)]
15                                               [asparagus]
16                      

In [16]:
# Importing countvectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Initializing count vectorizer with basically no methods
# We are running into method problems if we try and expand 'max_features'
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 100)

# Fitting the vectorizer to the data => creating the feature space around product name
train_data_features = vectorizer.fit_transform(order_details_jw['product_name'])

# Converting the features into an array
train_data_features = train_data_features.toarray()

In [18]:
# Sparse matrix of vectorized features
train_data_features

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
# Inspecting New Feature Space
train_data_features.shape
vocab = vectorizer.get_feature_names()

In [20]:
# Which 100 words were chosen...yay Avocado!
vocab

['100',
 'almond',
 'and',
 'apple',
 'avocado',
 'baby',
 'bag',
 'banana',
 'bananas',
 'bar',
 'beans',
 'black',
 'blueberry',
 'bread',
 'breast',
 'broccoli',
 'brown',
 'bunch',
 'butter',
 'carrots',
 'cereal',
 'cheddar',
 'cheese',
 'chicken',
 'chips',
 'chocolate',
 'classic',
 'coconut',
 'coffee',
 'corn',
 'crackers',
 'cream',
 'dark',
 'eggs',
 'extra',
 'fat',
 'free',
 'fresh',
 'frozen',
 'fruit',
 'garlic',
 'gluten',
 'grade',
 'grain',
 'greek',
 'green',
 'half',
 'hass',
 'honey',
 'ice',
 'juice',
 'kale',
 'large',
 'lemon',
 'low',
 'milk',
 'mix',
 'natural',
 'of',
 'oil',
 'onion',
 'orange',
 'organic',
 'original',
 'pasta',
 'peanut',
 'pepper',
 'plain',
 'potato',
 'pure',
 'raspberries',
 'red',
 'reduced',
 'rice',
 'roasted',
 'salt',
 'sauce',
 'sea',
 'seedless',
 'shredded',
 'sliced',
 'sparkling',
 'spinach',
 'strawberries',
 'strawberry',
 'style',
 'sweet',
 'tomato',
 'tomatoes',
 'total',
 'turkey',
 'unsweetened',
 'vanilla',
 'water',


In [21]:
%%time
# update by small portions
for i in range(len(indexes)-1):
    order_details = pd.concat(
        [   
            order_details,
            pd.merge(left=pd.merge(
                            left=op_prior.loc[indexes[i]:indexes[i+1], :],
                            right=goods[['product_id', 
                                         'aisle_id', 
                                         'department_id' ]].apply(partial(pd.to_numeric, 
                                                                          errors='ignore', 
                                                                          downcast='integer')),
                            how='left',
                            on='product_id'
                            ),
                     right=orders, 
                     how='left', 
                     on='order_id'
                ) #.apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))
        ]
    )
        
print('Datafame length: {}'.format(order_details.shape[0]))
print('Memory consumption: {:.2f} Mb'.format(sum(order_details.memory_usage(index=True, 
                                                                         deep=True) / 2**20)))
# check dtypes to see if we use memory effectively
print(order_details.dtypes)

# make sure we didn't forget to retain test dataset :D
test_orders = orders[orders.eval_set == 'test']

# delete (redundant now) dataframes
del op_prior, orders

Datafame length: 33819114
Memory consumption: 3515.51 Mb
add_to_cart_order           int16
aisle_id                  float64
days_since_prior_order    float16
department_id             float64
eval_set                   object
order_dow                    int8
order_hour_of_day            int8
order_id                    int32
order_number                int32
product_id                  int32
reordered                    int8
user_id                     int32
dtype: object
Wall time: 46.7 s


In [22]:
test_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0
96,1376945,11,test,8,6,11,8.0
102,1356845,12,test,6,1,20,30.0


###### Customer will take all reordered

In [23]:
%%time
# baseline submission
test_history = order_details[(order_details.user_id.isin(test_orders.user_id)) 
                             & (order_details.reordered == 1)]\
.groupby('user_id')['product_id'].apply(lambda x: ' '.join([str(e) for e in set(x)])).reset_index()
test_history.columns = ['user_id', 'products']

test_history = pd.merge(left=test_history, 
                        right=test_orders, 
                        how='right', 
                        on='user_id')[['order_id', 'products']]

test_history.to_csv('baseline_takeAllReordered.csv', encoding='utf-8', index=False)


Wall time: 22.2 s


### Turning Product Name into a useful feature

In [24]:
'''
import re
print(list(order_details.columns.values))
order_details['product_name'] = order_details['product_name'].astype(str)
order_details.info()
reform_prod_name = re.sub("[_]",
                          " ",
                          order_details.loc[:,'product_name'] )
'''

'\nimport re\nprint(list(order_details.columns.values))\norder_details[\'product_name\'] = order_details[\'product_name\'].astype(str)\norder_details.info()\nreform_prod_name = re.sub("[_]",\n                          " ",\n                          order_details.loc[:,\'product_name\'] )\n'

## Yang's stuff

reusing Josh's combo orders table. Focusing on product ids as the way forward. Global product ids vs Personalized product ids

In [25]:
order_details_jw.head(100)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name
0,1,49302,1,1,112108,train,4,4,10,9,bulgarian yogurt
1,1,11109,2,1,112108,train,4,4,10,9,organic 4% milk fat whole milk cottage cheese
2,1,10246,3,0,112108,train,4,4,10,9,organic celery hearts
3,1,49683,4,0,112108,train,4,4,10,9,cucumber kirby
4,1,43633,5,1,112108,train,4,4,10,9,lightly smoked sardines in olive oil
5,1,13176,6,0,112108,train,4,4,10,9,bag of organic bananas
6,1,47209,7,0,112108,train,4,4,10,9,organic hass avocado
7,1,22035,8,1,112108,train,4,4,10,9,organic whole string cheese
8,36,39612,1,0,79431,train,23,6,18,30,grated pecorino romano cheese
9,36,19660,2,1,79431,train,23,6,18,30,spring water


In [50]:
# need to get the order id and product id into a useful format
# https://stackoverflow.com/questions/26716616/convert-a-pandas-dataframe-to-a-dictionary
# https://stackoverflow.com/questions/20024584/vectorizing-a-pandas-dataframe-for-scikit-learn
# http://scikit-learn.org/stable/modules/feature_extraction.html#loading-features-from-dicts

temp = order_details_jw[["order_id", "product_id"]]
temp = temp[:10]
print(temp)
print(type(temp))
temp = temp.groupby(["order_id"])[0]
print(temp)
print(type(temp))

   order_id  product_id
0         1       49302
1         1       11109
2         1       10246
3         1       49683
4         1       43633
5         1       13176
6         1       47209
7         1       22035
8        36       39612
9        36       19660
<class 'pandas.core.frame.DataFrame'>


KeyError: 'Column not found: 0'

In [None]:







# We are running into method problems if we try and expand 'max_features'
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 100,
                             lowercase = False)

# Fitting the vectorizer to the data => creating the feature space around product name
train_data_features = vectorizer.fit_transform(order_details_jw['product_id'])

# Converting the features into an array
train_data_features = train_data_features.toarray()

print(train_data_features.head())