# EB5202 Web Analytics - Retail Rocket - Clustering

## Load libraries

In [1]:
import os
import pandas as pd
import pickle
import datetime

## Load data

In [2]:
item_prop_1_df = pd.read_csv(os.path.join('data', 'item_properties_part1.csv'))

In [3]:
item_prop_1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999999 entries, 0 to 10999998
Data columns (total 4 columns):
timestamp    int64
itemid       int64
property     object
value        object
dtypes: int64(2), object(2)
memory usage: 335.7+ MB


In [4]:
item_prop_1_df.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [5]:
item_prop_2_df = pd.read_csv(os.path.join('data', 'item_properties_part2.csv'))

In [6]:
item_prop_df = pd.concat([item_prop_1_df, item_prop_2_df])
item_prop_df.reset_index(drop=True, inplace=True)

In [7]:
del item_prop_1_df
del item_prop_2_df

In [8]:
item_prop_df['date'] = pd.to_datetime(item_prop_df['timestamp'], unit='ms', origin='unix')

In [9]:
item_prop_df.head()

Unnamed: 0,timestamp,itemid,property,value,date
0,1435460400000,460429,categoryid,1338,2015-06-28 03:00:00
1,1441508400000,206783,888,1116713 960601 n277.200,2015-09-06 03:00:00
2,1439089200000,395014,400,n552.000 639502 n720.000 424566,2015-08-09 03:00:00
3,1431226800000,59481,790,n15360.000,2015-05-10 03:00:00
4,1431831600000,156781,917,828513,2015-05-17 03:00:00


In [14]:
gby = item_prop_df.loc[item_prop_df['property'].str.contains('cat'), :].groupby('itemid')

In [15]:
sz = gby.size()
sz[sz.apply(lambda x: x >1)]

itemid
25        18
94        18
130       18
149       18
168       18
181       18
216       18
217       18
218       18
234       18
264       18
271       12
306       18
308       18
320       18
326       18
352       18
377       18
402       17
431       18
445       18
488       18
490       11
492       18
538       18
551       18
571       18
613       18
623       18
665       18
          ..
466229    17
466237    18
466240    18
466247     9
466270    18
466291    18
466326    12
466353    18
466364    17
466371     9
466376    18
466408    11
466411    17
466459    18
466502    17
466511    18
466514     9
466526    18
466589    18
466594    12
466612    18
466648    18
466695    18
466696    18
466721     9
466738    17
466767    18
466768    18
466783    18
466829    18
Length: 23352, dtype: int64

In [16]:
item_prop_df.loc[(item_prop_df['itemid'] == 25) & (item_prop_df['property'].str.contains('cat')), :].sort_values('timestamp')

Unnamed: 0,timestamp,itemid,property,value
7786530,1431226800000,25,categoryid,1509
8382882,1431831600000,25,categoryid,1509
8581666,1432436400000,25,categoryid,1509
8979234,1433041200000,25,categoryid,1509
7985314,1433646000000,25,categoryid,1509
8184098,1434250800000,25,categoryid,1509
8780450,1435460400000,25,categoryid,1509
9575586,1436065200000,25,categoryid,1509
9376802,1436670000000,25,categoryid,1509
9178018,1437274800000,25,categoryid,1509


In [17]:
value_gby = item_prop_df.loc[item_prop_df['property'].str.contains('cat'), :].groupby('value')

In [18]:
value_gby.itemid.nunique()

value
0        135
1        867
10         4
100       10
1000       3
1001      87
1002     549
1003     137
1005       1
1006     373
1007    9739
1008      58
1010     135
1011     144
1013       1
1014      42
1015       3
1016      62
1017      66
1018     672
1019      17
102        9
1020     218
1021       1
1022     157
1023      65
1026     526
1029     539
103       27
1030       5
        ... 
964      299
966        3
967      351
968        3
969      178
97        73
970      381
972      175
973      288
974        3
976     1264
977      497
978       48
979       68
98         1
980       11
981       21
982       15
983       15
984     1298
985      180
987      164
988     2720
99        19
991      229
992       41
996     1424
997       78
998      159
999      553
Name: itemid, Length: 1242, dtype: int64

In [19]:
item_prop_df.loc[(item_prop_df['property'].str.contains('cat') & (item_prop_df['value'] == '10')), :]

Unnamed: 0,timestamp,itemid,property,value
5195834,1435460400000,245380,categoryid,10
6306471,1431831600000,449019,categoryid,10
3930791,1431226800000,26377,categoryid,10
5248943,1431226800000,96493,categoryid,10


In [9]:
len(item_prop_df.itemid.unique())

417053

### Create a data frame of categoryid properties

This will be used to determine the categoryid of an itemid.

In [11]:
item_prop_df.sort_values(by='date', inplace=True)

In [12]:
starttime = datetime.datetime.now()
print('start time = ', starttime)
itemid_groups = item_prop_df.loc[item_prop_df['property'].str.contains('cat')].groupby('itemid')
# itemid_groups = item_prop_df.loc[((item_prop_df['itemid'] == 25) & item_prop_df['property'].str.contains('cat')), :].groupby('itemid')
itemtocat_dict = {}

print('start constructing dict ...', datetime.datetime.now())
# for each itemid
# for a_itemid in itemid_groups.keys():
for a_itemid, itemid_df in itemid_groups:
    
    t1 = datetime.datetime.now()
    cat_list = []
    # Get the categoryid for this itemid
    # df = item_prop_df.loc[itemid_groups[a_itemid], ['date', 'value']].sort_values('date')
    # print(itemid_df)

    for v, grp_df in itemid_df.groupby([(itemid_df.value != itemid_df.value.shift()).cumsum()]):
        cat_list.append((grp_df.iloc[-1][4], grp_df.iloc[-1][3]))

    # enddate = df.iloc[0, :]['date']
    # value = df.iloc[0, :]['value']
    # # For each row of categoryid, ignore the repeated values, get only the last one by date.
    # for idx, row in df.iterrows():
    #     if value != row['value']:
    #         cat_list.append((enddate, value))
    #         enddate = row['date']
    #         value = row['value']
    #     else:    
    #         endate = row['date']
    # # Add last categoryid value
    # print(enddate, value)
    # cat_list.append((enddate, value))

    #Add itemid to dict,
    itemtocat_dict[a_itemid] = cat_list
    # t2 = datetime.datetime.now()
    # print(t2-t1)
endtime = datetime.datetime.now()
print('end time = ', endtime, 'duration = ', endtime-starttime)


start time =  2018-10-31 18:08:29.942317


start constructing dict ... 2018-10-31 18:08:41.791030


end time =  2018-10-31 18:24:58.298345 duration =  0:16:28.356028


In [24]:
item_prop_df.loc[(item_prop_df['itemid'] == 29437) & (item_prop_df['property'].str.contains('cat')), :].sort_values('date')

Unnamed: 0,timestamp,itemid,property,value,date
1778037,1431226800000,29437,categoryid,254,2015-05-10 03:00:00
1579254,1431831600000,29437,categoryid,254,2015-05-17 03:00:00
2573169,1432436400000,29437,categoryid,254,2015-05-24 03:00:00
1380471,1433041200000,29437,categoryid,254,2015-05-31 03:00:00
1976820,1433646000000,29437,categoryid,254,2015-06-07 03:00:00
2374386,1434250800000,29437,categoryid,254,2015-06-14 03:00:00
4362219,1435460400000,29437,categoryid,1385,2015-06-28 03:00:00
3169518,1436065200000,29437,categoryid,1385,2015-07-05 03:00:00
2175603,1436670000000,29437,categoryid,1385,2015-07-12 03:00:00
4561003,1437274800000,29437,categoryid,1385,2015-07-19 03:00:00


In [22]:
[ i for i in itemtocat_dict.keys() if len(itemtocat_dict[i]) > 3 ][:10]

[3590, 4375, 5070, 29437, 36776, 42503, 48987, 62122, 63639, 65637]

In [23]:
itemtocat_dict[29437]

[(Timestamp('2015-06-14 03:00:00'), '254'),
 (Timestamp('2015-08-02 03:00:00'), '1385'),
 (Timestamp('2015-08-16 03:00:00'), '381'),
 (Timestamp('2015-09-13 03:00:00'), '1385')]

In [14]:
pickle.dump(itemtocat_dict, open(os.path.join('data', 'itemtocat.pkl'), 'wb'))

## Split data into train / test dataset

In [21]:
# train_idx = random.sample(range(len(sessions_df)), int(len(sessions_df) / 2))
# test_idx = [ i for i in range(len(sessions_df)) if i not in train_idx ]