In [1]:
%load_ext autoreload
%autoreload 2

import time
import sys
sys.path.append('../')
import cf_ec2.dataPrep as dataPrep

## Step 1: Load the purchase data and build the train/test splits

In [2]:
import pandas as pd

# Raw purchase export, addressed relative to the notebook's working directory.
purchase_csv = '../../rec_model_cf_20190702/metadata/trainData/purchase_20180731_20190725.csv'
data = pd.read_csv(purchase_csv, sep=',', header=0)
data.head(2)

Unnamed: 0,contact_uid,event_dt,make,model,jd_cat,jd_sub_cat,seller_cat,auction_cd,channel,age,mile,cr_grade,distance_miles,mmr
0,100118431,2019-04-09,FORD,F150 FFV,PICKUP,LITE DUTY FULLS,Retail Dealership,FAAO,Simulcast,5,70737.0,4.3,680.0,15750.0
1,101224608,2018-09-25,NISSAN,ALTIMA,MIDSIZE CAR,PREMIUM,Captive Finance,FAAO,Simulcast,3,36019.0,4.3,2127.0,12000.0


In [3]:
# Item key of the form "<age>-<make>-<model>". The template's `year` slot is
# filled from the `age` column, which is why keys look like "10-BUICK-LACROSSE".
ymm_template = '{year}-{make}-{model}'
data['ymm'] = [
    ymm_template.format(year=age, make=make, model=model)
    for age, make, model in zip(data['age'], data['make'], data['model'])
]

# event_dt was read without date parsing, so these are string comparisons; they
# order correctly because the values are ISO-formatted (YYYY-MM-DD).
# NOTE(review): the test window (2019-03-26 .. 2019-04-25) lies *before* the
# train window (>= 2019-04-26) -- confirm this ordering is intended.
pair_keys = ['contact_uid', 'ymm']
in_train_window = data['event_dt'] >= '2019-04-26'
in_test_window = (data['event_dt'] >= '2019-03-26') & (data['event_dt'] <= '2019-04-25')

# One row per (user, item) pair with the number of purchases in the window.
train = data[in_train_window].groupby(pair_keys).size().reset_index(name='freq')
test = data[in_test_window].groupby(pair_keys).size().reset_index(name='freq')
train.head(5)

Unnamed: 0,contact_uid,ymm,freq
0,100000012,10-BUICK-LACROSSE,1
1,100000012,10-CHEVROLET-IMPALA FFV,1
2,100000012,10-CHEVROLET-MALIBU,1
3,100000012,10-CHEVROLET-MALIBU FFV,1
4,100000012,10-FORD-ESCAPE,1


## Step 2: Test the `dataPrep` module

In [4]:
# Wrap both splits in the project's Data helper, telling it which columns hold
# the user id, the item key and the rating (here: the purchase frequency).
data_columns = dict(col_user='contact_uid', col_item='ymm', col_rating='freq')
dataset = dataPrep.Data(train=train, test=test, **data_columns)

In [5]:
# Preview the train split as stored on the Data object.
dataset.train.head(3)

Unnamed: 0,contact_uid,ymm,freq
0,0,0,1
1,0,1,1
2,0,2,1


In [6]:
# Look up index 0 in the id2user / id2item mappings exposed by the Data object.
dataset.id2user[0], dataset.id2item[0]

(100000012, '10-BUICK-LACROSSE')

In [7]:
# Re-inspect the caller-side `train` frame after `dataset` has been constructed.
# NOTE(review): the output now carries `contact_uid_idx` / `ymm_idx` columns that
# were absent in the earlier preview, so dataPrep.Data appears to modify the
# frame passed to it -- pass a copy if the original must stay untouched.
train.head(3)

Unnamed: 0,contact_uid,ymm,freq,contact_uid_idx,ymm_idx
0,100000012,10-BUICK-LACROSSE,1,0,0
1,100000012,10-CHEVROLET-IMPALA FFV,1,0,1
2,100000012,10-CHEVROLET-MALIBU,1,0,2


In [8]:
# Wall-clock timing of the training-side preparation step.
t1 = time.time()
dataset.prepTrainDNN()
elapsed = time.time() - t1
print('Finished in {} seconds'.format(elapsed))

Finished in 42.79367017745972 seconds


In [9]:
# Preview the per-user interaction table available on the Data object after
# prepTrainDNN() has run.
dataset.interaction_train.head(2)

Unnamed: 0,contact_uid,ymm_interacted,ymm_negative
0,0,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7..."
1,1,"{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [10]:
# First five entries of the users / items / ratings arrays held by the Data object.
dataset.users[:5],dataset.items[:5],dataset.ratings[:5]

(array([0, 0, 0, 0, 0]), array([0, 1, 2, 3, 4]), array([1, 1, 1, 1, 1]))

In [11]:
# Disabled: timed call of the full test-side preparation. The cells that follow
# build `interaction_test` by hand instead; prepTestDNN() is invoked further down.
# t1 = time.time()
# interaction_test,testPlusNegSample = dataset.prepTestDNN()
# print('Finished in {} seconds'.format(time.time()-t1))

In [12]:
t1 = time.time()

# Column names reused by later cells.
col_user = 'contact_uid'
col_item = 'ymm'

# Collapse the test split to one row per user holding the set of items that
# user has in the test split.
test_item_sets = (
    dataset.test
    .groupby(col_user)[col_item]
    .apply(set)
    .reset_index()
    .rename(columns={col_item: col_item+'_interacted_test'})
)

# Attach the train-side columns; the inner join keeps only users that are
# present in both tables.
interaction_test = test_item_sets.merge(
    dataset.interaction_train,
    on=col_user,
    how='inner'
)

elapsed = time.time() - t1
print('Finished in {} seconds'.format(elapsed))

Finished in 61.210012912750244 seconds


In [13]:
# Preview the merged table: test-side item sets next to the train-side columns.
interaction_test.head(4)

Unnamed: 0,contact_uid,ymm_interacted_test,ymm_interacted,ymm_negative
0,0,"{640, 3457, 1669, 649, 10, 1802, 651, 660, 309...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7..."
1,1,"{641, 2435, 100, 369, 633}","{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"{96, 97, 98, 72, 4715, 4203, 494, 944, 1011, 5...","{71, 72, 73, 83, 84, 85, 86, 87, 88, 89, 90, 9...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3,3,"{1784, 118}","{110, 111, 112, 113, 114, 115}","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [14]:
# (rows, columns) of the table produced by the inner join.
interaction_test.shape

(38111, 4)

In [15]:
# Sanity check: select rows whose `ymm_interacted` is missing and report the
# shape of that selection (the recorded output shows zero such rows).
interaction_test.loc[interaction_test.ymm_interacted.isna()].shape

(0, 4)

In [16]:
# Distinct user ids on each side, as sets so they can be compared directly.
usersInTrain = set(dataset.train['contact_uid'].unique())
usersInTest = set(dataset.test['contact_uid'].unique())

In [17]:
# Number of users that appear in the test split but not in the train split.
len(usersInTest-usersInTrain)

7709

In [18]:
# Disabled baseline: the same negative-set clean-up done with a row-wise
# DataFrame.apply. Kept for the timing comparison recorded on its last line
# (~222 s) against the itertuples loop in the next cell.
# t1 = time.time()

# ## generate the negative sample set (based on negative set in training data)
# interaction_test[col_item+'_negative']=interaction_test.apply(
#     lambda row: row[col_item+'_negative']-row[col_item+'_interacted_test'],
#     axis=1
# )

# print('Finished in {} seconds'.format(time.time()-t1))
# #### Finished in 221.78759717941284 seconds

In [19]:
t1 = time.time()

# Remove every item a user has in the test split from that user's
# negative-candidate set, writing the result back into the `<item>_negative` column.
#
# itertuples() yields (Index, <col 0>, <col 1>, ...), so a column's position in
# the tuple is its position in the frame plus one. Resolve those positions from
# the column names instead of hard-coding 4 and 2, which would silently pick the
# wrong columns if the frame's column order ever changed.
neg_col = col_item+'_negative'
seen_col = col_item+'_interacted_test'
neg_pos = interaction_test.columns.get_loc(neg_col) + 1
seen_pos = interaction_test.columns.get_loc(seen_col) + 1

for row in interaction_test.itertuples():
    interaction_test.at[row.Index, neg_col] = row[neg_pos] - row[seen_pos]

print('Finished in {} seconds'.format(time.time()-t1))
#### Finished in 51.43584108352661 seconds

Finished in 48.392561197280884 seconds


In [20]:
# Preview the table after the negative-set clean-up loop.
interaction_test.head(3)

Unnamed: 0,contact_uid,ymm_interacted_test,ymm_interacted,ymm_negative
0,0,"{640, 3457, 1669, 649, 10, 1802, 651, 660, 309...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7..."
1,1,"{641, 2435, 100, 369, 633}","{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"{96, 97, 98, 72, 4715, 4203, 494, 944, 1011, 5...","{71, 72, 73, 83, 84, 85, 86, 87, 88, 89, 90, 9...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


#### Full test of `dataset.prepTestDNN()` after the optimization

In [21]:
# Run the module's full test-side preparation. The recorded output shows the
# method printing its own per-stage timings, so the manual timer is disabled.
# t1 = time.time()
interaction_test, testPlusNegSample = dataset.prepTestDNN()
# print('Finished in {} seconds'.format(time.time()-t1))

Finished initial join with train in 4.671973705291748 seconds
Finished negative sample clean in 183.66682291030884 seconds
Finished negative sample assignment in test data in 183.87535786628723 seconds
Finished negative sampling in test data in 924.1819598674774 seconds


In [22]:
# Preview the per-user table returned by prepTestDNN().
interaction_test.head(3)

Unnamed: 0,contact_uid,ymm_interacted_test,ymm_interacted,ymm_negative
0,0,"{640, 3457, 1669, 649, 10, 1802, 651, 660, 309...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7..."
1,1,"{641, 2435, 100, 369, 633}","{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"{96, 97, 98, 72, 4715, 4203, 494, 944, 1011, 5...","{71, 72, 73, 83, 84, 85, 86, 87, 88, 89, 90, 9...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [23]:
# Preview the second frame returned by prepTestDNN(): test rows with a
# `ymm_negative` list attached to each.
testPlusNegSample.head(3)

Unnamed: 0,contact_uid,ymm,freq,ymm_negative
0,0,4345,1,"[4233, 1762, 2214, 8636, 966, 5450, 879, 299, ..."
1,0,10,1,"[5658, 657, 3503, 5920, 3603, 4783, 5554, 1613..."
2,0,2234,1,"[5330, 5837, 9383, 9496, 337, 5219, 4207, 3090..."


In [24]:
# Size of the smallest negative-candidate set across all rows.
negative_set_sizes = interaction_test[col_item+'_negative'].map(len)
negative_set_sizes.min()

8793

In [25]:
# Preview the test split as stored on the Data object.
dataset.test.head(3)

Unnamed: 0,contact_uid,ymm,freq
0,0,4345,1
1,0,10,1
2,0,2234,1


#### Re-run of `dataset.prepTestDNN()` after optimization (compare the stage timings with the previous run)

In [26]:
# Unbind the previous run's results so their memory can be reclaimed before re-running.
del interaction_test, testPlusNegSample 

In [27]:
# Force a full garbage-collection pass; gc.collect() returns the number of
# unreachable objects it found, which is the value displayed below.
import gc
gc.collect()

5154

In [28]:
# Second full run of the test-side preparation; the stage timings it prints are
# the ones to compare against the earlier run.
interaction_test, testPlusNegSample = dataset.prepTestDNN()

Finished initial join with train in 4.830550193786621 seconds
Finished negative sample clean in 67.01980018615723 seconds
Finished negative sample assignment in test data in 67.1736741065979 seconds
Finished negative sampling in test data in 402.6069803237915 seconds


In [29]:
# Preview the per-user table returned by the second prepTestDNN() run.
interaction_test.head(3)

Unnamed: 0,contact_uid,ymm_interacted_test,ymm_interacted,ymm_negative
0,0,"{640, 3457, 1669, 649, 10, 1802, 651, 660, 309...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7..."
1,1,"{641, 2435, 100, 369, 633}","{68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 7...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"{96, 97, 98, 72, 4715, 4203, 494, 944, 1011, 5...","{71, 72, 73, 83, 84, 85, 86, 87, 88, 89, 90, 9...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [30]:
# Preview the test rows with their negative lists from the second run.
# NOTE(review): the `ymm_negative` lists shown here differ from those in the
# previous run's preview, so the sampling is presumably unseeded (or changed
# with the optimization) -- fix a random seed if reproducible output is needed.
testPlusNegSample.head(3)

Unnamed: 0,contact_uid,ymm,freq,ymm_negative
0,0,4345,1,"[3341, 1450, 9230, 6890, 7371, 5779, 9674, 342..."
1,0,10,1,"[8377, 1812, 6120, 6883, 7674, 4669, 3271, 932..."
2,0,2234,1,"[2977, 1805, 1657, 113, 9116, 3433, 8923, 6043..."
