In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three prepared CSV tables. Forward slashes are used as the path
# separator because they resolve on Windows as well as on POSIX systems,
# whereas a backslash is an ordinary filename character outside Windows.
events = pd.read_csv("prepared_dataset/events.csv")
properties = pd.read_csv("prepared_dataset/properties.csv")
parents = pd.read_csv("prepared_dataset/parents.csv")

# Parse the timestamp columns so later cells can subtract and compare them.
events['timestamp'] = pd.to_datetime(events['timestamp'])
properties['timestamp'] = pd.to_datetime(properties['timestamp'])

<center>Feature Extraction and Engineering</center>

In [3]:
def check_funnel(row, df: pd.DataFrame, time='7 days'):
    """Tell whether the view described by ``row`` is followed by a purchase.

    The events in ``df`` are walked in chronological order. The walk stops
    with ``False`` as soon as a "view" event more than one day after ``row``
    is reached, and with ``True`` as soon as a "transaction" event is found
    whose offset from ``row`` lies between zero and ``time`` (inclusive).

    Args:
        row: Mapping-like object exposing the reference view's ``'timestamp'``.
        df: Events to scan; must provide ``'event'`` and ``'timestamp'`` columns.
        time: Length of the attribution window, in any form accepted by
            ``pd.Timedelta``.

    Returns:
        bool: ``True`` if a qualifying transaction is found, else ``False``.
    """
    window = pd.Timedelta(time)
    view_gap = pd.Timedelta('1 days')
    no_offset = pd.Timedelta('0 days')
    start = row['timestamp']

    # The early "later view" exit is only meaningful in time order, so the
    # outcome must not depend on how the caller happened to order the rows.
    # The stable sort keeps the original order of simultaneous events.
    ordered = df
    if not df['timestamp'].is_monotonic_increasing:
        ordered = df.sort_values('timestamp', kind='stable')

    for event, moment in zip(ordered['event'], ordered['timestamp']):
        elapsed = moment - start
        if event == "view" and elapsed > view_gap:
            return False
        if elapsed < no_offset or elapsed > window:
            # Before the reference view, or past the attribution window.
            continue
        if event == "transaction":
            return True
    return False

# One sample per "view" event: (visitorid, itemid, timestamp) goes into `data`
# and the matching 0/1 purchase label from check_funnel goes into `targets`.
data = []
targets = []

group = events.groupby(['visitorid', 'itemid'])
for (visitorid, itemid), df in group:
    for idx, row in df.iterrows():
        if row['event'] != "view":
            continue
        data.append((visitorid, itemid, row['timestamp']))
        targets.append(1 if check_funnel(row, df) else 0)

In [4]:
# Order the (sample, label) pairs by the sample's timestamp, then split them
# back into two parallel lists.
pairs = sorted(zip(data, targets), key=lambda pair: pair[0][2])

d_sorted = [sample for sample, _ in pairs]
t_sorted = [label for _, label in pairs]

# Show the earliest sample labelled 1.
d_sorted[t_sorted.index(1)]

(345781, 438400, Timestamp('2015-05-03 03:09:28.107000'))

In [5]:
def sample_random(data, targets, n_samples=1000, random_state=None):
    """Draw a random subset that keeps the 0/1 class proportion of ``targets``.

    Args:
        data: Sequence of samples, parallel to ``targets``.
        targets: Sequence of labels; entries equal to 0 and 1 are sampled.
        n_samples: Requested subset size. It is capped at ``len(targets)``,
            and each class is capped at the number of rows it actually has,
            so the result may be smaller than requested.
        random_state: Optional seed. When given, a private
            ``np.random.RandomState`` is used; when ``None`` the global
            ``np.random`` generator is used, as before.

    Returns:
        tuple[list, list]: The sampled entries of ``data`` and the matching
        entries of ``targets``, in shuffled order.
    """
    target_np = np.array(targets)
    total = len(targets)

    print(total)

    if total == 0:
        # Nothing to sample from; also avoids the 0/0 proportion below.
        return [], []

    count_0 = np.sum(target_np == 0)
    proportion_0 = count_0 / total

    idx_0 = np.where(target_np == 0)[0]
    idx_1 = np.where(target_np == 1)[0]

    # Sampling without replacement cannot return more rows than exist.
    n_samples = min(n_samples, total)

    n_0_samples = int(n_samples * proportion_0)
    # Float rounding in the line above can leave the positive class one row
    # short of what is requested here, so clamp both classes to their sizes.
    n_1_samples = min(n_samples - n_0_samples, len(idx_1))
    n_0_samples = min(n_samples - n_1_samples, len(idx_0))

    print(n_1_samples)
    print(idx_0, idx_1, proportion_0)

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    sampled_0_idx = rng.choice(idx_0, n_0_samples, replace=False)
    sampled_1_idx = rng.choice(idx_1, n_1_samples, replace=False)

    sampled_idx = np.concatenate([sampled_0_idx, sampled_1_idx])
    rng.shuffle(sampled_idx)

    sample_f = [data[idx] for idx in sampled_idx]
    sample_t = [targets[idx] for idx in sampled_idx]

    return sample_f, sample_t

# Seed NumPy's global generator first so that the same subset is drawn every
# time the notebook is re-run from a fresh kernel.
np.random.seed(42)
d_subset, t_subset = sample_random(d_sorted, t_sorted, 200000)

2664312
2585
[      0       1       2 ... 2664309 2664310 2664311] [    143     367     368 ... 2663858 2664094 2664146] 0.9870762132963407


In [6]:
def add_properties(data, properties_df=None):
    """Attach to every sample the item properties known before its timestamp.

    For each ``(visitorid, itemid, timestamp, ...)`` tuple, the property rows
    of that item with a timestamp strictly earlier than the sample's are
    replayed in chronological order, so the latest value of each property
    wins. The resulting ``{property: value}`` dict is appended to the tuple.

    Args:
        data: List of sample tuples; index 1 is the item id and index 2 the
            sample timestamp.
        properties_df: Frame with ``'itemid'``, ``'timestamp'``, ``'property'``
            and ``'value'`` columns. Defaults to the notebook-level
            ``properties`` frame.

    Returns:
        list: New list of tuples, each extended by one dict. The input list
        and its tuples are left untouched.
    """
    source = properties if properties_df is None else properties_df

    # Index the property history once per item instead of scanning the whole
    # table for every sample.
    wanted_items = list({feature[1] for feature in data})
    relevant = source[source['itemid'].isin(wanted_items)]
    history_by_item = {
        item: history.sort_values('timestamp', kind='stable')
        for item, history in relevant.groupby('itemid', sort=False)
    }

    new_features = []
    for feature in data:
        history = history_by_item.get(feature[1])
        props = {}
        if history is not None:
            past = history[history['timestamp'] < feature[2]]
            # Rows are in time order, so later values overwrite earlier ones
            # while each key keeps the position of its first appearance.
            props = dict(zip(past['property'], past['value']))
        new_features.append(feature + (props, ))
    return new_features

# Extend every sampled tuple with its property dict. The result is bound back
# to `d_subset`, so running this cell a second time appends another dict.
d_subset = add_properties(d_subset)

In [7]:
# Display the first sampled tuple whose label is 1
# (list.index raises ValueError when there is none).
d_subset[t_subset.index(1)]

(1187986,
 120098,
 Timestamp('2015-08-22 18:41:14.018000'),
 {'790': 'n14160.000',
  '764': '1285872',
  '917': '1170932',
  '6': '985131',
  '888': '400845 1097825 n1440.000 628176 n720.000 424566',
  '112': '679677',
  '776': '972043',
  '283': '985131 343546 400845 343546 274150 268271 985131 400845 909292 588497 402932 709607 827951 424566 639502 117069 985131 703408 77688 914749 821952 487363 745504 1056884 436067 1033895 436067 1033895 312815 611917 157929',
  'available': '1',
  '364': '275799',
  '441': 'n1440.000 628176 n720.000 424566',
  '243': '985131',
  '159': '519769',
  '689': '337402',
  '713': '436067 1033895',
  '434': '769062',
  '46': '769062',
  'categoryid': '427',
  '575': '1170932',
  '202': '400845',
  '960': '769062',
  '19': '1297729 n0.000 309206',
  '678': '343546',
  '698': '985131',
  '28': '150169 610517',
  '54': '709607',
  '227': '985131',
  '38': '769062',
  '810': 'n1440.000 628176 n720.000 424566',
  '839': '343546'})

In [8]:
def extract_features(data, target):
    """Turn enriched sample tuples into a shuffled feature DataFrame.

    Args:
        data: List of ``(visitorid, itemid, timestamp, props)`` tuples, where
            ``props`` is a ``{property: value}`` dict.
        target: Labels parallel to ``data``.

    Returns:
        pd.DataFrame: Columns ``visitorid``, ``itemid``, ``dayofweek``,
        ``hour``, then one column per property in order of first appearance
        (NaN where a sample lacks it), then ``target``. Rows are shuffled
        with a fixed seed and re-indexed from 0.
    """
    n_rows = len(data)
    base_columns = {
        'visitorid': [],
        'itemid': [],
        'dayofweek': [],
        'hour': [],
    }
    property_columns = {}

    for row_number, sample in enumerate(data):
        visitor_id, item_id, viewed_at, props = sample[:4]
        base_columns['visitorid'].append(visitor_id)
        base_columns['itemid'].append(item_id)
        base_columns['dayofweek'].append(viewed_at.dayofweek)
        base_columns['hour'].append(viewed_at.hour)

        for name, value in props.items():
            column = property_columns.get(name)
            if column is None:
                # First sighting of this property: start from an all-NaN
                # column so earlier and later rows default to missing.
                column = property_columns[name] = [np.nan] * n_rows
            column[row_number] = value

    table = {**base_columns, **property_columns}
    table['target'] = target
    data_df = pd.DataFrame(table)

    data_df = data_df.sample(frac=1, random_state=42).reset_index(drop=True)
    return data_df

# Build the feature DataFrame from the sampled tuples and their labels.
df = extract_features(d_subset, t_subset)

In [9]:
# Write the feature table to features_2.csv without the DataFrame index.
df.to_csv("features_2.csv", index=False)