# Load from Compute Target Notebook

In [1]:
# Restore the kernel state saved by the compute-target notebook.
# NOTE(review): later cells use names that are never defined in this notebook
# (pd, datetime, data_path, training, events_training, session_training, events,
# session, StratifiedKFold) -- presumably they all come from this session file;
# confirm, because a fresh kernel cannot run this notebook without it.
# NOTE: dill sessions are pickle-based, so loading one can execute arbitrary code;
# only load session files you produced yourself.
import dill
dill.load_session("Notebook_Saves/compute_target.db")

In [2]:
# Read submission.csv from `data_path` (string concatenation, so `data_path` must
# already end with a path separator).
# NOTE(review): presumably this is the list of users to score -- confirm.
submission = pd.read_csv(data_path + 'submission.csv')

# Functions

In [3]:
from datetime import timezone

# Reference cutoff (2018-12-14 23:59:59 UTC) in epoch milliseconds; the gap and
# life-time features are measured back from this instant.
# The datetime is pinned to UTC because a naive datetime's .timestamp() is
# interpreted in the machine's local timezone, which would shift the cutoff (and
# every derived feature) depending on where the notebook runs. The UTC value
# matches the recorded output 1544831999000.0.
dec_14_2018 = datetime(2018, 12, 14, 23, 59, 59, tzinfo=timezone.utc).timestamp() * 1000
dec_14_2018

1544831999000.0

In [4]:
def _join_user_aggregate(df, source, column, how, feature=None):
    """Aggregate ``source[column]`` per user with ``how`` and left-join it onto ``df``."""
    per_user = source.groupby('user_id_hash')[column].agg(how)
    return df.join(per_user.rename(feature or column), on='user_id_hash')


def feature_generate(df, events_df, session_df, reference_ts=None):
    """Attach per-user behavioural features to ``df``.

    Parameters
    ----------
    df : DataFrame
        One row per user; must contain ``user_id_hash``. It is not modified.
    events_df : DataFrame
        Event log with ``user_id_hash``, ``event``, ``event_value`` and
        ``event_timestamp``. Rows whose ``event`` equals the string ``'8'``
        are treated as purchases.
    session_df : DataFrame
        Session log with ``user_id_hash``, ``session_id``, ``country``,
        ``os_name``, ``previous_sessions_duration``, ``start_timestamp`` and
        ``user_created_timestamp``.
    reference_ts : number, optional
        Instant the gap/life-time features are measured back from, in the same
        unit as the timestamp columns. Defaults to the notebook-level
        ``dec_14_2018`` constant.

    Returns
    -------
    DataFrame
        A new frame: ``df`` plus ``event_count``, ``purchase_count``,
        ``session_count``, ``country``, ``os_name``,
        ``previous_sessions_duration``, ``spend``, ``event_gap``,
        ``session_gap``, ``life_time``, ``os_ios`` and ``os_android``.
        Users absent from a log get NaN for that log's features (``os_name``
        becomes ``'Missing'``).
    """
    if reference_ts is None:
        reference_ts = dec_14_2018

    # Filter the purchase events once; both purchase_count and spend use them.
    purchases = events_df[events_df.event == '8']

    # Features one to seven: straightforward per-user aggregates.
    df = _join_user_aggregate(df, events_df, 'event', 'count', 'event_count')
    df = _join_user_aggregate(df, purchases, 'event', 'count', 'purchase_count')
    df = _join_user_aggregate(df, session_df, 'session_id', 'count', 'session_count')
    df = _join_user_aggregate(df, session_df, 'country', 'first')
    df = _join_user_aggregate(df, session_df, 'os_name', 'first')
    df = _join_user_aggregate(df, session_df, 'previous_sessions_duration', 'mean')
    df = _join_user_aggregate(df, purchases, 'event_value', 'sum', 'spend')

    # Features eight to ten: distance from the reference instant back to the
    # user's latest event, latest session start and account creation.
    for source, column, feature in (
        (events_df, 'event_timestamp', 'event_gap'),
        (session_df, 'start_timestamp', 'session_gap'),
        (session_df, 'user_created_timestamp', 'life_time'),
    ):
        latest = source.groupby('user_id_hash')[column].max()
        df = df.join((reference_ts - latest).rename(feature), on='user_id_hash')

    # One-hot encode the OS. The indicators are built by direct comparison rather
    # than by indexing get_dummies columns, so the function does not fail when a
    # label ('Missing', 'iOS', 'iPhone OS', 'Android OS') is absent from the data,
    # and the result is always a 0/1 integer regardless of pandas version.
    df['os_name'] = df['os_name'].fillna(value='Missing')
    df['os_ios'] = df['os_name'].isin(['iOS', 'iPhone OS']).astype('uint8')
    df['os_android'] = (df['os_name'] == 'Android OS').astype('uint8')

    return df

In [5]:
# Mean Encoding Country
def reg_target_encoding(train, col="country", splits=5,
                        target="user_purchase_binary_14_days"):
    """Add an out-of-fold mean (target) encoding of ``col`` to ``train``.

    Each row's encoding is the mean of ``target`` for its ``col`` value,
    computed only on the other folds of a stratified K-fold split, so a row's
    own label never contributes to its encoding.

    Parameters
    ----------
    train : DataFrame
        Must contain ``col`` and ``target``. Modified in place: the column
        ``f'{col}_mean_enc'`` is added (or overwritten).
    col : str
        Categorical column to encode.
    splits : int
        Number of stratified folds.
    target : str
        Binary target column used both for stratification and for the means.

    Returns
    -------
    DataFrame
        The same ``train`` object, for call-chaining.
    """
    enc_col = f'{col}_mean_enc'
    # Start as a float column so the fold means can be written without an
    # int -> float dtype change on assignment.
    train[enc_col] = float('nan')
    enc_pos = train.columns.get_loc(enc_col)

    # Use the frame that was passed in, not a notebook-level global.
    skf = StratifiedKFold(splits, shuffle=True, random_state=111)
    for computing_index, encoding_index in skf.split(train, train[target].values):
        fold_means = train.iloc[computing_index].groupby(col)[target].mean()
        encoded = train[col].iloc[encoding_index].map(fold_means)
        # The split yields positional indices, so write back by position; label
        # based .loc would hit the wrong rows on any non-default index.
        train.iloc[encoding_index, enc_pos] = encoded.astype(float).to_numpy()

    # Categories that never appear in a row's computing folds have no fold mean;
    # fall back to the global target mean, as mean_encoding_test does.
    train[enc_col] = train[enc_col].fillna(train[target].mean())
    return train

In [6]:
def mean_encoding_test(test, train, col="country",
                       target="user_purchase_binary_14_days"):
    """Mean-encode ``col`` in ``test`` using target means learned from ``train``.

    Parameters
    ----------
    test : DataFrame
        Must contain ``col``. Modified in place: the column
        ``f'{col}_mean_enc'`` is added (or overwritten).
    train : DataFrame
        Must contain ``col`` and ``target``; supplies the per-category means.
    col : str
        Categorical column to encode.
    target : str
        Target column whose mean is taken per category.

    Returns
    -------
    DataFrame
        The same ``test`` object, for call-chaining (existing callers that
        ignore the return value are unaffected).
    """
    global_mean = train[target].mean()
    category_means = train.groupby(col)[target].mean()
    encoded = test[col].map(category_means).astype(float)
    # Assign the filled result back to the frame. A chained
    # `test[...].fillna(..., inplace=True)` acts on a temporary Series and does
    # not update `test` under pandas Copy-on-Write, leaving NaN for categories
    # that are absent from `train`.
    test[f'{col}_mean_enc'] = encoded.fillna(global_mean)
    return test

# Training Set Features

In [7]:
# Build the per-user features for the training users from the training-period
# logs, then add the out-of-fold country encoding (`country_mean_enc`).
# NOTE: both lines rebind `training`, so this cell is not idempotent -- re-running
# it would try to join the same feature columns onto the already-augmented frame.
training = feature_generate(training, events_training, session_training)
training = reg_target_encoding(training)

# Prediction Set Features

Let's wrap up all the feature-generating procedures and apply them to the prediction dataset.

In [8]:
# Build the same features for the submission users.
# NOTE(review): training used `events_training` / `session_training`, while this
# uses `events` / `session` -- presumably the full logs; confirm.
submit = feature_generate(submission, events, session)
# Adds `country_mean_enc` to `submit` in place, using per-country target means
# computed on `training`.
mean_encoding_test(submit, training, col="country")

submit.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days,event_count,purchase_count,session_count,country,os_name,previous_sessions_duration,spend,event_gap,session_gap,life_time,os_ios,os_android,country_mean_enc
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02,26.0,0.0,2.0,US,iOS,356544.0,0.0,4169025000.0,4169058000.0,4219027000.0,1,0,0.015211
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02,50.0,0.0,1.0,MX,Android OS,0.0,0.0,1962124000.0,2068505000.0,2068520000.0,0,1,0.002735
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02,31.0,0.0,4.0,MX,Android OS,388631.25,0.0,3259102000.0,3448174000.0,4131999000.0,0,1,0.002735
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.01,0.02,207.0,0.0,10.0,ZW,Android OS,9200582.9,0.0,2712367000.0,2714803000.0,4368927000.0,0,1,0.0
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.01,0.02,5.0,0.0,1.0,RS,Android OS,0.0,0.0,5332535000.0,5507037000.0,5507037000.0,0,1,0.001355


<br>
<br>
<br>

# Save Notebook

In [9]:
# Persist the whole kernel namespace (every top-level name defined or loaded in
# this notebook, including the feature frames) for later use.
# NOTE(review): presumably the next notebook restores this file -- confirm.
dill.dump_session("Notebook_Saves/features.db")