# Creating the features

In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import time
import datetime as dt

import os
import os.path as path
import csv
import pickle
from multiprocessing import Pool, cpu_count

In [2]:
transaction_sum = pd.read_csv('data/transaction_sum.csv')
members = pd.read_csv('data/members.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/sample_submission_zero.csv')
user_logs = pd.read_csv('data/user_logs_output.csv')
recency = pd.read_csv('data/recency.csv')

In [3]:
print('Transaction Sum Shape:    %s' % str(transaction_sum.shape))
print('Members Shape:            %s' % str(members.shape))
print('Train Shape:              %s' % str(train.shape))
print('Test Shape:               %s' % str(test.shape))
print('User Logs Output:         %s' % str(user_logs.shape))
print('Recency Shape:            %s' % str(recency.shape))

Transaction Sum Shape:    (2363626, 7)
Members Shape:            (5116194, 7)
Train Shape:              (992931, 2)
Test Shape:               (970960, 2)
User Logs Output:         (5234111, 3)
Recency Shape:            (2363626, 27)


One basic feature that we can add is mean statistics of the transaction data (transaction_sum). We will also use basic member features provided in the challenge. Lastly, recency records the months that users dropped, maintained or added subscriptions. All of this will be combined to the train and test dataframe.

In [4]:
train.index = train.msno
test.index = test.msno
members.index = members.msno
members = members.drop(['msno'], axis = 1)
user_logs.index = user_logs.msno
user_logs = user_logs.drop(['msno'], axis = 1)
recency.index = recency['Unnamed: 0']
recency = recency.drop(['Unnamed: 0'], axis = 1)
transaction_sum.index = transaction_sum.msno
transaction_sum = transaction_sum.drop(['msno'], axis = 1)

train = pd.concat([train, recency, members, transaction_sum, user_logs], axis = 1, join='outer', join_axes = [train.index])
test = pd.concat([test, recency, members, transaction_sum, user_logs], axis = 1, join='outer', join_axes = [test.index])

train.rename(columns = {list(train)[-3]: 'num_transactions'}, inplace = True)
test.rename(columns = {list(test)[-3]: 'num_transactions'}, inplace = True)

We can feature engineer the number of consecutive days of maintained subscription (basically something like a yearly subscription) and consecutive monthly subscriptions. Users that have long consecutive maintained subscription might be less likely to resub compared to those that consecutively subscribe every month.

In [5]:
def len_consec_zeros(a):
    a = np.array(list(a))    # convert elements to `str`
    rr = np.argwhere(a == '0').ravel()  # find out positions of `0`
    if not rr.size:  # if there are no zeros, return 0
        return 0

    full = np.arange(rr[0], rr[-1]+1)  # get the range of spread of 0s

    # get the indices where `0` was flipped to something else
    diff = np.setdiff1d(full, rr)
    if not diff.size:     # if there are no bit flips, return the 
        return len(full)  # size of the full range

    # break the array into pieces wherever there's a bit flip
    # and the result is the size of the largest chunk
    pos, difs = full[0], []
    for el in diff:
        difs.append(el - pos)
        pos = el + 1

    difs.append(full[-1]+1 - pos)

    # return size of the largest chunk
    res = max(difs) if max(difs) != 1 else 0

    return res

def len_consec_ones(a):
    a = np.array(list(a))    # convert elements to `str`
    rr = np.argwhere(a == '1').ravel()  # find out positions of `0`
    if not rr.size:  # if there are no zeros, return 0
        return 0

    full = np.arange(rr[0], rr[-1]+1)  # get the range of spread of 0s

    # get the indices where `0` was flipped to something else
    diff = np.setdiff1d(full, rr)
    if not diff.size:     # if there are no bit flips, return the 
        return len(full)  # size of the full range

    # break the array into pieces wherever there's a bit flip
    # and the result is the size of the largest chunk
    pos, difs = full[0], []
    for el in diff:
        difs.append(el - pos)
        pos = el + 1

    difs.append(full[-1]+1 - pos)

    # return size of the largest chunk
    res = max(difs) if max(difs) != 1 else 0

    return res

In [6]:
train['concated'] = train.ix[:, 3:27].astype(str).apply(lambda x: ''.join(x), axis=1)
train['consecutive_zeros'] = train.concated.apply(lambda x: len_consec_zeros(x))
train['consecutive_ones'] = train.concated.apply(lambda x: len_consec_ones(x))

test['concated'] = test.ix[:, 3:27].astype(str).apply(lambda x: ''.join(x), axis=1)
test['consecutive_zeros'] = test.concated.apply(lambda x: len_consec_zeros(x))
test['consecutive_ones'] = test.concated.apply(lambda x: len_consec_ones(x))

### Converting Dates into Integers

We want to convert the features with dates (regisration init time, membership_expire_date, etc...) from 20170101 to integers that convey number of days in the past or number of days in the future. We will set 2017/02/28 as our pivot time (our 0) in the train data, and 2017/03/31 for test data.

In [7]:
def get_date(i):
    year = int(i / 10000)
    month = int(int(i % 10000) / 100)
    day = int(i % 100)
    return dt.date(year, month, day)

In [8]:
pivot = dt.date(2017, 2, 28)
test.loc[~test.registration_init_time.isnull(), 'registration_init_time'] = [(pivot - get_date(i)).days / 30 for i in test.loc[~test.registration_init_time.isnull(), 'registration_init_time']]
test.loc[~test.expiration_date.isnull(), 'expiration_date'] = [(pivot - get_date(i)).days / 30 for i in test.loc[~test.expiration_date.isnull(), 'expiration_date']]
test.loc[~test.transaction_date.isnull(), 'transaction_date'] = [(pivot - get_date(i)).days / 30 for i in test.loc[~test.transaction_date.isnull(), 'transaction_date']]
test.loc[~test.membership_expire_date.isnull(), 'membership_expire_date'] = [(pivot - get_date(i)).days / 30 for i in test.loc[~test.membership_expire_date.isnull(), 'membership_expire_date']]

train.loc[~train.registration_init_time.isnull(), 'registration_init_time'] = [(pivot - get_date(i)).days / 30 for i in train.loc[~train.registration_init_time.isnull(), 'registration_init_time']]
train.loc[~train.expiration_date.isnull(), 'expiration_date'] = [(pivot - get_date(i)).days / 30 for i in train.loc[~train.expiration_date.isnull(), 'expiration_date']]
train.loc[~train.transaction_date.isnull(), 'transaction_date'] = [(pivot - get_date(i)).days / 30 for i in train.loc[~train.transaction_date.isnull(), 'transaction_date']]
train.loc[~train.membership_expire_date.isnull(), 'membership_expire_date'] = [(pivot - get_date(i)).days / 30 for i in train.loc[~train.membership_expire_date.isnull(), 'membership_expire_date']]

### Missing Data

Let's do a missing data check with what we have.

In [74]:
def num_missing(x):
  return sum(x.isnull())

#Applying per column:
print ("Train missing values:")
print ("---------------------------------")
print (train.apply(num_missing, axis=0))
print ("")
print ("Test missing values:")
print ("---------------------------------")
print (test.apply(num_missing, axis=0))

Train missing values:
---------------------------------
msno                           0
is_churn                       0
201501                         0
201502                         0
201503                         0
201504                         0
201505                         0
201506                         0
201507                         0
201508                         0
201509                         0
201510                         0
201511                         0
201512                         0
201601                         0
201602                         0
201603                         0
201604                         0
201605                         0
201606                         0
201607                         0
201608                         0
201609                         0
201610                         0
201611                         0
201612                         0
201701                         0
201702                         0
city                

### Filling in Missing Data

We will fill in the missing data with averages of each feature. We will assign 'NA' for categorical variables. For continuous ones such as passed days, we will assign averages. 

In [9]:
combined = pd.concat([train, test])

For Age, we will also assign average age to those with ridiculous ages (negative ages or +95).

In [10]:
avg_age = np.mean(combined.loc[(3 <= combined['bd']) | (combined['bd'] <= 95), 'bd'])
train.loc[(5 > train['bd']) | (train['bd'] > 95) | (train['bd'].isnull()), 'bd'] = avg_age
test.loc[(5 > test['bd']) | (test['bd'] > 95) | (test['bd'].isnull()), 'bd'] = avg_age

Total Secs data is kind of weird. Some rows have extrmeely big negative values which doesn't make sense. We will just fill those with 0.

In [11]:
train.loc[train.total_secs < 0, 'total_secs'] = 0
test.loc[test.total_secs < 0, 'total_secs'] = 0
combined.loc[combined.total_secs < 0, 'total_secs'] = 0

In [12]:
# Fill in Categorical Variables
train.loc[train.city.isnull(), 'city'] = 999
test.loc[test.city.isnull(), 'city'] = 999

train.loc[train.gender.isnull(), 'gender'] = 'NA'
test.loc[test.gender.isnull(), 'gender'] = 'NA'

train.loc[train.registered_via.isnull(), 'registered_via'] = 999
test.loc[test.registered_via.isnull(), 'registered_via'] = 999

# Encode the categorical variables
train = pd.get_dummies(train, columns = ['city', 'gender', 'registered_via'])
test = pd.get_dummies(test, columns = ['city', 'gender', 'registered_via'])

In [6]:
def reject_outliers(data, m=2):
    return data[abs(data - np.mean(data)) < m * np.std(data)]

In [14]:
var = ['registration_init_time',
      'expiration_date',
      'payment_plan_days',
      'transaction_date',
      'membership_expire_date',
      'is_cancel',
      'is_auto_renew',
      'num_transactions',
      'num_unq',
      'total_secs']

# Fill in above variables

avg = np.mean(reject_outliers(combined[var]))

train_na = train[var].isnull().any(axis = 1)
test_na = test[var].isnull().any(axis = 1)
for i in np.arange(len(var)):
    #train.loc[mask, var[i]] = avg[i]
    test.loc[test_na, var[i]] = avg[i]
    
output = open('train/test_na.pkl', 'wb')
pickle.dump(test_na, output, protocol=pickle.HIGHEST_PROTOCOL)
output.close()
output = open('train/train_na.pkl', 'wb')
pickle.dump(train_na, output, protocol=pickle.HIGHEST_PROTOCOL)
output.close()

In [15]:
test.ix[test['201501'].isnull(), 3:27] = 0

I have a suspicion that the missing data from member or data are all new users, who will most likley resub.

### Save the Data

In [16]:
save_dir = path.join(os.getcwd(), 'data', 'train_out.csv')
train.to_csv(save_dir, index =  False)
print('Saved to: %s' % save_dir)
save_dir = path.join(os.getcwd(), 'data', 'test_out.csv')
test.to_csv(save_dir, index =  False)
print('Saved to: %s' % save_dir)

Saved to: C:\Users\Michael\Documents\python\kkbox\data\train_out.csv
Saved to: C:\Users\Michael\Documents\python\kkbox\data\test_out.csv


In [3]:
train_out.describe()

Unnamed: 0,is_churn,bd,registration_init_time,expiration_date,payment_plan_days,transaction_date,membership_expire_date,is_cancel,is_auto_renew,num_transactions,...,city_999.0,gender_NA,gender_female,gender_male,registered_via_3.0,registered_via_4.0,registered_via_7.0,registered_via_9.0,registered_via_13.0,registered_via_999.0
count,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0,...,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0,992931.0
mean,0.063923,20.039728,1249.884725,-195.1077,481.840846,524.065225,-11.87459,0.287857,14.859225,16.206193,...,0.117619,0.601123,0.189194,0.209682,0.106192,0.049633,0.485189,0.238267,0.0031,0.117619
std,0.244616,9.677323,1019.419367,61.03121,218.065993,229.093259,12.35679,0.564352,8.791249,7.83276,...,0.322157,0.489668,0.391663,0.407082,0.308083,0.217185,0.499781,0.426024,0.055591,0.322157
min,0.0,5.0,0.0,-3138.0,1.0,0.0,-31.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,13.628574,465.0,-214.0,330.0,370.0,-22.0,0.0,8.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,13.628574,1077.586456,-204.0,466.243007,522.100395,-12.0,0.0,14.474472,15.682239,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,26.0,1696.0,-188.0,652.0,759.0,-2.0,0.182711,22.0,22.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,95.0,4722.0,2186.0,1830.0,789.0,27.0,20.0,62.0,71.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
test_out.describe()

Unnamed: 0,is_churn,bd,registration_init_time,expiration_date,payment_plan_days,transaction_date,membership_expire_date,is_cancel,is_auto_renew,num_transactions,...,city_999.0,gender_NA,gender_female,gender_male,registered_via_3.0,registered_via_4.0,registered_via_7.0,registered_via_9.0,registered_via_13.0,registered_via_999.0
count,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0,...,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0,970960.0
mean,0.0,20.117671,1271.051062,-170.712123,473.804486,542.546286,12.903728,0.269739,14.514559,15.887573,...,0.1146,0.594903,0.192416,0.212681,0.109637,0.054319,0.47529,0.24268,0.003473,0.1146
std,0.0,9.699585,1029.280935,55.575337,224.975126,238.006918,12.205261,0.549489,8.966286,8.058614,...,0.318539,0.490911,0.394198,0.409204,0.312437,0.226647,0.499389,0.428704,0.058829,0.318539
min,0.0,5.0,31.0,-2165.0,0.0,31.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,13.628574,489.0,-187.73878,300.0,386.0,4.0,0.0,8.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,13.628574,1077.586456,-181.0,466.243007,522.100395,12.0,0.0,14.474472,15.682239,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,26.0,1719.0,-167.0,630.0,790.0,22.0,0.182711,22.0,22.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,0.0,95.0,4753.0,1045.0,1830.0,820.0,772.0,20.0,62.0,64.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Visualize what we have

In [3]:
train = pd.read_csv("data/train_out.csv")

In [4]:
def distribute(var):
    fig = sns.FacetGrid(reject_outliers(train), hue = 'is_churn', aspect = 4)
    fig.map(sns.kdeplot, var, shade = True)
    fig.add_legend()

In [None]:
col = ['registration_init_time',
      'expiration_date',
      'payment_plan_days',
      'transaction_date',
      'membership_expire_date',
      'is_cancel',
      'is_auto_renew',
      'num_transactions',
      'num_unq',
      'total_secs']
for var in col:
    distribute(var)