In [1]:
import numpy as np
import pandas as pd
import datetime
from datetime import date

In [2]:
train = pd.read_csv("Data/train_users.csv")
test = pd.read_csv("Data/test_users.csv")
df_sessions = pd.read_csv('Data/sessions.csv')

In [3]:
train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [4]:
test.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [5]:
df_sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


## Feature extraction

In [6]:
df_sessions['id'] = df_sessions['user_id']
df_sessions = df_sessions.drop(['user_id'],axis=1)
df_sessions.action = df_sessions.action.fillna('NAN')
df_sessions.action_type = df_sessions.action_type.fillna('NAN')
df_sessions.action_detail = df_sessions.action_detail.fillna('NAN')

In [7]:
# Action values with low frequency are changed to 'other'
act_freq = 100  # Threshold of frequency
act = dict(zip(*np.unique(df_sessions.action, return_counts=True)))
df_sessions.action = df_sessions.action.apply(lambda x: 'other' if act[x] < act_freq else x)

In [8]:

f_act = df_sessions.action.value_counts().argsort()
f_act_detail = df_sessions.action_detail.value_counts().argsort()
f_act_type = df_sessions.action_type.value_counts().argsort()
f_dev_type = df_sessions.device_type.value_counts().argsort()

# group by id
dgr_sess = df_sessions.groupby(['id'])
# Loop on dgr_sess to create all the features.
samples = [] 
ln = len(dgr_sess) # calculate length of grouped df_sessions

for g in dgr_sess:
    gr = g[1]   # data frame that comtains all the data for one user id
    l = []  #store temporary features
    l.append(g[0]) #append id

    # number of total actions
    l.append(len(gr)) # append corresponding data length of data

    # secs_elapsed null value filled with 0
    sev = gr.secs_elapsed.fillna(0).values
    
    # action features
    # The frequency of each action, 
    # the number of each action types, 
    # the mean and the standard deviation
    c_act = [0] * len(f_act)
    for i,v in enumerate(gr.action.values): 
        c_act[f_act[v]] += 1
    _, c_act_uqc = np.unique(gr.action.values, return_counts=True)
    c_act += [len(c_act_uqc), np.mean(c_act_uqc), np.std(c_act_uqc)]
    l = l + c_act

    # action_detail features
    c_act_detail = [0] * len(f_act_detail)
    for i,v in enumerate(gr.action_detail.values):
        c_act_detail[f_act_detail[v]] += 1
    _, c_act_det_uqc = np.unique(gr.action_detail.values, return_counts=True)
    c_act_detail += [len(c_act_det_uqc), np.mean(c_act_det_uqc), np.std(c_act_det_uqc)]
    l = l + c_act_detail

    # action_type features
    # log of the sum of secs_elapsed for each value
    l_act_type = [0] * len(f_act_type)
    c_act_type = [0] * len(f_act_type)
    for i,v in enumerate(gr.action_type.values):
        l_act_type[f_act_type[v]] += sev[i] # Total elapse of each action type
        c_act_type[f_act_type[v]] += 1  
    l_act_type = np.log(1 + np.array(l_act_type)).tolist() # log elapse
    _, c_act_type_uqc = np.unique(gr.action_type.values, return_counts=True)
    c_act_type += [len(c_act_type_uqc), np.mean(c_act_type_uqc), np.std(c_act_type_uqc)]
    l = l + c_act_type + l_act_type    

    # device_type features
    c_dev_type  = [0] * len(f_dev_type)
    for i,v in enumerate(gr.device_type .values):
        c_dev_type[f_dev_type[v]] += 1 
    c_dev_type.append(len(np.unique(gr.device_type.values))) 
    _, c_dev_type_uqc = np.unique(gr.device_type.values, return_counts=True)
    c_dev_type += [len(c_dev_type_uqc), np.mean(c_dev_type_uqc), np.std(c_dev_type_uqc)]        
    l = l + c_dev_type    

    # secs_elapsed features 
    l_secs = [0]*5
    l_log = [0]*15
    if len(sev) > 0:
        # Statistics about the secs_elapsed values.
        l_secs[0] = np.log(1 + np.sum(sev))
        l_secs[1] = np.log(1 + np.mean(sev)) 
        l_secs[2] = np.log(1 + np.std(sev))
        l_secs[3] = np.log(1 + np.median(sev))
        l_secs[4] = l_secs[0] / float(l[1]) #

        # Values are grouped in 15 intervals. Compute the number of values in each interval.
        log_sev = np.log(1 + sev).astype(int) 
        l_log = np.bincount(log_sev, minlength=15).tolist()                    
    l = l + l_secs + l_log

    # The list l has the feature values of one sample.
    samples.append(l)

# preparing objects    
samples = np.array(samples) 
samp_ar = samples[:, 1:].astype(np.float16) # Features except id
samp_id = samples[:, 0]   # The first column is id
    
col_names = []    #name of the columns
for i in range(len(samples[0])-1):  #except id
    col_names.append('c_' + str(i))  #column name    
df_agg_sess = pd.DataFrame(samp_ar, columns=col_names)
df_agg_sess['id'] = samp_id
df_agg_sess.index = df_agg_sess.id #set id as index


In [9]:
df_agg_sess.head()

Unnamed: 0_level_0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,c_448,c_449,c_450,c_451,c_452,c_453,c_454,c_455,c_456,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00023iyk9l,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,6.0,2.0,3.0,3.0,1.0,0.0,1.0,0.0,00023iyk9l
0010k6l0om,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,12.0,2.0,8.0,4.0,3.0,0.0,0.0,0.0,0010k6l0om
001wyh0pz8,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27.0,30.0,9.0,8.0,1.0,0.0,0.0,0.0,0.0,001wyh0pz8
0028jgx1x1,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,3.0,5.0,4.0,1.0,0.0,0.0,0.0,0028jgx1x1
002qnbzfs5,789.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,111.0,102.0,104.0,57.0,28.0,9.0,4.0,1.0,1.0,002qnbzfs5


In [10]:
train_row = train.shape[0] 
labels_full = train['country_destination'].values
train.drop(['date_first_booking'], axis = 1, inplace = True)
test.drop(['date_first_booking'], axis = 1, inplace = True)


In [11]:
#concatenate test and train
df = pd.concat([train, test], axis = 0, ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [12]:
tfa = df.timestamp_first_active.astype(str).apply(lambda x: datetime.datetime(int(x[:4]),
                                                                          int(x[4:6]), 
                                                                          int(x[6:8]),
                                                                          int(x[8:10]),
                                                                          int(x[10:12]),
                                                                          int(x[12:])))

In [13]:
# create year, month, day feature
df['tfa_year'] = np.array([x.year for x in tfa])
df['tfa_month'] = np.array([x.month for x in tfa])
df['tfa_day'] = np.array([x.day for x in tfa])

In [14]:
# create weekday feature
df['tfa_wd'] = np.array([x.isoweekday() for x in tfa]) 
df_tfa_wd = pd.get_dummies(df.tfa_wd, prefix = 'tfa_wd')  # one hot encoding 
df = pd.concat((df, df_tfa_wd), axis = 1) 
df.drop(['tfa_wd'], axis = 1, inplace = True)

In [15]:
# create season feature
Y = 2000
seasons = [(0, (date(Y,  1,  1),  date(Y,  3, 1))),  #'winter'
           (1, (date(Y,  3, 2),  date(Y,  6, 1))),  #'spring'
           (2, (date(Y,  6, 2),  date(Y,  9, 1))),  #'summer'
           (3, (date(Y,  9, 2),  date(Y, 12, 1))),  #'autumn'
           (0, (date(Y, 12, 2),  date(Y, 12, 31)))]  #'winter'

def get_season(dt):
    dt = dt.date() 
    dt = dt.replace(year=Y) # set year as 2000, which is useless in this feature
    return next(season for season, (start, end) in seasons if start <= dt <= end)

df['tfa_season'] = np.array([get_season(x) for x in tfa])
df_tfa_season = pd.get_dummies(df.tfa_season, prefix = 'tfa_season') # one hot encoding 
df = pd.concat((df, df_tfa_season), axis = 1)
df.drop(['tfa_season'], axis = 1, inplace = True)

In [16]:
dac = pd.to_datetime(df.date_account_created)

In [17]:
dt_span = dac.subtract(tfa).dt.days 
dt_span.value_counts().head()

-1    275369
 0         7
 6         4
 5         4
 1         4
dtype: int64

In [18]:
# create categorical feature: span = -1; span > -1
def get_span(dt):
    # dt is an integer
    if dt == -1:
        return 'OneDay'
    else:
        return 'other'

df['dt_span'] = np.array([get_span(x) for x in dt_span])
df_dt_span = pd.get_dummies(df.dt_span, prefix = 'dt_span')
df = pd.concat((df, df_dt_span), axis = 1)
df.drop(['dt_span'], axis = 1, inplace = True)

In [19]:
df.drop(['date_account_created','timestamp_first_active'], axis = 1, inplace = True)

In [20]:
# Age
age = df.age
age.fillna(-1, inplace = True)
av = age.values
# This are birthdays instead of age (estimating age by doing 2014 - value)
av = np.where(np.logical_and(av<2000, av>1930), 2014-av, av) 
df['age'] = av

In [21]:
# Age has many abnormal values that we need to deal with. 
def get_age(age):
    if age < 0:
        return 'NAN'
    elif (np.logical_and(age<25, age>=15)):
        return 20
    elif (np.logical_and(age<35, age>=25)):
        return 30 
    elif (np.logical_and(age<45, age>=35)):
        return 40
    elif (np.logical_and(age<55, age>=45)):
        return 50
    elif (np.logical_and(age<65, age>=55)):
        return 60
    elif (np.logical_and(age<75, age>=65)):
        return 70
    elif (np.logical_and(age<85, age>=75)):
        return 80
    else:
        return 'Unphysical' # abnormal age,[0,15][85,]

In [22]:
df['age'] = np.array([get_age(x) for x in age])
df_age = pd.get_dummies(df.age, prefix = 'age')
df = pd.concat((df, df_age), axis = 1)
df.drop(['age'], axis = 1, inplace = True)

In [23]:
feat_toOHE = ['gender', 
             'signup_method', 
             'signup_flow', 
             'language', 
             'affiliate_channel', 
             'affiliate_provider', 
             'first_affiliate_tracked', 
             'signup_app', 
             'first_device_type', 
             'first_browser']
# one-hot-encoding
for f in feat_toOHE:
    df_ohe = pd.get_dummies(df[f], prefix=f, dummy_na=True)
    df.drop([f], axis = 1, inplace = True)
    df = pd.concat((df, df_ohe), axis = 1)

In [24]:
all_row = df.shape[0] 
df.head()

Unnamed: 0,country_destination,id,tfa_year,tfa_month,tfa_day,tfa_wd_1,tfa_wd_2,tfa_wd_3,tfa_wd_4,tfa_wd_5,...,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_UC Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser,first_browser_nan
0,NDF,gxn3p5htnn,2009,3,19,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,NDF,820tgsjxq7,2009,5,23,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,US,4ft3gnwmtx,2009,6,9,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,other,bjjt8pjhuk,2009,10,31,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,US,87mebub9p4,2009,12,8,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# merge session data and user data
df_all = pd.merge(df, df_agg_sess, how='left')
Xtrain = df_all.iloc[:train_row, :]
Xtest = df_all.iloc[train_row:, :]

Xtrain_temp = []
Xtrain_temp = Xtrain

Xtrain = Xtrain.drop(columns=['id','country_destination']) # delete id and label column
Xtrain = Xtrain.fillna(-1)  # null session data fill -1

# delete rows where all session data is null
Xtrain['all_null'] = np.array([sum(r<0) for r in Xtrain.values]) 
Xtrain = Xtrain_temp[Xtrain['all_null'] == 0]
labels = Xtrain['country_destination'].values
# delete useless feature
Xtrain = Xtrain.drop(['id','country_destination','tfa_year','tfa_month'],axis=1)
Xtest = Xtest.drop(['id','country_destination','tfa_year','tfa_month'],axis=1)


# full dataset
# df_all_full = df_all.drop(columns=['id','country_destination']) 
# df_all_full = df_all_full.fillna(-1)
# df_all_full['all_null'] = np.array([sum(r<0) for r in df_all_full.values]) 
# Xtrain_full = df_all_full.iloc[:train_row, :]
# Xtest_full = df_all_full.iloc[train_row:, :]

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


In [26]:
Xtrain.head()

Unnamed: 0,tfa_day,tfa_wd_1,tfa_wd_2,tfa_wd_3,tfa_wd_4,tfa_wd_5,tfa_wd_6,tfa_wd_7,tfa_season_0,tfa_season_1,...,c_447,c_448,c_449,c_450,c_451,c_452,c_453,c_454,c_455,c_456
137021,1,0,0,1,0,0,0,0,1,0,...,11.0,27.0,15.0,4.0,9.0,10.0,14.0,5.0,1.0,0.0
137022,1,0,0,1,0,0,0,0,1,0,...,0.0,0.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0
137023,1,0,0,1,0,0,0,0,1,0,...,0.0,2.0,1.0,1.0,0.0,1.0,6.0,2.0,0.0,0.0
137024,1,0,0,1,0,0,0,0,1,0,...,16.0,22.0,28.0,29.0,7.0,6.0,7.0,7.0,1.0,0.0
137025,1,0,0,1,0,0,0,0,1,0,...,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
Xtrain.to_csv("Airbnb_xtrain_v2.csv")
Xtest.to_csv("Airbnb_xtest_v2.csv")
labels.tofile("Airbnb_ytrain_v2.csv", sep='\n', format='%s') 

# full dataset
# Xtrain_full.to_csv("Airbnb_xtrain_v2_full.csv")
# Xtest_full.to_csv("Airbnb_xtest_v2_full.csv")
# labels_full.tofile("Airbnb_ytrain_v2_full.csv", sep='\n', format='%s') 