# Airbnb New User Bookings

In [153]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

## Read in file

In [154]:
# Training users; this file carries the country_destination target column.
train_path = "../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip"
train = pd.read_csv(train_path)
train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [155]:
# Test users; same schema as the training file minus country_destination.
test_path = "../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip"
test = pd.read_csv(test_path)
test.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [156]:
# Dtypes and non-null counts of the training frame; date_first_booking, age
# and first_affiliate_tracked are the columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       213451 non-null  object 
 1   date_account_created     213451 non-null  object 
 2   timestamp_first_active   213451 non-null  int64  
 3   date_first_booking       88908 non-null   object 
 4   gender                   213451 non-null  object 
 5   age                      125461 non-null  float64
 6   signup_method            213451 non-null  object 
 7   signup_flow              213451 non-null  int64  
 8   language                 213451 non-null  object 
 9   affiliate_channel        213451 non-null  object 
 10  affiliate_provider       213451 non-null  object 
 11  first_affiliate_tracked  207386 non-null  object 
 12  signup_app               213451 non-null  object 
 13  first_device_type        213451 non-null  object 
 14  firs

In [157]:
# Dtypes and non-null counts of the test frame; date_first_booking is empty
# for every test user, age and first_affiliate_tracked are partially missing.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62096 entries, 0 to 62095
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       62096 non-null  object 
 1   date_account_created     62096 non-null  object 
 2   timestamp_first_active   62096 non-null  int64  
 3   date_first_booking       0 non-null      float64
 4   gender                   62096 non-null  object 
 5   age                      33220 non-null  float64
 6   signup_method            62096 non-null  object 
 7   signup_flow              62096 non-null  int64  
 8   language                 62096 non-null  object 
 9   affiliate_channel        62096 non-null  object 
 10  affiliate_provider       62096 non-null  object 
 11  first_affiliate_tracked  62076 non-null  object 
 12  signup_app               62096 non-null  object 
 13  first_device_type        62096 non-null  object 
 14  first_browser         

## Concat files

In [158]:
# Set the target aside and strip it from the training frame so that train
# and test expose the same columns before they are stacked.
labels = train['country_destination'].values
train = train.drop(columns=['country_destination'])

# Test ids are needed at the end to build the submission file.
test_id = test['id']

# Calculate the number of training rows: this is the boundary used later to
# split the stacked frame back into its train and test parts.
train_row = len(train)

# Stack train on top of test with a fresh 0..n-1 index, remove the columns
# that are not used as features (id, date_first_booking and signup_app),
# and replace every remaining NaN with the sentinel -1.
df = (
    pd.concat((train, test), axis=0, ignore_index=True)
    .drop(columns=['id', 'date_first_booking', 'signup_app'])
    .fillna(-1)
)


## Data Preprocessing

### Date Account Created

In [159]:
# Parse the account-creation date once, then replace the raw column with its
# year / month / day components as separate numeric features.
dac = pd.to_datetime(df['date_account_created'], format='%Y-%m-%d')

df = df.assign(
    dac_year=dac.dt.year,
    dac_month=dac.dt.month,
    dac_day=dac.dt.day,
).drop(columns=['date_account_created'])

df.head()

Unnamed: 0,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,first_browser,dac_year,dac_month,dac_day
0,20090319043255,-unknown-,-1.0,facebook,0,en,direct,direct,untracked,Mac Desktop,Chrome,2010,6,28
1,20090523174809,MALE,38.0,facebook,0,en,seo,google,untracked,Mac Desktop,Chrome,2011,5,25
2,20090609231247,FEMALE,56.0,basic,3,en,direct,direct,untracked,Windows Desktop,IE,2010,9,28
3,20091031060129,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Mac Desktop,Firefox,2011,12,5
4,20091208061105,-unknown-,41.0,basic,0,en,direct,direct,untracked,Mac Desktop,Chrome,2010,9,14


### Timestamp first active

In [160]:
# timestamp_first_active is stored as a 14-digit integer (read here as
# YYYYMMDDhhmmss). Slice its string form into the six fixed-width fields and
# stack them as the columns of an (n_rows, 6) integer array.
stamp = df['timestamp_first_active'].astype(str)
field_slices = ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14))
tfa = np.column_stack([stamp.str[lo:hi].astype(int) for lo, hi in field_slices])

# Only the date part is kept as features; the raw timestamp is dropped.
df = df.assign(
    tfa_year=tfa[:, 0],
    tfa_month=tfa[:, 1],
    tfa_day=tfa[:, 2],
).drop(columns=['timestamp_first_active'])

In [161]:
# Preview the frame now that timestamp_first_active has been replaced by
# tfa_year / tfa_month / tfa_day.
df.head()

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,first_browser,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day
0,-unknown-,-1.0,facebook,0,en,direct,direct,untracked,Mac Desktop,Chrome,2010,6,28,2009,3,19
1,MALE,38.0,facebook,0,en,seo,google,untracked,Mac Desktop,Chrome,2011,5,25,2009,5,23
2,FEMALE,56.0,basic,3,en,direct,direct,untracked,Windows Desktop,IE,2010,9,28,2009,6,9
3,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Mac Desktop,Firefox,2011,12,5,2009,10,31
4,-unknown-,41.0,basic,0,en,direct,direct,untracked,Mac Desktop,Chrome,2010,9,14,2009,12,8


### Age

In [162]:
def user_age(age):
    """Map a raw age onto the upper bound of its age bucket.

    Parameters
    ----------
    age : number
        Raw age. Negative values are treated as missing.

    Returns
    -------
    int or str
        15 for ages in [0, 15); otherwise the smallest of 25, 35, ..., 85
        that is greater than or equal to ``age``. The string 'NA' is
        returned for negative ages, for ages above 85 and for values that
        fail every comparison (such as NaN).
    """
    if age < 0:
        return 'NA'
    if age < 15:
        return 15
    # Buckets are 10 years wide and labelled by their inclusive upper bound.
    for upper in range(25, 86, 10):
        if age <= upper:
            return upper
    return 'NA'

# Bucket every age with user_age. Collecting the results in a NumPy array
# gives all buckets one common dtype before they are one-hot encoded.
age_bucket = pd.Series(
    np.array([user_age(value) for value in df['age']]),
    index=df.index,
    name='age',
)

# One indicator column per bucket (age_<bucket>), appended in place of the
# raw age column.
df_age = pd.get_dummies(age_bucket, prefix='age')
df = pd.concat((df.drop(columns=['age']), df_age), axis=1)

In [163]:
# Preview the frame with age replaced by its one-hot age_* bucket columns.
df.head()

Unnamed: 0,gender,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,first_browser,dac_year,...,tfa_day,age_15,age_25,age_35,age_45,age_55,age_65,age_75,age_85,age_NA
0,-unknown-,facebook,0,en,direct,direct,untracked,Mac Desktop,Chrome,2010,...,19,0,0,0,0,0,0,0,0,1
1,MALE,facebook,0,en,seo,google,untracked,Mac Desktop,Chrome,2011,...,23,0,0,0,1,0,0,0,0,0
2,FEMALE,basic,3,en,direct,direct,untracked,Windows Desktop,IE,2010,...,9,0,0,0,0,0,1,0,0,0
3,FEMALE,facebook,0,en,direct,direct,untracked,Mac Desktop,Firefox,2011,...,31,0,0,0,1,0,0,0,0,0
4,-unknown-,basic,0,en,direct,direct,untracked,Mac Desktop,Chrome,2010,...,8,0,0,0,1,0,0,0,0,0


### One Hot Encoding

In [164]:
# Categorical features to expand into one indicator column per category.
# signup_flow is numeric but is treated as a category here as well.
OHE_feat = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'first_device_type',
    'first_browser',
]

# Encode all listed columns in one call: each source column is removed and
# its <feature>_<category> indicators are appended after the remaining
# columns, in the order of OHE_feat.
# No NaN indicator is requested: missing values were already replaced by -1
# (df.fillna(-1)), so they appear as their own "-1" category and a
# <feature>_nan column would be all zeros.
df = pd.get_dummies(df, columns=OHE_feat)

## Label Encoding

In [None]:
# Split the stacked feature frame back into its train and test parts: the
# first train_row rows came from the training set, the rest from the test set.
# (The stacked frame is named df; df_all is never defined in this notebook.)
X = df.iloc[:train_row, :]
X_test = df.iloc[train_row:, :]

# Encode the destination labels as integer classes. The fitted encoder is
# used again when building the submission to map predicted class indices
# back to country codes.
le = LabelEncoder()
y = le.fit_transform(labels)

## XGBoost

In [None]:
# Hyper-parameters of the boosted-tree classifier. multi:softprob makes the
# model output one probability per destination class.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.2,
    n_estimators=43,
    objective='multi:softprob',
    subsample=1,
    colsample_bytree=0.5,
    seed=1,
)

xgb = XGBClassifier(**xgb_params)
xgb.fit(X, y)

# Class probabilities for every test row.
y_pred = xgb.predict_proba(X_test)

In [None]:
# Inspect the class probabilities returned by xgb.predict_proba for the test rows.
print(y_pred)

In [None]:
# Rank the classes of every test user by predicted probability, highest
# first, and keep the five best as encoded class indices (one row per user).
top5 = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]

# One submission row per (user, candidate country). Each id is repeated once
# per kept candidate so it lines up with the row-major flattening of top5.
# test_id.values is used so the pairing is positional and does not depend on
# the index labels of test_id.
ids = np.repeat(test_id.values, top5.shape[1])  # user ids
cts = le.inverse_transform(top5.ravel())        # country codes, best first per user

# Generate submission
sub = pd.DataFrame({'id': ids, 'country': cts})
sub.to_csv('submission.csv', index=False)