# Feature Engineering


In [1]:
import pandas as pd
import numpy as np
from statistics import mean
from scipy.sparse import hstack
from scipy.sparse import vstack
from scipy import sparse 
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder
import pickle
import joblib


In [2]:
session_df = pd.read_csv(r"C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\sessions.csv")

In [3]:
session_df.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


In [4]:
session_df_concat = session_df.groupby("user_id", as_index=False).agg(lambda x: x.tolist())
session_df_concat.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,00023iyk9l,"[index, dashboard, header_userpic, dashboard, ...","[view, view, data, view, partner_callback, mes...","[view_search_results, dashboard, header_userpi...","[Mac Desktop, Mac Desktop, Mac Desktop, Mac De...","[20438.0, 787.0, 850.0, 934.0, nan, 129817.0, ..."
1,0010k6l0om,"[search_results, show, personalize, show, sear...","[click, view, data, nan, click, click, nan, da...","[view_search_results, p3, wishlist_content_upd...","[Mac Desktop, Mac Desktop, Mac Desktop, Mac De...","[1708.0, 21260.0, 1223.0, 26.0, 847.0, 1230.0,..."
2,001wyh0pz8,"[search, search, search, show, social_connecti...","[click, click, click, view, data, -unknown-, v...","[view_search_results, view_search_results, vie...","[Android App Unknown Phone/Tablet, Android App...","[622.0, 1813.0, 1507.0, 6327.0, 927.0, 142.0, ..."
3,0028jgx1x1,"[show, reviews, show, search, show, search, re...","[view, data, view, click, view, click, data, s...","[user_profile, listing_reviews, p3, view_searc...","[-unknown-, -unknown-, -unknown-, -unknown-, -...","[6162.0, 75.0, 86.0, 13710.0, 25217.0, 10989.0..."
4,002qnbzfs5,"[social_connections, payment_methods, create, ...","[data, -unknown-, -unknown-, view, data, data,...","[user_social_connections, -unknown-, -unknown-...","[iPhone, iPhone, iPhone, iPhone, iPhone, iPhon...","[17135.0, 711.0, 274.0, 179.0, 483.0, 1.0, 782..."


In [5]:
session_df_concat["total_secs"] = session_df_concat["secs_elapsed"].apply(lambda x: sum(np.nan_to_num(x)))
session_df_concat["average_secs"] = session_df_concat["secs_elapsed"].apply(lambda x: mean(np.nan_to_num(x)))

In [6]:
session_df_concat["unique_action"] = session_df_concat["action"].apply(lambda x: set(action for action in x if pd.notnull(action)))
session_df_concat["unique_action"] = session_df_concat["unique_action"].apply(lambda x: ", ".join(str(item) for item in x))

In [7]:
session_df_concat["unique_action_type"] = session_df_concat["action_type"].apply(lambda x: set(action_type for action_type in x if pd.notnull(action_type)))
session_df_concat["unique_action_type"] = session_df_concat["unique_action_type"].apply(lambda x: ", ".join(str(item) for item in x))

In [8]:
session_df_concat["unique_action_detail"] = session_df_concat["action_detail"].apply(lambda x: set(action_detail for action_detail in x if pd.notnull(action_detail)))
session_df_concat["unique_action_detail"] = session_df_concat["unique_action_detail"].apply(lambda x: ", ".join(str(item) for item in x))

In [9]:
session_df_concat["unique_device_type"] = session_df_concat["device_type"].apply(lambda x: set(device for device in x if pd.notnull(device)))
session_df_concat["unique_device_type"] = session_df_concat["unique_device_type"].apply(lambda x: ", ".join(str(item) for item in x))

In [89]:
train_df = pd.read_csv(r"C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\train_users_2.csv")

In [90]:
# ensure the date_account_created is in datetime format
train_df["date_account_created"] = pd.to_datetime(train_df["date_account_created"])

train_df["year_account_created"] = train_df["date_account_created"].dt.year
train_df["month_account_created"] = train_df["date_account_created"].dt.month
train_df["day_account_created"] = train_df["date_account_created"].dt.day

In [91]:
# fill missing age value with mean age
mean_age = train_df["age"].mean()

train_df["age"] = train_df["age"].fillna(mean_age)

In [92]:
train_session_df = train_df.merge(session_df_concat, left_on = "id", right_on = "user_id", how = "inner") 

In [93]:
print("Train shape:", train_df.shape)
print("Session shape: ", session_df_concat.shape)
print("Merged shape:", train_session_df.shape)

Train shape: (213451, 19)
Session shape:  (135483, 12)
Merged shape: (73815, 31)


In [94]:
# drop "action", "action_type", 'action_detail', 'device_type', 'secs_elapsed'
train_session_df = train_session_df.drop(["action", "action_type", 'action_detail', 'device_type', 'secs_elapsed', 'user_id', 'timestamp_first_active', "date_account_created", "date_first_booking", "first_affiliate_tracked"], axis=1)
train_session_df.head(2)

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,...,country_destination,year_account_created,month_account_created,day_account_created,total_secs,average_secs,unique_action,unique_action_type,unique_action_detail,unique_device_type
0,d1mm9tcy42,MALE,62.0,basic,0,en,sem-non-brand,google,Web,Windows Desktop,...,other,2014,1,1,3427529.0,26988.417323,"personalize, other_hosting_reviews_first, rece...","view, data, message_post, -unknown-, click, su...","wishlist_content_update, message_post, header_...","-unknown-, Windows Desktop"
1,yo8nz8bqcq,-unknown-,49.668335,basic,0,en,direct,direct,Web,Mac Desktop,...,NDF,2014,1,1,207842.0,23093.555556,"dashboard, confirm_email, personalize, header_...","data, click, submit, view","dashboard, user_profile_content_update, confir...",Mac Desktop


In [95]:
train_session_df.isnull().sum()

id                       0
gender                   0
age                      0
signup_method            0
signup_flow              0
language                 0
affiliate_channel        0
affiliate_provider       0
signup_app               0
first_device_type        0
first_browser            0
country_destination      0
year_account_created     0
month_account_created    0
day_account_created      0
total_secs               0
average_secs             0
unique_action            0
unique_action_type       0
unique_action_detail     0
unique_device_type       0
dtype: int64

In [96]:
# create X and Y 
train_session_target = train_session_df["country_destination"]
train_session_df = train_session_df.drop(["country_destination"], axis=1)

print("Y Shape:", train_session_target.shape)
print("X Shape:", train_session_df.shape)

Y Shape: (73815,)
X Shape: (73815, 20)


In [97]:
train_session_df = pd.get_dummies(train_session_df, columns=["gender", "signup_method", "language", "affiliate_channel", "affiliate_provider", 
                                                       "signup_app", "first_device_type", "first_browser", "affiliate_provider"]) # those cols are automatically dropped
train_session_df.shape

(73815, 133)

In [98]:
train_session_df = train_session_df.drop(["id"], axis=1)

In [170]:
# clean test data, follow the steps of train data

test_df = pd.read_csv(r"C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\test_users.csv")
print(test_df.shape)
test_df.columns

(62096, 15)


Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser'],
      dtype='object')

In [132]:
test_df["date_account_created"] = pd.to_datetime(test_df["date_account_created"])
test_df["year_account_created"] = test_df["date_account_created"].dt.year
test_df["month_account_created"] = test_df["date_account_created"].dt.month
test_df["day_account_created"] = test_df["date_account_created"].dt.day

In [133]:
mean_age = test_df["age"].mean()

test_df["age"] = test_df["age"].fillna(mean_age)

In [172]:
test_session_df = test_df.merge(session_df_concat, left_on = "id", right_on = "user_id", how = "inner") 
test_session_df.to_csv(r'C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\test_user_merged.csv', index=False)

In [135]:
test_session_df = pd.get_dummies(test_session_df, columns=["gender", "signup_method", "language", "affiliate_channel", "affiliate_provider", 
                                                       "signup_app", "first_device_type", "first_browser", "affiliate_provider"]) # those cols are automatically dropped
test_session_df.shape

(61668, 139)

In [136]:
test_session_df = test_session_df.drop(["id"], axis=1)

In [162]:
# split the date into train and test for modelling
from sklearn.model_selection import train_test_split

X_train, X_cv, Y_train, Y_cv = train_test_split(train_session_df, train_session_target, test_size = 0.2, random_state=414)

print("X_train:", X_train.shape)
print("Y_train:", Y_train.shape)
print("X_cv:", X_cv.shape)
print("Y_cv:", Y_cv)

X_train: (59052, 132)
Y_train: (59052,)
X_cv: (14763, 132)
Y_cv: 32057      NDF
9468       NDF
50891      NDF
9863        US
43037      NDF
         ...  
55778      NDF
14074    other
44309       US
42860    other
18290       US
Name: country_destination, Length: 14763, dtype: object


In [27]:
# TFIDF Vectorization for the text field
## create unigram and bigram features for "unique_action" field:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect_action = TfidfVectorizer(min_df = 10, max_features=5000)
tfidf_vect_action.fit(X_train["unique_action"].values)
train_session_action_tfidf = tfidf_vect_action.transform(X_train["unique_action"].values)

In [28]:
cv_session_action_tfidf = tfidf_vect_action.transform(X_cv["unique_action"].values)

In [29]:
train_session_action_tfidf.shape

(59052, 251)

In [30]:
cv_session_action_tfidf.shape

(14763, 251)

In [31]:
print("X action shape:", train_session_action_tfidf.shape)
print("CV action shape:", cv_session_action_tfidf.shape)
print("Top 10 Feature Names for Action:", tfidf_vect_action.get_feature_names_out()[:10])

X action shape: (59052, 251)
CV action shape: (14763, 251)
Top 10 Feature Names for Action: ['10' '11' '12' '15' 'about_us' 'account' 'active' 'add_guests' 'add_note'
 'agree_terms_check']


In [32]:
# do the same for unique_action_type, unique_action_detail

tfidf_vect_action_type = TfidfVectorizer(min_df=10, max_features=5000, ngram_range=(2,3)) # the list may contain 2 or 3 words 
tfidf_vect_action_type.fit(X_train["unique_action_type"].values)
train_session_action_type_tfidf = tfidf_vect_action_type.transform(X_train["unique_action_type"].values)

In [33]:
cv_session_action_type_tfidf = tfidf_vect_action_type.transform(X_cv["unique_action_type"].values)

In [34]:
print("X action type shape:", train_session_action_type_tfidf.shape)
print("CV action type shape:", cv_session_action_type_tfidf.shape)
print("Top 10 Feature Names for Action Type:", tfidf_vect_action_type.get_feature_names_out()[:10])

X action type shape: (59052, 105)
CV action type shape: (14763, 105)
Top 10 Feature Names for Action Type: ['booking_request data' 'booking_request data click'
 'booking_request message_post' 'booking_request submit'
 'booking_request submit view' 'booking_request view'
 'booking_request view data' 'booking_request view message_post'
 'click booking_request' 'click message_post']


In [35]:
tfidf_vect_action_detail = TfidfVectorizer(min_df=10, max_features=5000)
tfidf_vect_action_detail.fit(X_train["unique_action_detail"].values)
train_session_action_detail_tfidf = tfidf_vect_action_detail.transform(X_train["unique_action_detail"].values)

In [36]:
cv_session_action_detail_tfidf = tfidf_vect_action_detail.transform(X_cv["unique_action_detail"].values)

In [37]:
print("X action detail shape:", train_session_action_detail_tfidf.shape)
print("CV action detail shape:", cv_session_action_detail_tfidf.shape)
print("Top 10 Feature Names for Action Detail:", tfidf_vect_action_detail.get_feature_names_out()[:10])

X action detail shape: (59052, 119)
CV action detail shape: (14763, 119)
Top 10 Feature Names for Action Detail: ['account_notification_settings' 'account_payout_preferences'
 'account_privacy_settings' 'account_transaction_history'
 'admin_templates' 'airbnb_picks_wishlists' 'alteration_field'
 'alteration_request' 'apply_coupon' 'apply_coupon_click']


In [38]:
tfidf_vect_device = TfidfVectorizer(min_df=10, max_features=5000)
tfidf_vect_device.fit(X_train["unique_device_type"].values)
train_session_device_tfidf = tfidf_vect_device.transform(X_train["unique_device_type"].values)

In [39]:
cv_session_device_tfidf = tfidf_vect_device.transform(X_cv["unique_device_type"].values)

In [40]:
print("X device type shape:", train_session_device_tfidf.shape)
print("CV device type shape:", cv_session_device_tfidf.shape)
print("Top 10 Feature Names for Device:", tfidf_vect_device.get_feature_names_out()[:10])

X device type shape: (59052, 14)
CV device type shape: (14763, 14)
Top 10 Feature Names for Device: ['android' 'app' 'blackberry' 'chromebook' 'desktop' 'ipad' 'iphone'
 'ipodtouch' 'linux' 'mac']


In [163]:
X_train = X_train.drop(["unique_action", "unique_action_type", "unique_action_detail", "unique_device_type"], axis=1)

In [164]:
final_column_list = [col for col in X_train.columns]
final_column_list.extend(tfidf_vect_action.get_feature_names_out())
final_column_list.extend(tfidf_vect_action_detail.get_feature_names_out())
final_column_list.extend(tfidf_vect_action_type.get_feature_names_out())
final_column_list.extend(tfidf_vect_device.get_feature_names_out())
len(final_column_list)

617

In [165]:
X_cv = X_cv.drop(["unique_action", "unique_action_type", "unique_action_detail", "unique_device_type"], axis=1)

In [155]:
test_session_action_tfidf = tfidf_vect_action.transform(test_session_df["unique_action"].values)
test_session_action_type_tfidf = tfidf_vect_action_type.transform(test_session_df["unique_action_type"].values)
test_session_action_detail_tfidf = tfidf_vect_action_detail.transform(test_session_df["unique_action_detail"].values)
test_session_device_tfidf = tfidf_vect_device.transform(test_session_df["unique_device_type"].values)

KeyError: 'unique_action'

In [142]:
# drop "action", "action_type", 'action_detail', 'device_type', 'secs_elapsed'
test_session_df = test_session_df.drop(["action", "action_type", 'action_detail', 'device_type', 'secs_elapsed', 'user_id', 'timestamp_first_active', "date_account_created", "date_first_booking", "first_affiliate_tracked", 'unique_action', 'unique_action_type', 'unique_action_detail', 'unique_device_type'], axis=1)

In [166]:
# create extra cols for test data, which has less data points than train data 
all_columns = set(X_train.columns).union(set(test_session_df.columns))

for col in all_columns: 
    if col not in X_train.columns: 
        X_train[col] = 0 

for col in all_columns: 
    if col not in X_cv.columns: 
        X_cv[col] = 0 

for col in all_columns:
    if col not in test_session_df.columns: 
        test_session_df[col] = 0 
    
print(X_train.shape)
print(X_cv.shape)
print(test_session_df.shape)

(59052, 135)
(14763, 135)
(61668, 135)


In [167]:
 # concatenate the vectorized features 
X_train = sparse.hstack((X_train, train_session_action_tfidf, train_session_action_type_tfidf, train_session_action_detail_tfidf, train_session_device_tfidf)).tocsr()

In [168]:
X_cv = sparse.hstack((X_cv, cv_session_action_tfidf, cv_session_action_detail_tfidf, cv_session_action_type_tfidf, cv_session_device_tfidf)).tocsr()

In [169]:
sparse.save_npz(r'C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\X_train.npz', X_train)
sparse.save_npz(r'C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\X_cv.npz', X_cv)

In [147]:
with open(r'C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\final_column_list.text', 'wb') as fp:
    pickle.dump(final_column_list, fp)

In [148]:
Y_train.to_pickle(r'C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\Y_train.pickle')
Y_cv.to_pickle(r'C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\Y_cv.pickle') 

In [149]:
X_test = sparse.hstack((test_session_df, test_session_action_tfidf, test_session_action_detail_tfidf, test_session_action_type_tfidf, test_session_device_tfidf)).tocsr()

In [150]:
sparse.save_npz(r'C:\Users\Wenxia\Desktop\Python\Airbnb_prediction\data\X_test.npz', X_test)