In [4]:
import numpy as np 
import pandas as pd
import seaborn as sns
from subprocess import check_output

# Import data

In [5]:
# Directory holding the three input CSV files. Keeping it in one place means
# the notebook can be pointed at another location by editing a single line.
DATA_DIR = "/Users/shengwan/Desktop"

destinations = pd.read_csv(DATA_DIR + "/destinations.csv")
test = pd.read_csv(DATA_DIR + "/test.csv")
train = pd.read_csv(DATA_DIR + "/train.csv")

In [6]:
# Parse the search timestamp once, store it back, and derive the calendar
# year and month columns from the parsed values.
timestamps = pd.to_datetime(train["date_time"])
train["date_time"] = timestamps
train["year"] = timestamps.dt.year
train["month"] = timestamps.dt.month

# PCA

In [7]:
from sklearn.decomposition import PCA

# The destination table's latent columns are named d1 .. d149.
N_DEST_FEATURES = 149
latent_cols = ["d{0}".format(i + 1) for i in range(N_DEST_FEATURES)]

# Compress the latent columns down to 3 components. random_state pins the
# projection in case scikit-learn selects a randomized SVD solver, so that a
# fresh "Restart & Run All" reproduces the same dest_small values.
pca = PCA(n_components=3, random_state=0)
dest_small = pd.DataFrame(pca.fit_transform(destinations[latent_cols]))

# Carry the destination id along so the components can be joined to searches.
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

In [8]:
# Short working names. These are aliases, not copies: t1 and df are the very
# same object as train, so any in-place change made through df also shows up
# in train (and vice versa).
t1, t2 = train, test
df = t1

# Format features

In [9]:
# Parse check-in / check-out dates; values that do not match the format
# become NaT because of errors="coerce".
for stay_col in ("srch_ci", "srch_co"):
    df[stay_col] = pd.to_datetime(df[stay_col], format='%Y-%m-%d', errors="coerce")

# Start the feature dictionary with calendar parts of the search timestamp.
props = {
    part: getattr(df["date_time"].dt, part)
    for part in ["month", "day", "hour", "minute", "dayofweek", "quarter"]
}

In [10]:
# Every column except the raw datetime columns is copied into the feature
# dictionary unchanged (a same-named key such as "month" is overwritten).
raw_datetime_cols = ["date_time", "srch_ci", "srch_co"]
carryover = [col for col in df.columns if col not in raw_datetime_cols]
props.update({col: df[col] for col in carryover})

In [11]:
# Calendar parts of the check-in (ci_*) and check-out (co_*) dates.
date_props = ["month", "day", "dayofweek", "quarter"]
for prop in date_props:
    props["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
    props["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)

# Length of stay in hours as a float (NaN when either date is NaT).
# total_seconds() is used instead of .astype('timedelta64[h]') because
# pandas 2.x rejects casting to the unsupported 'h' resolution.
props["stay_span"] = (df["srch_co"] - df["srch_ci"]).dt.total_seconds() / 3600.0
ret = pd.DataFrame(props)

# DataFrame.join(on=...) matches the caller's column against the *index* of
# the other frame. dest_small carries a plain positional index, so joining it
# directly only lines up when a destination id happens to equal its row
# position. Index the PCA features by destination id first so the lookup is
# by id; this also removes the duplicated id column that previously had to be
# dropped after the join.
# NOTE(review): assumes srch_destination_id is unique in dest_small - confirm.
dest_features = dest_small.set_index("srch_destination_id")
ret = ret.join(dest_features, on="srch_destination_id", how='left')

In [58]:
# Preview the first rows of the PCA-reduced destination features.
dest_small.head()

Unnamed: 0,0,1,2,srch_destination_id
0,-0.044268,0.169419,0.032524,0
1,-0.440761,0.077405,-0.091573,1
2,0.001033,0.020677,0.012108,2
3,-0.480467,-0.040345,-0.019319,3
4,-0.207253,-0.042694,-0.011744,4


In [12]:
# Preview the first rows of the working frame (df aliases train, so this
# includes the columns added in place by the earlier cells).
df.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,year,month
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,8250,1,0,3,2,50,628,1,2014,8
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,8250,1,1,1,2,50,628,1,2014,8
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,8250,1,0,1,2,50,628,1,2014,8
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,14984,1,0,1,2,50,1457,80,2014,8
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,14984,1,0,1,2,50,1457,21,2014,8


# Model

In [13]:
# Point t1 at the working frame. When the cells are run in order, t1 and df
# already refer to the same train frame, so this does not change anything.
t1=df
def make_key(items):
    """Return a string key: the str() of every item joined by underscores."""
    return "_".join(map(str, items))

# Score every (destination, hotel_cluster) pair: each booking counts 1 and
# each non-booking row (a click) counts 0.15. The scores are collected per
# destination key as {destination_key: {hotel_cluster: score}}.
match_cols = ["srch_destination_id"]
cluster_cols = match_cols + ['hotel_cluster']
groups = t1.groupby(cluster_cols)

top_clusters = {}
for name, group in groups:
    booked_flags = group.is_booking
    bookings = int((booked_flags == True).sum())
    clicks = int((booked_flags == False).sum())
    score = bookings + .15 * clicks

    # The leading part of the group name identifies the destination, the last
    # element is the hotel cluster.
    clus_name = make_key(name[:len(match_cols)])
    top_clusters.setdefault(clus_name, {})[name[-1]] = score

In [14]:
import operator

# For every destination key keep the five highest-scoring hotel clusters,
# best first.
by_score = operator.itemgetter(1)
cluster_dict = {
    dest_key: [
        cluster
        for cluster, _ in sorted(scores.items(), key=by_score, reverse=True)[:5]
    ]
    for dest_key, scores in top_clusters.items()
}

# Look up each test row's destination key; rows whose key was never seen in
# the training data get an empty prediction list.
preds = [
    cluster_dict.get(make_key([row[col] for col in match_cols]), [])
    for _, row in t2.iterrows()
]

In [15]:
# Columns that must all agree between a test row and the training rows for an
# "exact match" lookup.
# NOTE: this rebinds the globals match_cols and groups that the popularity
# cells above also use; re-running those cells after this one would pick up
# these values instead of their own.
match_cols = ['user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance']

# Training rows grouped by the exact-match columns; read by
# generate_exact_matches below as a global.
groups = t1.groupby(match_cols)
    
def generate_exact_matches(row, match_cols):
    """Return the distinct hotel clusters of the training group matching ``row``.

    Args:
        row: Mapping-like record (e.g. one test row) indexable by column name.
        match_cols: Column names whose values, in order, form the group key.

    Returns:
        A list of the unique ``hotel_cluster`` values found in the matching
        group of the module-level ``groups`` object, or an empty list when the
        group lookup fails (for example because no such group exists).
    """
    lookup_key = tuple(row[col] for col in match_cols)
    try:
        matched = groups.get_group(lookup_key)
    except Exception:
        # Best effort: any failed lookup simply means "no exact match".
        return []
    return list(set(matched.hotel_cluster))

# One list of exact-match clusters per test row, in test-row order.
exact_matches = [
    generate_exact_matches(t2.iloc[row_pos], match_cols)
    for row_pos in range(t2.shape[0])
]

In [21]:
# The five most frequent hotel clusters in the training data, most frequent
# first (value_counts sorts descending and head() keeps the first five).
cluster_counts = train.hotel_cluster.value_counts()
most_common_clusters = list(cluster_counts.head().index)

In [22]:
def f5(seq, idfun=None):
    """Remove duplicates from ``seq`` while keeping first-seen order.

    Args:
        seq: Iterable of items to de-duplicate.
        idfun: Optional function mapping an item to the hashable marker used
            to decide whether it was seen before. Defaults to the item itself.

    Returns:
        A new list holding the first item seen for every distinct marker, in
        the order those items appeared in ``seq``.
    """
    marker_of = idfun if idfun is not None else (lambda item: item)
    seen = set()
    unique_items = []
    for item in seq:
        marker = marker_of(item)
        if marker not in seen:
            seen.add(marker)
            unique_items.append(item)
    return unique_items
    

In [48]:
# Spot-check the popularity-based prediction list at position 65534 of preds.
preds[65534]

[63, 81, 57, 60, 46]

In [31]:
# Final ranking per test row: exact matches first, then the destination's
# popular clusters, then the globally most common clusters as a fallback;
# duplicates are removed in that order and the first five are kept.
full_preds = [
    f5(exact + popular + most_common_clusters)[:5]
    for exact, popular in zip(exact_matches, preds)
]

In [35]:
# Import DataFrame from pandas' public namespace rather than the private
# pandas.core.frame module path; the same name is bound, so later cells that
# call DataFrame(...) keep working.
from pandas import DataFrame

# One row per test search, one column per ranked cluster prediction.
prediction2 = DataFrame(full_preds)
prediction2.head()

Unnamed: 0,0,1,2,3,4
0,5,37,55,11,22
1,5,91,41,48,64
2,91,0,31,96,77
3,1,45,79,24,54
4,50,51,91,42,2


# Output prediction

In [51]:
# Write the table of ranked predictions (one column per rank) to
# predict_2.csv, with a header row and the row index labelled "id".
prediction2.to_csv('predict_2.csv',header=True, index_label='id')

In [36]:
# Render each row's ranked clusters as one space-separated string.
write_p = [" ".join(map(str, ranked)) for ranked in full_preds]

In [40]:
# Single-column frame holding the space-separated prediction strings.
prediction1 = DataFrame({"hotel_cluster": write_p})
prediction1.head()

Unnamed: 0,hotel_cluster
0,5 37 55 11 22
1,5 91 41 48 64
2,91 0 31 96 77
3,1 45 79 24 54
4,50 51 91 42 2


In [50]:
# Write the single-column hotel_cluster predictions to predict_new.csv, with
# a header row and the row index labelled "id".
prediction1.to_csv('predict_new.csv',header=True, index_label='id')

In [45]:
# Display the (rows, columns) dimensions of the prediction frame.
prediction1.shape

(2528243, 1)

In [52]:
# Display the (rows, columns) dimensions of the training frame; the column
# count includes the year and month columns added to train earlier.
train.shape

(37670293, 26)