# https://www.kaggle.com/rakhlin/two-sigma-connect-rental-listing-inquiries/another-python-version-of-it-is-lit-by-branden/code

In [1]:
import numpy as np
import pandas as pd
from itertools import product
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer, MultiLabelBinarizer
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def add_features(df):
    fmt = lambda s: s.replace("\u00a0", "").strip().lower()
    df["num_photo_count"] = df["photos"].apply(len)
    df["street_address"] = df['street_address'].apply(fmt)
    df["display_address"] = df["display_address"].apply(fmt)
    df["num_desc_wordcount"] = df["description"].apply(len)
    df["num_pricePerBed"] = df['price'] / df['bedrooms']
    df["num_pricePerBath"] = df['price'] / df['bathrooms']
    df["num_pricePerRoom"] = df['price'] / (df['bedrooms'] + df['bathrooms'])
    df["num_bedPerBath"] = df['bedrooms'] / df['bathrooms']
    df["num_bedBathDiff"] = df['bedrooms'] - df['bathrooms']
    df["num_bedBathSum"] = df["bedrooms"] + df['bathrooms']
    df["num_bedsPerc"] = df["bedrooms"] / (df['bedrooms'] + df['bathrooms'])

    df = df.fillna(-1).replace(np.inf, -1)
    return df


def factorize(df1, df2, column):
    ps = df1[column].append(df2[column])
    factors = ps.factorize()[0]
    df1[column] = factors[:len(df1)]
    df2[column] = factors[len(df1):]
    return df1, df2


def designate_single_observations(df1, df2, column):
    ps = df1[column].append(df2[column])
    grouped = ps.groupby(ps).size().to_frame().rename(columns={0: "size"})
    df1.loc[df1.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    df2.loc[df2.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    return df1, df2


def hcc_encode(train_df, test_df, variable, target, prior_prob, k, f=1, g=1, r_k=None, update_df=None):
    """
    See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in
    Classification and Prediction Problems" by Daniele Micci-Barreca
    """
    hcc_name = "_".join(["hcc", variable, target])

    grouped = train_df.groupby(variable)[target].agg({"size": "size", "mean": "mean"})
    grouped["lambda"] = 1 / (g + np.exp((k - grouped["size"]) / f))
    grouped[hcc_name] = grouped["lambda"] * grouped["mean"] + (1 - grouped["lambda"]) * prior_prob

    df = test_df[[variable]].join(grouped, on=variable, how="left")[hcc_name].fillna(prior_prob)
    if r_k: df *= np.random.uniform(1 - r_k, 1 + r_k, len(test_df))     # Add uniform noise. Not mentioned in original paper

    if update_df is None: update_df = test_df
    if hcc_name not in update_df.columns: update_df[hcc_name] = np.nan
    update_df.update(df)
    return

In [3]:
# Load data
X_train = pd.read_json("../input/train.json").sort_values(by="listing_id")
X_test = pd.read_json("../input/test.json").sort_values(by="listing_id")

# Make target integer, one hot encoded, calculate target priors
X_train = X_train.replace({"interest_level": {"low": 0, "medium": 1, "high": 2}})
X_train = X_train.join(pd.get_dummies(X_train["interest_level"], prefix="pred").astype(int))
prior_0, prior_1, prior_2 = X_train[["pred_0", "pred_1", "pred_2"]].mean()

In [4]:
# Add common features
X_train = add_features(X_train)
X_test = add_features(X_test) 

In [5]:
# Special designation for building_ids, manager_ids, display_address with only 1 observation
for col in ('building_id', 'manager_id', 'display_address'):
    X_train, X_test = designate_single_observations(X_train, X_test, col)

# High-Cardinality Categorical encoding
skf = StratifiedKFold(5)
attributes = product(("building_id", "manager_id"), zip(("pred_1", "pred_2"), (prior_1, prior_2)))
for variable, (target, prior) in attributes:
    hcc_encode(X_train, X_test, variable, target, prior, k=5, r_k=None)
    for train, test in skf.split(np.zeros(len(X_train)), X_train['interest_level']):
        hcc_encode(X_train.iloc[train], X_train.iloc[test], variable, target, prior, k=5, r_k=0.01, update_df=X_train)

In [6]:
# Factorize building_id, display_address, manager_id, street_address
for col in ('building_id', 'display_address', 'manager_id', 'street_address'):
    X_train, X_test = factorize(X_train, X_test, col)

In [7]:
# Create binarized features
fmt = lambda feat: [s.replace("\u00a0", "").strip().lower().replace(" ", "_") for s in feat]  # format features
X_train["features"] = X_train["features"].apply(fmt)
X_test["features"] = X_test["features"].apply(fmt)

features = [f for f_list in list(X_train["features"]) + list(X_test["features"]) for f in f_list]
ps = pd.Series(features)
grouped = ps.groupby(ps).agg(len)
features = grouped[grouped >= 10].index.sort_values().values    # limit to features with >=10 observations
mlb = MultiLabelBinarizer().fit([features])
columns = ['feature_' + s for s in mlb.classes_]
flt = lambda l: [i for i in l if i in mlb.classes_]     # filter out features not present in MultiLabelBinarizer
X_train = X_train.join(pd.DataFrame(data=mlb.transform(X_train["features"].apply(flt)), columns=columns, index=X_train.index))
X_test = X_test.join(pd.DataFrame(data=mlb.transform(X_test["features"].apply(flt)), columns=columns, index=X_test.index))


In [8]:
columns_to_drop = ["photos", "pred_0","pred_1", "pred_2", "description", "features", "created",
#                    'hcc_building_id_pred_1', 'hcc_building_id_pred_2',
#                    'hcc_manager_id_pred_1', 'hcc_manager_id_pred_2',
                  ]
X_train = X_train.drop([c for c in X_train.columns if c in columns_to_drop], axis=1)
X_test = X_test.drop([c for c in X_test.columns if c in columns_to_drop], axis=1)

In [9]:
X_train.columns.values

array([u'bathrooms', u'bedrooms', u'building_id', u'display_address',
       u'interest_level', u'latitude', u'listing_id', u'longitude',
       u'manager_id', u'price', u'street_address', 'num_photo_count',
       'num_desc_wordcount', 'num_pricePerBed', 'num_pricePerBath',
       'num_pricePerRoom', 'num_bedPerBath', 'num_bedBathDiff',
       'num_bedBathSum', 'num_bedsPerc', 'hcc_building_id_pred_1',
       'hcc_building_id_pred_2', 'hcc_manager_id_pred_1',
       'hcc_manager_id_pred_2', u'feature_1_month_free',
       u'feature_24/7_concierge', u'feature_24/7_doorman',
       u'feature_24/7_doorman_concierge', u'feature_actual_apt._photos',
       u'feature_air_conditioning', u'feature_all_pets_ok',
       u'feature_all_utilities_included',
       u'feature_assigned-parking-space', u'feature_attended_lobby',
       u'feature_backyard', u'feature_balcony',
       u'feature_basement_storage', u'feature_basketball_court',
       u'feature_bike_room', u'feature_bike_storage',
       u

In [10]:
# # Save

# X_train = X_train.sort_index(axis=1).sort_values(by="listing_id")
# X_test = X_test.sort_index(axis=1).sort_values(by="listing_id")


## https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/neural-network-w-feat-engineering-0-583lb

In [11]:
def basic_preprocess(df_train, df_test, n_min=50, precision=3):
    
    # Interest: Numerical encoding of interest level
    df_train['y'] = 0.0
    df_train.loc[df_train.interest_level=='medium', 'y'] = 1.0
    df_train.loc[df_train.interest_level=='high', 'y'] = 2.0
    
    # Location features: Latitude, longitude
    df_train['num_latitude'] = df_train.latitude.values
    df_test['num_latitude'] = df_test.latitude.values
    df_train['num_longitude'] = df_train.longitude.values
    df_test['num_longitude'] = df_test.longitude.values
    x = np.sqrt(((df_train.latitude - df_train.latitude.median())**2) + (df_train.longitude - df_train.longitude.median())**2)
    df_train['num_dist_from_center'] = x.values
    x = np.sqrt(((df_test.latitude - df_train.latitude.median())**2) + (df_test.longitude - df_train.longitude.median())**2)
    df_test['num_dist_from_center'] = x.values
    df_train['pos'] = df_train.longitude.round(precision).astype(str) + '_' + df_train.latitude.round(precision).astype(str)
    df_test['pos'] = df_test.longitude.round(precision).astype(str) + '_' + df_test.latitude.round(precision).astype(str)
    
    # Degree of "outlierness"
    OutlierAggregated = (df_train.bedrooms > 4).astype(float)
    OutlierAggregated2 = (df_test.bedrooms > 4).astype(float)
    OutlierAggregated += (df_train.bathrooms > 3).astype(float)
    OutlierAggregated2 += (df_test.bathrooms > 3).astype(float)
    OutlierAggregated += (df_train.bathrooms < 1).astype(float)
    OutlierAggregated2 += (df_test.bathrooms < 1).astype(float)
    x = np.abs((df_train.price - df_train.price.median())/df_train.price.std()) > 0.30
    OutlierAggregated += x.astype(float)
    x2 = np.abs((df_test.price - df_train.price.median())/df_train.price.std()) > 0.30
    OutlierAggregated2 += x2.astype(float)
    x = np.log1p(df_train.price/(df_train.bedrooms.clip(1,3) + df_train.bathrooms.clip(1,2))) > 8.2
    OutlierAggregated += x.astype(float)
    x2 = np.log1p(df_test.price/(df_test.bedrooms.clip(1,3) + df_test.bathrooms.clip(1,2))) > 8.2
    OutlierAggregated2 += x2.astype(float)
    x = np.sqrt(((df_train.latitude - df_train.latitude.median())**2) + (df_train.longitude - df_train.longitude.median())**2) > 0.30
    OutlierAggregated += x.astype(float)
    x2 = np.sqrt(((df_test.latitude - df_train.latitude.median())**2) + (df_test.longitude - df_train.longitude.median())**2) > 0.30
    OutlierAggregated2 += x2.astype(float)
    df_train['num_OutlierAggregated'] = OutlierAggregated.values
    df_test['num_OutlierAggregated'] = OutlierAggregated2.values
    
#     # Average interest in unique locations at given precision
#     x = df_train.groupby('pos')['y'].aggregate(['count', 'mean'])
#     d = x.loc[x['count'] >= n_min, 'mean'].to_dict()
#     impute = df_train.y.mean()
#     df_train['num_pos'] = df_train.pos.apply(lambda x: d.get(x, impute))
#     df_test['num_pos'] = df_test.pos.apply(lambda x: d.get(x, impute))
    
    # Density in unique locations at given precision
    vals = df_train['pos'].value_counts()
    dvals = vals.to_dict()
    df_train['num_pos_density'] = df_train['pos'].apply(lambda x: dvals.get(x, vals.min()))
    df_test['num_pos_density'] = df_test['pos'].apply(lambda x: dvals.get(x, vals.min()))

    # Building null
    df_train['num_building_null'] = (df_train.building_id=='0').astype(float)
    df_test['num_building_null'] = (df_test.building_id=='0').astype(float)
    
#     # Building supervised
#     x = df_train.groupby('building_id')['y'].aggregate(['count', 'mean'])
#     d = x.loc[x['count'] >= n_min, 'mean'].to_dict()
#     impute = df_train.y.mean()
#     df_train['num_building_id'] = df_train.building_id.apply(lambda x: d.get(x, impute))
#     df_test['num_building_id'] = df_test.building_id.apply(lambda x: d.get(x, impute))
    
    # Building frequency
    d = np.log1p(df_train.building_id.value_counts()).to_dict()
    impute = np.min(np.array(list(d.values())))
    df_train['num_fbuilding'] = df_train.building_id.apply(lambda x: d.get(x, impute))
    df_test['num_fbuilding'] = df_test.building_id.apply(lambda x: d.get(x, impute))
    
#     # Manager supervised
#     x = df_train.groupby('manager_id')['y'].aggregate(['count', 'mean'])
#     d = x.loc[x['count'] >= n_min, 'mean'].to_dict()
#     impute = df_train.y.mean()
#     df_train['num_manager'] = df_train.manager_id.apply(lambda x: d.get(x, impute))
#     df_test['num_manager'] = df_test.manager_id.apply(lambda x: d.get(x, impute))

    # Manager frequency
    d = np.log1p(df_train.manager_id.value_counts()).to_dict()
    impute = np.min(np.array(list(d.values())))
    df_train['num_fmanager'] = df_train.manager_id.apply(lambda x: d.get(x, impute))
    df_test['num_fmanager'] = df_test.manager_id.apply(lambda x: d.get(x, impute))
    
    # Creation time features
    df_train['created'] = pd.to_datetime(df_train.created)
    df_train['num_created_weekday'] = df_train.created.dt.dayofweek.astype(float)
    df_train['num_created_weekofyear'] = df_train.created.dt.weekofyear
    df_train['num_created_day'] = df_train.created.dt.day
    df_train['num_created_month'] = df_train.created.dt.month
    df_train['num_created_hour'] = df_train.created.dt.hour
  
    df_test['created'] = pd.to_datetime(df_test.created)
    df_test['num_created_weekday'] = df_test.created.dt.dayofweek
    df_test['num_created_weekofyear'] = df_test.created.dt.weekofyear
    df_test['num_created_day'] = df_test.created.dt.day
    df_test['num_created_month'] = df_test.created.dt.month
    df_test['num_created_hour'] = df_test.created.dt.hour
    
    # Bedrooms/Bathrooms/Price
    df_train['num_bathrooms'] = df_train.bathrooms.clip_upper(4)
    df_test['num_bathrooms'] = df_test.bathrooms.clip_upper(4)
    df_train['num_bedrooms'] = df_train.bedrooms.clip_upper(5)
    df_test['num_bedrooms'] = df_test.bedrooms.clip_upper(5)
    df_train['num_price'] = df_train.price.clip_upper(10000)
    df_test['num_price'] = df_test.price.clip_upper(10000)
    bins = df_train.price.quantile(np.arange(0.05, 1, 0.05))
    df_train['num_price_q'] = np.digitize(df_train.price, bins)
    df_test['num_price_q'] = np.digitize(df_test.price, bins)
    
    # Composite features based on: 
    # https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/a-proxy-for-sqft-and-the-interest-on-1-2-baths
    df_train['num_priceXroom'] = (df_train.price / (1 + df_train.bedrooms.clip(1, 4) + 0.5*df_train.bathrooms.clip(0, 2))).values
    df_test['num_priceXroom'] = (df_test.price / (1 + df_test.bedrooms.clip(1, 4) + 0.5*df_test.bathrooms.clip(0, 2))).values
    df_train['num_even_bathrooms'] = ((np.round(df_train.bathrooms) - df_train.bathrooms)==0).astype(float)
    df_test['num_even_bathrooms'] = ((np.round(df_test.bathrooms) - df_test.bathrooms)==0).astype(float)
    
    # Other features
    df_train['num_features'] = df_train.features.apply(lambda x: len(x))
    df_test['num_features'] = df_test.features.apply(lambda x: len(x))
    df_train['num_photos'] = df_train.photos.apply(lambda x: len(x))
    df_test['num_photos'] = df_test.photos.apply(lambda x: len(x))
    df_train['num_desc_length'] = df_train.description.str.split(' ').str.len()
    df_test['num_desc_length'] = df_test.description.str.split(' ').str.len()
    df_train['num_desc_length_null'] = (df_train.description.str.len()==0).astype(float)
    df_test['num_desc_length_null'] = (df_test.description.str.len()==0).astype(float)
    
#     # Features/Description Features
#     bows = {'nofee': ['no fee', 'no-fee', 'no  fee', 'nofee', 'no_fee'],
#             'lowfee': ['reduced_fee', 'low_fee','reduced fee', 'low fee'],
#             'furnished': ['furnished'],
#             'parquet': ['parquet', 'hardwood'],
#             'concierge': ['concierge', 'doorman', 'housekeep','in_super'],
#             'prewar': ['prewar', 'pre_war', 'pre war', 'pre-war'],
#             'laundry': ['laundry', 'lndry'],
#             'health': ['health', 'gym', 'fitness', 'training'],
#             'transport': ['train', 'subway', 'transport'],
#             'parking': ['parking'],
#             'utilities': ['utilities', 'heat water', 'water included']
#           }
#     for fname, bow in bows.items():
#         x1 = df_train.description.str.lower().apply(lambda x: np.sum([1 for i in bow if i in x]))
#         x2 = df_train.features.apply(lambda x: np.sum([1 for i in bow if i in ' '.join(x).lower()]))
#         df_train['num_'+fname] = ((x1 + x2) > 0).astype(float).values
#         x1 = df_test.description.str.lower().apply(lambda x: np.sum([1 for i in bow if i in x]))
#         x2 = df_test.features.apply(lambda x: np.sum([1 for i in bow if i in ' '.join(x).lower()]))
#         df_test['num_'+fname] = ((x1 + x2) > 0).astype(float).values

    return df_train, df_test

In [12]:
df = pd.read_json('../input/train.json')
df_test = pd.read_json('../input/test.json')
df['created'] = pd.to_datetime(df.created)
df_test['created'] = pd.to_datetime(df_test.created)


df_test.loc[df_test.bathrooms == 112.0,'bathrooms'] = 1.5    
df_test.loc[df_test.bathrooms == 20.0,'bathrooms'] = 2.0
df_test.loc[14609,'bedrooms'] = 3
df_test.loc[57733,'bedrooms'] = 6

In [13]:
# Get relevant features
df, df_test = basic_preprocess(df, df_test, n_min=15, precision=3)

In [14]:
# Make target integer, one hot encoded, calculate target priors
# X_train = X_train.replace({"interest_level": {"low": 0, "medium": 1, "high": 2}})
df = df.join(pd.get_dummies(df["y"].astype(int), prefix="pred").astype(int))
prior_0, prior_1, prior_2 = df[["pred_0", "pred_1", "pred_2"]].mean()

In [15]:
# Special designation for pos with only 1 observation
df, df_test = designate_single_observations(df, df_test, 'pos')

In [16]:
# High-Cardinality Categorical encoding
skf = StratifiedKFold(5)
attributes = product((['pos']), zip(("pred_1", "pred_2"), (prior_1, prior_2)))
for variable, (target, prior) in attributes:
    hcc_encode(df, df_test, variable, target, prior, k=5, r_k=None)
    for train, test in skf.split(np.zeros(len(df)), df['y']):
        hcc_encode(df.iloc[train], df.iloc[test], variable, target, prior, k=5, r_k=0.01, update_df=df)

In [17]:
dftemp = df.copy()
for i in ['latitude', 'longitude']:
    while(1):
        x = dftemp[i].median()
        ix = abs(dftemp[i] - x) > 3*dftemp[i].std()
        if ix.sum()==0:
            break
        dftemp.loc[ix, i] = np.nan
dftemp = dftemp.loc[dftemp[['latitude', 'longitude']].isnull().sum(1) == 0, :]

dfm = DataFrameMapper([(['latitude'], [StandardScaler()]), (['longitude'], [StandardScaler()])])

for i in [6,40]:
    pipe_location = make_pipeline(dfm, KMeans(n_clusters=i, random_state=1))
    pipe_location.fit(dftemp);
    df['location_'+str(i)] = pipe_location.predict(df).astype(str)
    df_test['location_'+str(i)] = pipe_location.predict(df_test).astype(str)
for i in df.location_6.unique():
    df['num_location_6_'+str(i)] = (df.location_6==i).astype(float)
    df_test['num_location_6_'+str(i)] = (df_test.location_6==i).astype(float)
for i in df.location_40.unique():
    df['num_location_40_'+str(i)] = (df.location_40==i).astype(float)
    df_test['num_location_40_'+str(i)] = (df_test.location_40==i).astype(float)
print 'Done!'

Done!


In [18]:
df['tmp_bathrooms'] = df.bathrooms.clip_upper(2)
df_test['tmp_bathrooms'] = df_test.bathrooms.clip_upper(2)
df['tmp_bedrooms'] = df.bedrooms.clip_upper(4)
df_test['tmp_bedrooms'] = df_test.bedrooms.clip_upper(4)
df['roomcal'] = df.tmp_bedrooms.astype(str) + '_' + df.tmp_bathrooms.astype(str)    
df_test['roomcal'] = df_test.tmp_bedrooms.astype(str) + '_' + df_test.tmp_bathrooms.astype(str)    

room_lb = LabelBinarizer()
room_lb.fit(df['roomcal'])
room_col = ['num_room_type_' + str(x) for x in range(len(df['roomcal'].unique()))]

In [19]:
room_col = ['num_room_type_' + str(x) for x in range(len(df['roomcal'].unique()))]
df = df.join(pd.DataFrame(room_lb.transform(df['roomcal']),columns=room_col,index=df.index))
df_test = df_test.join(pd.DataFrame(room_lb.transform(df_test['roomcal']),columns=room_col,index=df_test.index))

In [20]:
# room cal + location VS price

tmp = df.groupby(['roomcal','location_6'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_6_median_price'})
    
df = df.merge(tmp,on=['roomcal','location_6'],how='left')
df_test = df_test.merge(tmp,on=['roomcal','location_6'],how='left')

df_test.loc[27462,'num_6_median_price'] =  7200.0

df['num_6_price_ratio'] = df['num_price'] / df['num_6_median_price']
df['num_6_price_diff'] = df['num_price'] - df['num_6_median_price']
df_test['num_6_price_ratio'] = df_test['num_price'] / df_test['num_6_median_price']
df_test['num_6_price_diff'] = df_test['num_price'] - df_test['num_6_median_price']

tmp = df.groupby(['roomcal','location_40'])['num_price'].median().\
            reset_index().rename(columns={'num_price':'num_loc_median_price'})
    
df = df.merge(tmp,on=['roomcal','location_40'],how='left')
df_test = df_test.merge(tmp,on=['roomcal','location_40'],how='left')

# introduced NaN in 'num_loc_price_diff' before!!!
nul_ind = df_test.num_loc_median_price.isnull()
df_test.loc[nul_ind,['num_loc_median_price']] = \
    df_test.loc[nul_ind,['num_6_median_price']].values
                
df['num_loc_price_ratio'] = df['num_price'] / df['num_loc_median_price']
df['num_loc_price_diff'] = df['num_price'] - df['num_loc_median_price']
df_test['num_loc_price_ratio'] = df_test['num_price'] / df_test['num_loc_median_price']
df_test['num_loc_price_diff'] = df_test['num_price'] - df_test['num_loc_median_price']




df['num_loc_ratio'] = df['num_6_median_price'] / df['num_loc_median_price']
df['num_loc_diff'] = df['num_6_median_price'] - df['num_loc_median_price']
df_test['num_loc_ratio'] = df_test['num_6_median_price'] / df_test['num_loc_median_price']
df_test['num_loc_diff'] = df_test['num_6_median_price'] - df_test['num_loc_median_price']

In [23]:
df.columns.values

array([u'bathrooms', u'bedrooms', u'building_id', u'created',
       u'description', u'display_address', u'features', u'interest_level',
       u'latitude', u'listing_id', u'longitude', u'manager_id', u'photos',
       u'price', u'street_address', 'y', 'num_latitude', 'num_longitude',
       'num_dist_from_center', 'pos', 'num_OutlierAggregated',
       'num_pos_density', 'num_building_null', 'num_fbuilding',
       'num_fmanager', 'num_created_weekday', 'num_created_weekofyear',
       'num_created_day', 'num_created_month', 'num_created_hour',
       'num_bathrooms', 'num_bedrooms', 'num_price', 'num_price_q',
       'num_priceXroom', 'num_even_bathrooms', 'num_features',
       'num_photos', 'num_desc_length', 'num_desc_length_null', 'pred_0',
       'pred_1', 'pred_2', 'hcc_pos_pred_1', 'hcc_pos_pred_2',
       'location_6', 'location_40', 'num_location_6_3', 'num_location_6_1',
       'num_location_6_0', 'num_location_6_5', 'num_location_6_4',
       'num_location_6_2', 'num_locat

In [24]:
feats = [i for i in df.columns.values if i.startswith('num_')]
feats.append('listing_id')
feats.extend(['hcc_pos_pred_1', 'hcc_pos_pred_2'])

In [25]:
x_train = df[feats]
x_test = df_test[feats]
print x_train.shape, x_test.shape

(49352, 100) (74659, 100)


In [26]:
X_train.columns.values

array([u'bathrooms', u'bedrooms', u'building_id', u'display_address',
       u'interest_level', u'latitude', u'listing_id', u'longitude',
       u'manager_id', u'price', u'street_address', 'num_photo_count',
       'num_desc_wordcount', 'num_pricePerBed', 'num_pricePerBath',
       'num_pricePerRoom', 'num_bedPerBath', 'num_bedBathDiff',
       'num_bedBathSum', 'num_bedsPerc', 'hcc_building_id_pred_1',
       'hcc_building_id_pred_2', 'hcc_manager_id_pred_1',
       'hcc_manager_id_pred_2', u'feature_1_month_free',
       u'feature_24/7_concierge', u'feature_24/7_doorman',
       u'feature_24/7_doorman_concierge', u'feature_actual_apt._photos',
       u'feature_air_conditioning', u'feature_all_pets_ok',
       u'feature_all_utilities_included',
       u'feature_assigned-parking-space', u'feature_attended_lobby',
       u'feature_backyard', u'feature_balcony',
       u'feature_basement_storage', u'feature_basketball_court',
       u'feature_bike_room', u'feature_bike_storage',
       u

In [27]:
X_train_feature = ['building_id','display_address','listing_id','manager_id','street_address',
                   'num_pricePerBed', 'num_pricePerBath',
       'num_pricePerRoom', 'num_bedPerBath', 'num_bedBathDiff',
       'num_bedBathSum', 'num_bedsPerc', 
                   'hcc_building_id_pred_1','hcc_building_id_pred_2', 
                   'hcc_manager_id_pred_1','hcc_manager_id_pred_2',
                   u'feature_1_month_free',
       u'feature_24/7_concierge', u'feature_24/7_doorman',
       u'feature_24/7_doorman_concierge', u'feature_actual_apt._photos',
       u'feature_air_conditioning', u'feature_all_pets_ok',
       u'feature_all_utilities_included',
       u'feature_assigned-parking-space', u'feature_attended_lobby',
       u'feature_backyard', u'feature_balcony',
       u'feature_basement_storage', u'feature_basketball_court',
       u'feature_bike_room', u'feature_bike_storage',
       u'feature_billiards_room', u'feature_billiards_table_and_wet_bar',
       u'feature_brand_new', u'feature_breakfast_bar', u'feature_bright',
       u'feature_brownstone', u'feature_building-common-outdoor-space',
       u'feature_business_center', u'feature_cable/satellite_tv',
       u'feature_cable_ready',
       u'feature_call/text_abraham_caro_@_917-373-0862',
       u'feature_cats_allowed', u'feature_central_a/c',
       u'feature_central_ac', u'feature_central_air',
       u'feature_chefs_kitchen', u"feature_children's_playroom",
       u'feature_childrens_playroom', u'feature_cinema_room',
       u'feature_city_view', u'feature_close_to_subway',
       u'feature_closets_galore!',
       u'feature_club_sun_deck_has_spectacular_city_and_river_views',
       u'feature_cold_storage', u'feature_common_backyard',
       u'feature_common_garden', u'feature_common_outdoor_space',
       u'feature_common_parking/garage', u'feature_common_roof_deck',
       u'feature_common_storage', u'feature_common_terrace',
       u'feature_community_recreation_facilities',
       u'feature_complimentary_sunday_brunch', u'feature_concierge',
       u'feature_concierge_service', u'feature_condo_finishes',
       u'feature_courtyard', u'feature_crown_moldings', u'feature_deck',
       u'feature_deco_brick_wall', u'feature_decorative_fireplace',
       u'feature_dining_room', u'feature_dishwasher',
       u'feature_dogs_allowed', u'feature_doorman',
       u'feature_dry_cleaning_service', u'feature_dryer_in_unit',
       u'feature_duplex', u'feature_duplex_lounge',
       u'feature_eat-in_kitchen', u'feature_eat_in_kitchen',
       u'feature_elegant_glass-enclosed_private_lounge_with_magnificent_river_views',
       u'feature_elevator', u'feature_exclusive',
       u'feature_exercise/yoga_studio', u'feature_exposed_brick',
       u'feature_extra_room', u'feature_fireplace', u'feature_fireplaces',
       u'feature_fitness_center', u'feature_fitness_room',
       u'feature_flex-2', u'feature_flex-3',
       u'feature_free_wifi_in_club_lounge', u'feature_ft_doorman',
       u'feature_full-time_doorman', u'feature_full_service_garage',
       u'feature_fully-equipped_club_fitness_center',
       u'feature_fully__equipped', u'feature_furnished',
       u'feature_game_room', u'feature_garage',
       u'feature_garbage_disposal', u'feature_garden',
       u'feature_garden/patio', u'feature_granite_countertops',
       u'feature_granite_kitchen', u'feature_green_building',
       u'feature_guarantors_accepted', u'feature_gut_renovated',
       u'feature_gym', u'feature_gym/fitness', u'feature_gym_in_building',
       u'feature_hardwood', u'feature_hardwood_floors',
       u'feature_health_club', u'feature_hi_rise',
       u'feature_high-speed_internet', u'feature_high_ceiling',
       u'feature_high_ceilings', u'feature_high_speed_internet',
       u'feature_highrise', u'feature_housekeeping_service',
       u'feature_in-unit_washer/dryer', u'feature_indoor_pool',
       u'feature_intercom', u'feature_jacuzzi',
       u'feature_large_living_room', u'feature_laundry',
       u'feature_laundry_&_housekeeping', u'feature_laundry_in_building',
       u'feature_laundry_in_unit', u'feature_laundry_on_every_floor',
       u'feature_laundry_on_floor', u'feature_laundry_room',
       u'feature_light', u'feature_live-in_super',
       u'feature_live-in_superintendent', u'feature_live/work',
       u'feature_live_in_super', u'feature_loft', u'feature_lounge',
       u'feature_lounge_room', u'feature_lowrise',
       u'feature_luxury_building', u'feature_magnificent_venetian-style',
       u'feature_mail_room', u'feature_marble_bath',
       u'feature_marble_bathroom', u'feature_media_room',
       u'feature_media_screening_room', u'feature_microwave',
       u'feature_midrise', u'feature_multi-level',
       u'feature_new_construction', u'feature_newly_renovated',
       u'feature_no_fee', u'feature_no_pets',
       u'feature_on-site_atm_machine', u'feature_on-site_attended_garage',
       u'feature_on-site_garage', u'feature_on-site_laundry',
#        u'feature_on-site_lifestyle_concierge_by_luxury_attach\xe9',
       u'feature_on-site_parking', u'feature_on-site_parking_available',
       u'feature_on-site_parking_lot', u'feature_on-site_super',
       u'feature_one_month_free', u'feature_outdoor_areas',
       u'feature_outdoor_entertainment_space', u'feature_outdoor_pool',
       u'feature_outdoor_roof_deck_overlooking_new_york_harbor_and_battery_park',
       u'feature_outdoor_space', u'feature_package_room',
       u'feature_parking', u'feature_parking_available',
       u'feature_parking_space', u'feature_part-time_doorman',
       u'feature_party_room', u'feature_patio', u'feature_penthouse',
       u'feature_pet_friendly', u'feature_pets', u'feature_pets_allowed',
       u'feature_pets_on_approval', u'feature_playroom',
       u'feature_playroom/nursery', u'feature_pool', u'feature_post-war',
       u'feature_post_war', u'feature_pre-war', u'feature_pre_war',
       u'feature_prewar', u'feature_private-balcony',
       u'feature_private-outdoor-space', u'feature_private_backyard',
       u'feature_private_balcony', u'feature_private_deck',
       u'feature_private_garden',
       u'feature_private_laundry_room_on_every_floor',
       u'feature_private_outdoor_space', u'feature_private_parking',
       u'feature_private_roof_deck', u'feature_private_roofdeck',
       u'feature_private_terrace', u'feature_publicoutdoor',
       u'feature_queen_size_bedrooms', u'feature_queen_sized_rooms',
       u'feature_reduced_fee', u'feature_renovated',
       u'feature_renovated_kitchen', u'feature_residents_garden',
       u'feature_residents_lounge', u'feature_roof-deck',
       u'feature_roof_access', u'feature_roof_deck',
       u'feature_roof_deck_with_grills', u'feature_roofdeck',
       u'feature_rooftop_deck', u'feature_rooftop_terrace',
       u'feature_s/s_appliances', u'feature_sauna',
       u'feature_screening_room', u'feature_separate_kitchen',
       u'feature_shared_backyard', u'feature_shared_garden',
       u'feature_shares_ok', u'feature_short_term_allowed',
       u'feature_simplex', u'feature_skylight', u'feature_skylight_atrium',
       u'feature_southern_exposure', u'feature_spa_services',
       u'feature_ss_appliances', u'feature_stainless_steel',
       u'feature_stainless_steel_appliances',
       u'feature_state-of-the-art_fitness_center', u'feature_storage',
       u'feature_storage_available',
       u'feature_storage_facilities_available', u'feature_storage_room',
       u'feature_sublet', u'feature_subway', u'feature_sundeck',
       u'feature_swimming_pool', u'feature_tenant_lounge',
       u'feature_terrace', u'feature_terraces_/_balconies',
       u'feature_tons_of_natural_light', u'feature_valet',
       u'feature_valet_parking', u'feature_valet_service',
       u'feature_valet_services',
       u'feature_valet_services_including_dry_cleaning',
       u'feature_video_intercom', u'feature_view',
       u'feature_virtual_doorman', u'feature_virtual_tour',
       u'feature_walk-in_closet', u'feature_walk-up',
       u'feature_walk_in_closet', u'feature_walk_in_closet(s)',
       u'feature_washer/dryer', u'feature_washer/dryer_hookup',
       u'feature_washer/dryer_in-unit',
       u'feature_washer/dryer_in_building',
       u'feature_washer/dryer_in_unit', u'feature_washer_&_dryer',
       u'feature_washer_in_unit', u'feature_wheelchair_access',
       u'feature_wheelchair_ramp', u'feature_wifi', u'feature_wifi_access',
       u'feature_wood-burning_fireplace', u'feature_yard',
       u'feature_yoga_classes']

In [28]:
train_X = x_train.merge(X_train[X_train_feature],on='listing_id',how='left')
test_X = x_test.merge(X_test[X_train_feature],on='listing_id',how='left')
print train_X.shape
print test_X.shape

(49352, 376)
(74659, 376)


In [29]:
data_path = "../input/"
train_X_0322 = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X_0322 = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')
train_y_0322 = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
sub_id = test_X_0322.listing_id.astype('int32').values
# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X_0322.shape, test_X_0322.shape, train_y_0322.shape

(49352, 322) (74659, 322) (49352L,)


train_X_0322.columns.values

In [30]:
train_X_0322_features = ['building_id_mean_med',
       'building_id_mean_high', 'manager_id_mean_med',
       'manager_id_mean_high','median_price_bed', 'ratio_bed',
#                          'created_month', 'created_hour',
       'compound', 'neg', 'neu', 'pos', 'street',
       'avenue', 'east', 'west', 'north', 'south', 'other_address',
       'top_10_manager', 'top_25_manager', 'top_5_manager',
       'top_50_manager', 'top_1_manager', 'top_2_manager',
       'top_15_manager', 'top_20_manager', 'top_30_manager',
       'Zero_building_id', 'top_10_building', 'top_25_building',
       'top_5_building', 'top_50_building', 'top_1_building',
       'top_2_building', 'top_15_building', 'top_20_building',
       'top_30_building','listing_id']

In [31]:
train_X_0322 = train_X_0322[train_X_0322_features].merge(train_X,on='listing_id',how='left')
test_X_0322 = test_X_0322[train_X_0322_features].merge(test_X,on='listing_id',how='left')

print train_X_0322.shape
print test_X_0322.shape

(49352, 412)
(74659, 412)


In [33]:
from sklearn.model_selection import train_test_split

train_X_new, val_X_new, y_train, y_val = train_test_split(train_X_0322, train_y_0322, train_size=.80, random_state=1234)
print train_X_new.shape
print val_X_new.shape

(39481, 412)
(9871, 412)


In [34]:
import xgboost as xgb

rgr = xgb.XGBClassifier(objective = 'multi:softprob',
                       learning_rate = 0.1,
                       n_estimators = 10000,
                       nthread = -1,
                       max_depth=6)

rgr.fit(train_X_new,y_train,
        eval_set=[(val_X_new,y_val)],
        eval_metric='mlogloss',
#         num_class = 3,
        early_stopping_rounds=50,
        verbose=25
       )

[0]	validation_0-mlogloss:1.03163
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[25]	validation_0-mlogloss:0.600488
[50]	validation_0-mlogloss:0.560549
[75]	validation_0-mlogloss:0.548337
[100]	validation_0-mlogloss:0.542184
[125]	validation_0-mlogloss:0.539399
[150]	validation_0-mlogloss:0.537335
[175]	validation_0-mlogloss:0.535876
[200]	validation_0-mlogloss:0.535486
[225]	validation_0-mlogloss:0.535061
[250]	validation_0-mlogloss:0.535267
[275]	validation_0-mlogloss:0.534973
[300]	validation_0-mlogloss:0.535203
[325]	validation_0-mlogloss:0.535747
Stopping. Best iteration:
[276]	validation_0-mlogloss:0.534806



XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [None]:
# [271]	validation_0-mlogloss:0.533953

In [73]:
# [1030]	validation_0-mlogloss:0.534277    394

# [915]	validation_0-mlogloss:0.53396  396

# [893]	validation_0-mlogloss:0.533652 409

In [34]:
train_X_0322.to_csv(data_path + 'train_BM_0331.csv',index=False)
test_X_0322.to_csv(data_path + 'test_BM_0331.csv',index=False)

In [36]:
pred_y = rgr.predict_proba(test_X_0322, ntree_limit = rgr.best_iteration)
pred_y

array([[  4.65621829e-01,   4.78014857e-01,   5.63633256e-02],
       [  9.69607055e-01,   1.60864182e-02,   1.43064726e-02],
       [  9.32496428e-01,   6.15736768e-02,   5.92987984e-03],
       ..., 
       [  9.82476592e-01,   1.69446655e-02,   5.78705280e-04],
       [  9.67458904e-01,   3.18887234e-02,   6.52391405e-04],
       [  6.58273041e-01,   3.22936296e-01,   1.87906120e-02]], dtype=float32)

In [None]:
test_X_0322.isnull().values.any()