In [2]:
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import sparse
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
seed = 1234  # shared random seed for reproducibility; the train/validation split below uses the same value


# Load Data

In [3]:
# Directory holding the pre-engineered feature tables and the label file.
data_path = "../input/"
train_X, test_X = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train_BM_MB_add03052240.csv', 'test_BM_MB_add03052240.csv')
)
# Flatten the label table into a 1-D array.
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
# Listing ids of the test rows, kept aside for the submission file.
sub_id = test_X['listing_id'].astype('int32').values
print('%s %s %s' % (train_X.shape, test_X.shape, train_y.shape))

(49352, 322) (74659, 322) (49352L,)


In [5]:
# Start from every column of the training table, then drop the raw columns
# that are left out of the selected feature set.
feat_col = list(train_X.columns.values)
for excluded in ('bathrooms', 'bedrooms', 'latitude', 'longitude', 'price',
                 'top_50_building', 'top_50_manager'):
    # list.remove raises ValueError if the column is not present.
    feat_col.remove(excluded)

['listing_id',
 'building_id',
 'display_address',
 'manager_id',
 'street_address',
 'feature_1_month_free',
 'feature_24/7_concierge',
 'feature_24/7_doorman',
 'feature_24/7_doorman_concierge',
 'feature_actual_apt._photos',
 'feature_air_conditioning',
 'feature_all_pets_ok',
 'feature_all_utilities_included',
 'feature_assigned-parking-space',
 'feature_attended_lobby',
 'feature_backyard',
 'feature_balcony',
 'feature_basement_storage',
 'feature_basketball_court',
 'feature_bike_room',
 'feature_bike_storage',
 'feature_billiards_room',
 'feature_billiards_table_and_wet_bar',
 'feature_brand_new',
 'feature_breakfast_bar',
 'feature_bright',
 'feature_brownstone',
 'feature_building-common-outdoor-space',
 'feature_business_center',
 'feature_cable/satellite_tv',
 'feature_cable_ready',
 'feature_call/text_abraham_caro_@_917-373-0862',
 'feature_cats_allowed',
 'feature_central_a/c',
 'feature_central_ac',
 'feature_central_air',
 'feature_chefs_kitchen',
 "feature_children's_p

In [6]:
# Restrict both tables to the selected feature columns.
train_X_new, test_X_new = train_X[feat_col], test_X[feat_col]
print(train_X_new.shape)
print(test_X_new.shape)

(49352, 315)
(74659, 315)


In [7]:
def basic_preprocess(df_train, df_test, n_min=50, precision=3):
    
    # Interest: Numerical encoding of interest level
    df_train['y'] = 0.0
    df_train.loc[df_train.interest_level=='medium', 'y'] = 1.0
    df_train.loc[df_train.interest_level=='high', 'y'] = 2.0
    
    # Location features: Latitude, longitude
    df_train['num_latitude'] = df_train.latitude.values
    df_test['num_latitude'] = df_test.latitude.values
    df_train['num_longitude'] = df_train.longitude.values
    df_test['num_longitude'] = df_test.longitude.values
    x = np.sqrt(((df_train.latitude - df_train.latitude.median())**2) + (df_train.longitude - df_train.longitude.median())**2)
    df_train['num_dist_from_center'] = x.values
    x = np.sqrt(((df_test.latitude - df_train.latitude.median())**2) + (df_test.longitude - df_train.longitude.median())**2)
    df_test['num_dist_from_center'] = x.values
    df_train['position'] = df_train.longitude.round(precision).astype(str) + '_' + df_train.latitude.round(precision).astype(str)
    df_test['position'] = df_test.longitude.round(precision).astype(str) + '_' + df_test.latitude.round(precision).astype(str)
    
    # Degree of "outlierness"
    OutlierAggregated = (df_train.bedrooms > 4).astype(float)
    OutlierAggregated2 = (df_test.bedrooms > 4).astype(float)
    OutlierAggregated += (df_train.bathrooms > 3).astype(float)
    OutlierAggregated2 += (df_test.bathrooms > 3).astype(float)
    OutlierAggregated += (df_train.bathrooms < 1).astype(float)
    OutlierAggregated2 += (df_test.bathrooms < 1).astype(float)
    x = np.abs((df_train.price - df_train.price.median())/df_train.price.std()) > 0.30
    OutlierAggregated += x.astype(float)
    x2 = np.abs((df_test.price - df_train.price.median())/df_train.price.std()) > 0.30
    OutlierAggregated2 += x2.astype(float)
    x = np.log1p(df_train.price/(df_train.bedrooms.clip(1,3) + df_train.bathrooms.clip(1,2))) > 8.2
    OutlierAggregated += x.astype(float)
    x2 = np.log1p(df_test.price/(df_test.bedrooms.clip(1,3) + df_test.bathrooms.clip(1,2))) > 8.2
    OutlierAggregated2 += x2.astype(float)
    x = np.sqrt(((df_train.latitude - df_train.latitude.median())**2) + (df_train.longitude - df_train.longitude.median())**2) > 0.30
    OutlierAggregated += x.astype(float)
    x2 = np.sqrt(((df_test.latitude - df_train.latitude.median())**2) + (df_test.longitude - df_train.longitude.median())**2) > 0.30
    OutlierAggregated2 += x2.astype(float)
    df_train['num_OutlierAggregated'] = OutlierAggregated.values
    df_test['num_OutlierAggregated'] = OutlierAggregated2.values
    
    # Average interest in unique locations at given precision
    x = df_train.groupby('position')['y'].aggregate(['count', 'mean'])
    d = x.loc[x['count'] >= n_min, 'mean'].to_dict()
    impute = df_train.y.mean()
    df_train['num_pos'] = df_train.position.apply(lambda x: d.get(x, impute))
    df_test['num_pos'] = df_test.position.apply(lambda x: d.get(x, impute))
    
    # Density in unique locations at given precision
    vals = df_train['position'].value_counts()
    dvals = vals.to_dict()
    df_train['num_pos_density'] = df_train['position'].apply(lambda x: dvals.get(x, vals.min()))
    df_test['num_pos_density'] = df_test['position'].apply(lambda x: dvals.get(x, vals.min()))

    # Building null
    df_train['num_building_null'] = (df_train.building_id=='0').astype(float)
    df_test['num_building_null'] = (df_test.building_id=='0').astype(float)
    
    # Building supervised
    x = df_train.groupby('building_id')['y'].aggregate(['count', 'mean'])
    d = x.loc[x['count'] >= n_min, 'mean'].to_dict()
    impute = df_train.y.mean()
    df_train['num_building_id'] = df_train.building_id.apply(lambda x: d.get(x, impute))
    df_test['num_building_id'] = df_test.building_id.apply(lambda x: d.get(x, impute))
    
    # Building frequency
    d = np.log1p(df_train.building_id.value_counts()).to_dict()
    impute = np.min(np.array(list(d.values())))
    df_train['num_fbuilding'] = df_train.building_id.apply(lambda x: d.get(x, impute))
    df_test['num_fbuilding'] = df_test.building_id.apply(lambda x: d.get(x, impute))
    
    # Manager supervised
    x = df_train.groupby('manager_id')['y'].aggregate(['count', 'mean'])
    d = x.loc[x['count'] >= n_min, 'mean'].to_dict()
    impute = df_train.y.mean()
    df_train['num_manager'] = df_train.manager_id.apply(lambda x: d.get(x, impute))
    df_test['num_manager'] = df_test.manager_id.apply(lambda x: d.get(x, impute))

    # Manager frequency
    d = np.log1p(df_train.manager_id.value_counts()).to_dict()
    impute = np.min(np.array(list(d.values())))
    df_train['num_fmanager'] = df_train.manager_id.apply(lambda x: d.get(x, impute))
    df_test['num_fmanager'] = df_test.manager_id.apply(lambda x: d.get(x, impute))
    
#     # Creation time features
#     df_train['created'] = pd.to_datetime(df_train.created)
#     df_train['num_created_weekday'] = df_train.created.dt.dayofweek.astype(float)
#     df_train['num_created_weekofyear'] = df_train.created.dt.weekofyear
#     df_test['created'] = pd.to_datetime(df_test.created)
#     df_test['num_created_weekday'] = df_test.created.dt.dayofweek
#     df_test['num_created_weekofyear'] = df_test.created.dt.weekofyear
    
    # Bedrooms/Bathrooms/Price
    df_train['num_bathrooms'] = df_train.bathrooms.clip_upper(4)
    df_test['num_bathrooms'] = df_test.bathrooms.clip_upper(4)
    df_train['num_bedrooms'] = df_train.bedrooms.clip_upper(5)
    df_test['num_bedrooms'] = df_test.bedrooms.clip_upper(5)
    df_train['num_price'] = df_train.price.clip_upper(10000)
    df_test['num_price'] = df_test.price.clip_upper(10000)
    bins = df_train.price.quantile(np.arange(0.05, 1, 0.05))
    df_train['num_price_q'] = np.digitize(df_train.price, bins)
    df_test['num_price_q'] = np.digitize(df_test.price, bins)
    
    # Composite features based on: 
    # https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/a-proxy-for-sqft-and-the-interest-on-1-2-baths
    df_train['num_priceXroom'] = (df_train.price / (1 + df_train.bedrooms.clip(1, 4) + 0.5*df_train.bathrooms.clip(0, 2))).values
    df_test['num_priceXroom'] = (df_test.price / (1 + df_test.bedrooms.clip(1, 4) + 0.5*df_test.bathrooms.clip(0, 2))).values
    df_train['num_even_bathrooms'] = ((np.round(df_train.bathrooms) - df_train.bathrooms)==0).astype(float)
    df_test['num_even_bathrooms'] = ((np.round(df_test.bathrooms) - df_test.bathrooms)==0).astype(float)
    
    # Other features
#     df_train['num_features'] = df_train.features.apply(lambda x: len(x))
#     df_test['num_features'] = df_test.features.apply(lambda x: len(x))
#     df_train['num_photos'] = df_train.photos.apply(lambda x: len(x))
#     df_test['num_photos'] = df_test.photos.apply(lambda x: len(x))
#     df_train['num_desc_length'] = df_train.description.str.split(' ').str.len()
#     df_test['num_desc_length'] = df_test.description.str.split(' ').str.len()
    df_train['num_desc_length_null'] = (df_train.description.str.len()==0).astype(float)
    df_test['num_desc_length_null'] = (df_test.description.str.len()==0).astype(float)
    
#     # Features/Description Features
#     bows = {'nofee': ['no fee', 'no-fee', 'no  fee', 'nofee', 'no_fee'],
#             'lowfee': ['reduced_fee', 'low_fee','reduced fee', 'low fee'],
#             'furnished': ['furnished'],
#             'parquet': ['parquet', 'hardwood'],
#             'concierge': ['concierge', 'doorman', 'housekeep','in_super'],
#             'prewar': ['prewar', 'pre_war', 'pre war', 'pre-war'],
#             'laundry': ['laundry', 'lndry'],
#             'health': ['health', 'gym', 'fitness', 'training'],
#             'transport': ['train', 'subway', 'transport'],
#             'parking': ['parking'],
#             'utilities': ['utilities', 'heat water', 'water included'],
#             'cats':['cats allowed','pets allowed','all pets ok','pet friendly','small dogs'],
#             'dogs':['dogs allowed','pets allowed','all pets ok','pet friendly','small dogs'],
#             'cable':['cable ready', 'satellite tv'],
#             'centerAC' : ['central a','central ac','central air','central heat'],
#             'child': ['childrens playroom','s playroom'],
#             'wifi' : ['free wifi'],
#             'pool' : ['pool'],
#             'luxury': ['luxury'],
#             'marble': ['marble'],
#             'newfeature' : ['new construction','new kitchen','newly renovated'],
#             'onemounthfree': ['one month fee','one month free','one month free rent','1 month free'],
#             'share': ['share ok','shares ok'],
#             'stainless': ['stainless steel','stainless steel appliances','stainless steel kitchen','s appliances','ss appliances'],
#             'valet' :['valet'],
#             'washer' :['washer','dryer']
#           }
#     for fname, bow in bows.items():
#         x1 = df_train.description.str.lower().apply(lambda x: np.sum([1 for i in bow if i in x]))
#         x2 = df_train.features.apply(lambda x: np.sum([1 for i in bow if i in ' '.join(x).lower()]))
#         df_train['num_'+fname] = ((x1 + x2) > 0).astype(float).values
#         x1 = df_test.description.str.lower().apply(lambda x: np.sum([1 for i in bow if i in x]))
#         x2 = df_test.features.apply(lambda x: np.sum([1 for i in bow if i in ' '.join(x).lower()]))
#         df_test['num_'+fname] = ((x1 + x2) > 0).astype(float).values

    return df_train, df_test

In [27]:
# Raw listing records; `created` is converted to a datetime column in both frames.
df = pd.read_json('../input/train.json')
df_test = pd.read_json('../input/test.json')
for listings in (df, df_test):
    listings['created'] = pd.to_datetime(listings.created)

In [49]:
# 80/20 train/validation split. Uses the notebook-level `seed` constant rather than
# repeating the literal, so the split stays in sync with the configured seed.
X_train, X_val, y_train, y_val = train_test_split(train_X_new, train_y,
                                                  train_size=.80, random_state=seed)

In [66]:
# Look up the raw listing record for every row of each fold (left join on listing_id).
df_X_train, df_X_val = (
    fold[['listing_id']].merge(df, on='listing_id', how='left')
    for fold in (X_train, X_val)
)
print('%s %s' % (df_X_train.shape, df_X_val.shape))

(39481, 15) (9871, 15)


In [68]:
# Add the engineered columns to both folds; the training fold plays the
# `df_train` role and the validation fold the `df_test` role.
df_X_train, df_X_val = basic_preprocess(df_train=df_X_train, df_test=df_X_val,
                                        n_min=15, precision=3)
print('%s %s' % (df_X_train.shape, df_X_val.shape))

(39481, 35) (9871, 34)


In [69]:
# Fit the location clusters on a copy of the training fold from which
# coordinate outliers have been removed.
dftemp = df_X_train.copy()
for coord in ['latitude', 'longitude']:
    # Repeatedly blank out values lying more than 3 standard deviations from
    # the median; median and std are recomputed on every pass until no value
    # is flagged any more.
    while True:
        centre = dftemp[coord].median()
        too_far = abs(dftemp[coord] - centre) > 3*dftemp[coord].std()
        if too_far.sum() == 0:
            break
        dftemp.loc[too_far, coord] = np.nan
# Keep only the rows where both coordinates survived.
dftemp = dftemp.dropna(subset=['latitude', 'longitude'])

# Each coordinate is standardised independently before clustering.
dfm = DataFrameMapper([(['latitude'], [StandardScaler()]), (['longitude'], [StandardScaler()])])

# Cluster labels at several resolutions, stored as string columns `location_<k>`.
for n_clusters in [5, 10, 20, 40]:
    pipe_location = make_pipeline(dfm, KMeans(n_clusters=n_clusters, random_state=1))
    pipe_location.fit(dftemp)
    label_col = 'location_' + str(n_clusters)
    df_X_train[label_col] = pipe_location.predict(df_X_train).astype(str)
    df_X_val[label_col] = pipe_location.predict(df_X_val).astype(str)

# One-hot encode the 10-cluster labels only; the indicator columns follow the
# order in which labels first appear in the training fold. The 5/20/40-cluster
# labels are kept as string columns without indicator columns.
for label in df_X_train.location_10.unique():
    indicator_col = 'num_location_10_' + str(label)
    df_X_train[indicator_col] = (df_X_train.location_10 == label).astype(float)
    df_X_val[indicator_col] = (df_X_val.location_10 == label).astype(float)

print('%s %s' % (df_X_train.shape, df_X_val.shape))

(39481, 49) (9871, 48)


In [10]:


# # Get relevant features
# df, df_test = basic_preprocess(df, df_test, n_min=15, precision=3)
# feats = [i for i in df.columns.values if i.startswith('num_')]
# x_train = df[feats].values
# x_test = df_test[feats].values
# print(x_train.shape, x_test.shape)



((49352L, 15L), (74659L, 15L))


In [73]:
# Engineered columns (prefix `num_`) plus the join key.
feats = [col for col in df_X_train.columns.values if col.startswith('num_')] + ['listing_id']
x_train, x_val = df_X_train[feats], df_X_val[feats]
print(x_train.shape)
print(x_val.shape)

(39481, 29)
(9871, 29)


In [74]:
# Display the selected engineered feature names (plus the `listing_id` join key).
feats

['num_latitude',
 'num_longitude',
 'num_dist_from_center',
 'num_OutlierAggregated',
 'num_pos',
 'num_pos_density',
 'num_building_null',
 'num_building_id',
 'num_fbuilding',
 'num_manager',
 'num_fmanager',
 'num_bathrooms',
 'num_bedrooms',
 'num_price',
 'num_price_q',
 'num_priceXroom',
 'num_even_bathrooms',
 'num_desc_length_null',
 'num_location_10_1',
 'num_location_10_0',
 'num_location_10_9',
 'num_location_10_3',
 'num_location_10_4',
 'num_location_10_2',
 'num_location_10_6',
 'num_location_10_5',
 'num_location_10_8',
 'num_location_10_7',
 'listing_id']

In [75]:
# Append the engineered columns to each fold via a left join on listing_id.
# Note that `train_X_new` is rebound here: from this point on it holds the
# enriched training fold, not the full column-subset table.
train_X_new = pd.merge(X_train, x_train, on='listing_id', how='left')
val_X_new = pd.merge(X_val, x_val, on='listing_id', how='left')
print(train_X_new.shape)
print(val_X_new.shape)

(39481, 343)
(9871, 343)


In [76]:
# Display the column names of the enriched training fold.
train_X_new.columns.values

array(['listing_id', 'building_id', 'display_address', 'manager_id',
       'street_address', 'feature_1_month_free', 'feature_24/7_concierge',
       'feature_24/7_doorman', 'feature_24/7_doorman_concierge',
       'feature_actual_apt._photos', 'feature_air_conditioning',
       'feature_all_pets_ok', 'feature_all_utilities_included',
       'feature_assigned-parking-space', 'feature_attended_lobby',
       'feature_backyard', 'feature_balcony', 'feature_basement_storage',
       'feature_basketball_court', 'feature_bike_room',
       'feature_bike_storage', 'feature_billiards_room',
       'feature_billiards_table_and_wet_bar', 'feature_brand_new',
       'feature_breakfast_bar', 'feature_bright', 'feature_brownstone',
       'feature_building-common-outdoor-space', 'feature_business_center',
       'feature_cable/satellite_tv', 'feature_cable_ready',
       'feature_call/text_abraham_caro_@_917-373-0862',
       'feature_cats_allowed', 'feature_central_a/c', 'feature_central_ac',
  

In [77]:
import xgboost as xgb

# Multi-class booster trained on the enriched training fold. The large
# n_estimators is only an upper bound: training stops once the validation
# mlogloss has not improved for 50 rounds.
xgb_params = dict(
    objective='multi:softprob',
    learning_rate=0.1,
    n_estimators=10000,
    nthread=-1,
)
fit_options = dict(
    eval_set=[(val_X_new, y_val)],
    eval_metric='mlogloss',
    early_stopping_rounds=50,
    verbose=25,  # log the validation metric every 25 rounds
)

rgr = xgb.XGBClassifier(**xgb_params)
rgr.fit(train_X_new, y_train, **fit_options)

[0]	validation_0-mlogloss:1.0371
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[25]	validation_0-mlogloss:0.639744
[50]	validation_0-mlogloss:0.599818
[75]	validation_0-mlogloss:0.584143
[100]	validation_0-mlogloss:0.574409
[125]	validation_0-mlogloss:0.567671
[150]	validation_0-mlogloss:0.562274
[175]	validation_0-mlogloss:0.558453
[200]	validation_0-mlogloss:0.555646
[225]	validation_0-mlogloss:0.553024
[250]	validation_0-mlogloss:0.551416
[275]	validation_0-mlogloss:0.549784
[300]	validation_0-mlogloss:0.548915
[325]	validation_0-mlogloss:0.547888
[350]	validation_0-mlogloss:0.547228
[375]	validation_0-mlogloss:0.546295
[400]	validation_0-mlogloss:0.545615
[425]	validation_0-mlogloss:0.545164
[450]	validation_0-mlogloss:0.544549
[475]	validation_0-mlogloss:0.543808
[500]	validation_0-mlogloss:0.543535
[525]	validation_0-mlogloss:0.543324
[550]	validation_0-mlogloss:0.542826
[575]	validation_0-mlogloss:0.542808
[600]	validation_0-mlogloss:0.542646
[625]	validat

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [None]:
# [771]	validation_0-mlogloss:0.53567

In [19]:
import xgbfir
# In notebook order `rgr` was last fitted on `train_X_new`, so the feature names
# must come from that frame; `X_train` has fewer columns than the model saw.
xgbfir.saveXgbFI(rgr, feature_names=train_X_new.columns, OutputXlsxFile = '../FE/FI.xlsx')

In [21]:
# Display the shape of the test feature table.
test_X_new.shape

(74659, 340)

In [22]:
# Python 2 only (`reload` is a builtin there): switch the interpreter's default
# string encoding to UTF-8. The statement order below matters.
import sys  
# Remember the current standard streams before reloading the module.
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)  
# Put the saved streams back — presumably because reloading `sys` resets them,
# which would detach the notebook's output; confirm before changing.
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
sys.setdefaultencoding('utf8')

In [23]:
# `best_iteration` is the 0-based index of the best boosting round, so passing it
# as `ntree_limit` would leave out the best round itself; `best_ntree_limit` is
# the matching tree count recorded by early stopping.
# NOTE(review): `test_X_new` must carry the same columns as the frame `rgr` was
# fitted on — confirm before relying on these predictions.
pred_y = rgr.predict_proba(test_X_new, ntree_limit=rgr.best_ntree_limit)

In [26]:
# True if the test feature table contains at least one missing value.
test_X_new.isnull().values.any()

False

In [27]:
# Timestamp (to the minute) embedded in the submission file name.
now = datetime.now()
sub_name = '../output/sub_xgb_0329_' + now.strftime("%Y-%m-%d-%H-%M") + '.csv'

# First three probability columns, labelled in the order low / medium / high.
# NOTE(review): assumes class indices 0, 1, 2 in the label file correspond to
# low, medium, high — verify against the label encoding.
out_df = pd.DataFrame(pred_y[:, :3], columns=["low", "medium", "high"])
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

In [129]:
import xgboost as xgb

# Same booster configuration, fitted on the fold WITHOUT the engineered columns
# (`X_train` / `X_val`). This rebinds `rgr`, replacing any previously fitted model.
xgb_params = dict(
    objective='multi:softprob',
    learning_rate=0.1,
    n_estimators=10000,
    nthread=-1,
)
fit_options = dict(
    eval_set=[(X_val, y_val)],
    eval_metric='mlogloss',
    early_stopping_rounds=50,
    verbose=25,  # log the validation metric every 25 rounds
)

rgr = xgb.XGBClassifier(**xgb_params)
rgr.fit(X_train, y_train, **fit_options)

[0]	validation_0-mlogloss:1.03575
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[25]	validation_0-mlogloss:0.628286
[50]	validation_0-mlogloss:0.587661
[75]	validation_0-mlogloss:0.573604
[100]	validation_0-mlogloss:0.565028
[125]	validation_0-mlogloss:0.559209
[150]	validation_0-mlogloss:0.555046
[175]	validation_0-mlogloss:0.552027
[200]	validation_0-mlogloss:0.549703
[225]	validation_0-mlogloss:0.547794
[250]	validation_0-mlogloss:0.546045
[275]	validation_0-mlogloss:0.544577
[300]	validation_0-mlogloss:0.543602
[325]	validation_0-mlogloss:0.542495
[350]	validation_0-mlogloss:0.541781
[375]	validation_0-mlogloss:0.54112
[400]	validation_0-mlogloss:0.540564
[425]	validation_0-mlogloss:0.540117
[450]	validation_0-mlogloss:0.539685
[475]	validation_0-mlogloss:0.539161
[500]	validation_0-mlogloss:0.538755
[525]	validation_0-mlogloss:0.538302
[550]	validation_0-mlogloss:0.537899
[575]	validation_0-mlogloss:0.537721
[600]	validation_0-mlogloss:0.537644
[625]	validat

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [53]:
# [1038]	validation_0-mlogloss:0.53579

In [14]:
# Column names of `train_X_pre` that do not contain the substring 'feature_'.
# NOTE(review): `train_X_pre` is not created by any visible cell — confirm where it comes from.
new_col = [name for name in train_X_pre.columns.values if not ('feature_' in name)]

In [18]:
# Keep only the non-'feature_' columns in both tables.
train_X_pre, test_X_pre = train_X_pre[new_col], test_X_pre[new_col]
print('%s %s' % (train_X_pre.shape, test_X_pre.shape))

(49352, 260) (74659, 260)


In [36]:
# Left-join the current feature tables onto the reduced `*_pre` tables by listing_id.
train_X_0322 = pd.merge(train_X_pre, train_X, on='listing_id', how='left')
test_X_0322 = pd.merge(test_X_pre, test_X, on='listing_id', how='left')

print('%s %s' % (train_X_0322.shape, test_X_0322.shape))

In [40]:
# Persist the merged tables next to the other input files (row index omitted).
for merged, csv_name in ((train_X_0322, 'train_BM_0322.csv'),
                         (test_X_0322, 'test_BM_0322.csv')):
    merged.to_csv(data_path + csv_name, index=False)