In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV,StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import random
from sklearn import preprocessing
import gc
from scipy.stats import skew, boxcox
from bayes_opt import BayesianOptimization
from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Random seed constant; none of the cells visible in this export read it.
seed = 2017

In [2]:
# Load the baseline feature matrices and the labels from CSV.
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BrandenMurray_MedianBedroom.csv')
test_X = pd.read_csv(data_path + 'test_BrandenMurray_MedianBedroom.csv')
# np.ravel flattens the labels frame into a 1-D array.
# NOTE: train_y is rebound by the pickle-loading cell further down.
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
ntrain = train_X.shape[0]
# A single-argument print(...) parses on both Python 2 and Python 3 and emits
# the same space-separated text the old `print a, b, c` statement produced.
print(' '.join(str(shape) for shape in (train_X.shape, test_X.shape, train_y.shape)))

(49352, 287) (74659, 287) (49352L,)


In [3]:
data_path = "../input/"
# Every artifact loaded below carries the same snapshot timestamp; keeping it
# in one place makes switching to another snapshot a one-line change.
snapshot_suffix = '_2017-03-05-22-40.pkl'


def _load_snapshot(stem):
    """Unpickle ``data_path + stem + snapshot_suffix`` and return the object.

    NOTE(review): unpickling can execute code embedded in the file, so this
    should only be pointed at artifacts from a trusted source.
    """
    return pd.read_pickle(data_path + stem + snapshot_suffix)


# Frames, labels and the list of feature names.
# NOTE: this rebinds train_y, which the CSV-loading cell also assigns.
train_df = _load_snapshot('train')
train_y = _load_snapshot('y')
test_df = _load_snapshot('test')
features_to_use = _load_snapshot('featurestouse')

# Train/test description and feature artifacts
# (presumably sparse matrices, judging by the names -- TODO confirm).
tr_desc_sparse = _load_snapshot('tr_desc_sparse')
tr_feat_sparse = _load_snapshot('tr_feat_sparse')
te_desc_sparse = _load_snapshot('te_desc_sparse')
te_feat_sparse = _load_snapshot('te_feat_sparse')

# Column-name artifacts that accompany the objects above.
desc_sparse_cols = _load_snapshot('desc_sparse_cols')
feat_sparse_cols = _load_snapshot('feat_sparse_cols')

In [7]:
# Display the object loaded from 'featurestouse_2017-03-05-22-40.pkl'.
features_to_use

['sc_price',
 'sc_ba_price',
 'sc_be_price',
 'sc_bathrooms',
 'sc_bedrooms',
 'sc_babe',
 'sc_longitude',
 'sc_latitude',
 'num_photos',
 'num_features',
 'created_month',
 'created_day',
 'created_hour',
 'created_weekday',
 'num_description_words',
 'num_description_sent',
 'compound',
 'neg',
 'neu',
 'pos',
 'display_address_lbl',
 'manager_id_lbl',
 'building_id_lbl',
 'street_address_lbl',
 'price_outlier',
 'bedrooms0',
 'bathrooms0',
 'bathrooms_outlier',
 'bedrooms_outlier',
 'latitude_outlier',
 'longitude_outlier',
 'street',
 'avenue',
 'east',
 'west',
 'north',
 'south',
 'other_address',
 'top_10_manager',
 'top_25_manager',
 'top_5_manager',
 'top_50_manager',
 'top_1_manager',
 'top_2_manager',
 'top_15_manager',
 'top_20_manager',
 'top_30_manager',
 'Zero_building_id',
 'top_10_building',
 'top_25_building',
 'top_5_building',
 'top_50_building',
 'top_1_building',
 'top_2_building',
 'top_15_building',
 'top_20_building',
 'top_30_building']

In [5]:
# Columns to copy across from train_df / test_df, grouped by theme.
# 'listing_id' goes last: it is the key the merge joins on.
features_to_add = (
    # listing metadata and creation-time parts
    ['num_features', 'created_month', 'created_day', 'created_hour', 'created_weekday']
    # description sentiment scores
    + ['compound', 'neg', 'neu', 'pos']
    # address flags
    + ['street', 'avenue', 'east', 'west', 'north', 'south', 'other_address']
    # manager ranking flags
    + ['top_10_manager', 'top_25_manager', 'top_5_manager', 'top_50_manager',
       'top_1_manager', 'top_2_manager', 'top_15_manager', 'top_20_manager',
       'top_30_manager']
    # building flags
    + ['Zero_building_id',
       'top_10_building', 'top_25_building', 'top_5_building', 'top_50_building',
       'top_1_building', 'top_2_building', 'top_15_building', 'top_20_building',
       'top_30_building']
    + ['listing_id']
)

In [6]:
# Attach the extra columns to the baseline matrices, matching on listing_id.
train_X_add = train_X.merge(train_df[features_to_add], on='listing_id', how='left')
test_X_add = test_X.merge(test_df[features_to_add], on='listing_id', how='left')

# A left join emits one output row per matching right-hand row, so a repeated
# listing_id on the right side would silently grow the result and the merged
# frame would no longer line up row-for-row with the original. Fail fast.
assert len(train_X_add) == len(train_X), 'train merge changed the row count'
assert len(test_X_add) == len(test_X), 'test merge changed the row count'

In [8]:
# Preview the first five rows of the baseline training matrix.
train_X.head(n=5)

Unnamed: 0,listing_id,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,street_address,...,desc_wordcount,pricePerBed,pricePerBath,pricePerRoom,bedPerBath,bedBathDiff,bedBathSum,bedsPerc,median_price_bed,ratio_bed
0,7170325,1.0,1,3387,281,40.7108,-73.9539,2104,2400.0,2947,...,553,2400.0,2400.0,1200.0,2400.0,0.0,2.0,0.5,2900.0,0.827586
1,7092344,1.0,2,4758,3434,40.7513,-73.9722,1964,3800.0,8215,...,827,1900.0,3800.0,1266.666667,3800.0,1.0,3.0,0.666667,3350.0,1.134328
2,7158677,1.0,2,5289,3457,40.7575,-73.9625,2627,3495.0,15314,...,799,1747.5,3495.0,1165.0,3495.0,1.0,3.0,0.666667,3350.0,1.043284
3,7211212,1.5,3,2104,4018,40.7145,-73.9425,1204,3000.0,21701,...,588,1000.0,2000.0,666.666667,2000.0,1.5,4.5,0.666667,4500.0,0.666667
4,7225292,1.0,0,4930,3411,40.7439,-73.9743,610,2795.0,13511,...,344,-1.0,2795.0,2795.0,2795.0,-1.0,1.0,0.0,2400.0,1.164583


In [9]:
# First nine added columns plus the join key, used to spot-check the merge.
tmp_feature = features_to_add[:9] + ['listing_id']
print(tmp_feature)

['num_features', 'created_month', 'created_day', 'created_hour', 'created_weekday', 'compound', 'neg', 'neu', 'pos', 'listing_id']


In [10]:
# Preview the spot-check columns for the first five merged test rows.
test_X_add.head()[tmp_feature]

Unnamed: 0,num_features,created_month,created_day,created_hour,created_weekday,compound,neg,neu,pos,listing_id
0,0.339694,1.193093,-0.505312,0.498371,1.159588,-1.569938,1.635868,0.045259,0.477879,7142618
1,-0.489211,1.193093,1.068638,0.739991,0.598349,-1.34078,-0.779948,0.993037,-1.653623,7210040
2,-0.489211,1.193093,0.221126,-1.716515,0.598349,-0.071426,-0.779948,0.208093,0.653913,7174566
3,-0.489211,1.193093,0.705419,0.498371,-1.085366,1.283644,-0.779948,-0.155152,1.375685,7191391
4,1.142222,1.193093,0.100053,0.937869,0.037111,0.878496,1.452939,0.004898,0.68164,7171695


In [11]:
# Look up one listing_id in the source frame so its values can be compared
# by eye with the same listing in the merged test frame.
test_df.loc[test_df.listing_id == 7174566, tmp_feature]

Unnamed: 0,num_features,created_month,created_day,created_hour,created_weekday,compound,neg,neu,pos,listing_id
21140,-0.489211,1.193093,0.221126,-1.716515,0.598349,-0.071426,-0.779948,0.208093,0.653913,7174566


In [12]:
# Persist the widened matrices next to the inputs; index=False keeps the
# DataFrame index out of the files.
train_X_add.to_csv(
    path_or_buf=data_path + 'train_BM_MB_add03052240.csv',
    index=False,
)
test_X_add.to_csv(
    path_or_buf=data_path + 'test_BM_MB_add03052240.csv',
    index=False,
)