In [1]:
import numpy as np
import pandas as pd

In [2]:
def load_train_data():
    train_file = 'train.json.zip'
    train = pd.read_json(train_file, orient='records', convert_dates=['created'])
    train.reset_index(drop=True, inplace=True)
    return train

def load_test_data():
    test_file = 'test.json.zip'
    test = pd.read_json(test_file, orient='records', convert_dates=['created'])
    test.reset_index(drop=True, inplace=True)
    return test

In [3]:
# load data
train = load_train_data()
test = load_train_data()

# remove outliers
# latitude
lower = np.percentile(train['latitude'].values, 1)
upper = np.percentile(train['latitude'].values, 99)
train = train[train['latitude'] > lower]
train = train[train['latitude'] < upper]
# train.latitude.hist()

# longitude
lower = np.percentile(train['longitude'].values, 1)
upper = np.percentile(train['longitude'].values, 99)
train = train[train['longitude'] > lower]
train = train[train['longitude'] < upper]
# train.longitude.hist()

# price
upper = np.percentile(train['price'].values, 99)
train = train[train['price'] < upper]
# train.price.hist()

train.reset_index(drop=True, inplace=True)

In [5]:
# Adding feature manager id
manager_id_df = train.groupby(['manager_id']).agg({'building_id': 'count'}).reset_index()
manager_number = manager_id_df['building_id'].shape[0]
manager_id_df['building_id'] = np.linspace(1, manager_number, num=manager_number, dtype=int)

train = train.merge(manager_id_df, how='left', on='manager_id')
del train['building_id_x']
train = train.rename(columns={'building_id_y': 'manager_number'})

# using external dataset
subway = pd.read_csv('NYC_Transit_Subway_Entrance_And_Exit_Data.csv')
subway = subway[['Station Name', 'Station Latitude', 'Station Longitude']]
subway = subway.groupby(['Station Name']).mean().reset_index(drop=True)

from math import cos, asin, sqrt
# https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula/21623206
def distance_pair(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295     #Pi/180
    a = 0.5 - cos((lat2 - lat1) * p)/2 +  cos(lat1 * p) * cos(lat2 * p) * (1 - cos((lon2 - lon1) * p)) / 2
    d_2_point = 6371 *2 * asin(sqrt(a)) #2*R*asin...
    return d_2_point
distance_pairs = np.vectorize(distance_pair)

def get_nearby_subway(location):
    distances = distance_pairs(location[0], location[1], subway['Station Latitude'], subway['Station Longitude'])    
    return distances[distances < 1].shape[0]
    
def get_subway_distance(location):
    distances = distance_pairs(location[0], location[1], subway['Station Latitude'], subway['Station Longitude'])    
    return min(distances)

train['feature_number'] = train['features'].apply(len)
train['room_number'] = train['bedrooms'] + train['bathrooms']
train['photo_number'] = train['photos'].apply(len)
train['location'] = train[['latitude', 'longitude']].values.tolist()
train['nearby_subway'] = train['location'].apply(get_nearby_subway)
train['subway_distance'] = train['location'].apply(get_subway_distance)
train['target'] = train['interest_level'].apply(lambda x: 0 if x=='low' else 1 if x=='medium' else 2)


In [14]:
X_features = ['feature_number', 'room_number', 'photo_number', 'nearby_subway', 'subway_distance', 'price']
# X_features = ['price']
X = train[X_features]
y = train['target']

In [26]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

sgd = SGDClassifier()

scores = cross_val_score(sgd, X, y, cv=5)
print("Accuracy: %f (+/- %f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.541455 (+/- 0.398324)


In [21]:
# from lightgbm import LGBMRegressor
# from sklearn.model_selection import cross_val_score

# lgbm = LGBMRegressor()

# scores = cross_val_score(lgbm, X, y, cv=5)
# print("Accuracy: %f (+/- %f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.075794 (+/- 0.008466)


In [30]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

gb = GradientBoostingClassifier(max_depth=10,
                                max_features=5,
                                subsample=0.5)

scores = cross_val_score(gb, X, y, cv=5)
print("Accuracy: %f (+/- %f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.696558 (+/- 0.001151)
