# Logistic Regression

## Train set

In [59]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split

In [60]:
all_df = pd.read_json('train.json')
# all_df = pd.read_csv('train_feats.csv')

In [61]:
all_df.shape

(49352, 15)

In [62]:
x_train, x_val, y_train, y_val = train_test_split(all_df.drop(['interest_level'], 1),all_df[['interest_level']], test_size=0.2, random_state=42)


In [63]:
for col in ['interest_level']:
    y_train[col] = y_train[col].astype('category')
    y_val[col] = y_val[col].astype('category')

In [64]:
y_train['interest_level'].head()

2683      low
38369     low
118394    low
117210    low
70734     low
Name: interest_level, dtype: category
Categories (3, object): [high, low, medium]

In [65]:
all_df.head(2)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue


In [66]:
list(x_train)

[u'bathrooms',
 u'bedrooms',
 u'building_id',
 u'created',
 u'description',
 u'display_address',
 u'features',
 u'latitude',
 u'listing_id',
 u'longitude',
 u'manager_id',
 u'photos',
 u'price',
 u'street_address']

In [67]:
x_train_small = x_train[[u'bathrooms',
 u'bedrooms',
#  u'building_id',
#  u'created',
#  u'description', --- look for specific words
#  u'display_address',
#  u'features', --- make dummy variables
 u'latitude',
#  u'listing_id',
 u'longitude',
#  u'manager_id',
#  u'photos', --- number of photos
 u'price']]
#  u'street_address']]

x_val_small = x_val[[u'bathrooms',
 u'bedrooms',
 u'latitude',
 u'longitude',
 u'price']]

In [68]:
x_train_small.head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price
2683,1.0,0,40.7538,-73.974,2500
38369,1.0,0,40.7048,-74.0102,3800
118394,1.0,3,40.7062,-74.0099,3780
117210,1.0,1,40.7059,-73.8342,1800
70734,1.0,1,40.7281,-73.8603,1850


In [69]:
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(x_train_small, y_train)
# model
# check the accuracy on the training set
# model.score(y, y)

  y = column_or_1d(y, warn=True)


In [70]:
predicted_train = pd.DataFrame(model.predict_proba(x_train_small))
# predicted = model.predict_proba(x)
predicted_train.columns = ['high', 'low', 'medium']
predicted_train.head()
# predicted

Unnamed: 0,high,low,medium
0,0.06025,0.754195,0.185555
1,0.018893,0.86685,0.114257
2,0.090486,0.599993,0.309522
3,0.16785,0.524744,0.307406
4,0.16204,0.534153,0.303806


In [71]:
log_loss_train = log_loss(y_train, predicted_train.as_matrix())
log_loss_train

0.73111773009780701

In [72]:
predicted_val = pd.DataFrame(model.predict_proba(x_val_small))
# predicted = model.predict_proba(x)
predicted_val.columns = ['high', 'low', 'medium']
predicted_val.head()
# predicted

Unnamed: 0,high,low,medium
0,0.088961,0.603429,0.307611
1,0.098741,0.646591,0.254668
2,0.028385,0.835822,0.135793
3,0.04245,0.796842,0.160707
4,0.171724,0.475974,0.352302


In [73]:
log_loss_val = log_loss(y_val, predicted_val.as_matrix())
log_loss_val

0.73256197734220618

## Competition Test Set

In [74]:
test_df = pd.read_json('test.json')

In [75]:
test_df.head(2)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street
1,1.0,2,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street


In [76]:
test_df_small = test_df[[u'bathrooms',
 u'bedrooms',
 u'latitude',
 u'longitude',
 u'price']]

In [79]:
pred_x = pd.DataFrame(model.predict_proba(test_df_small))
pred_x.columns = ['high', 'low', 'medium']
pred_x.head()

Unnamed: 0,high,low,medium
0,0.067766,0.711921,0.220313
1,0.119209,0.575498,0.305293
2,0.033221,0.801573,0.165205
3,0.082822,0.649276,0.267902
4,0.021856,0.814888,0.163255


In [81]:
subm = pd.merge(test_df[['listing_id']].reset_index(), pred_x.reset_index(), left_index=True, right_index=True)

In [82]:
subm = subm[['listing_id', 'high', 'medium', 'low']]

In [83]:
subm.shape

(74659, 4)

In [None]:
subm.to_csv('Submission_Logistic_regression.csv', index=None)

In [None]:
model