# SVM + 'features'

## Train set

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import chi2
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
import cPickle
import warnings
from sklearn.svm import SVC
# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

# Let rendered DataFrames show up to 160 columns before truncating.
pd.set_option('display.max_columns', 160)

In [2]:
# Read the training table from CSV; the JSON loader is kept commented out for reference.
# all_df = pd.read_json('train.json')
all_df = pd.read_csv(filepath_or_buffer='train_feats_max.csv')

In [3]:
# Display (rows, columns) of the loaded training table.
all_df.shape

(49352, 152)

In [4]:
# all_df.head(2)

In [5]:
# Hold out 20% of the rows for validation. The fixed seed keeps the split
# repeatable. The target stays a one-column DataFrame named 'interest_level'.
x_train, x_val, y_train, y_val = train_test_split(
    all_df.drop(['interest_level'], axis=1),
    all_df[['interest_level']],
    test_size=0.2,
    random_state=42
)


In [6]:
# Load the column names that should be treated as categorical.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open('cat_feats.p', 'rb') as cat_feats_file:
    cat_feats = cPickle.load(cat_feats_file)

# Cast the target column to pandas' category dtype in both splits.
for col in ['interest_level']:
    y_train[col] = y_train[col].astype('category')
    y_val[col] = y_val[col].astype('category')

# Cast every listed feature column to category dtype in both splits.
for col in cat_feats:
    x_train[col] = x_train[col].astype('category')
    x_val[col] = x_val[col].astype('category')

In [7]:
# Columns removed from both splits before modelling.
drop_list = [u'features', u'listing_id', 'index']

x_train_small = x_train.drop(drop_list, axis=1)
x_val_small = x_val.drop(drop_list, axis=1)

In [8]:
# Load the pre-selected column list and restrict both splits to those columns.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open('low_pvalues_cols.p', 'rb') as low_pvalues_file:
    low_pvalues_cols = cPickle.load(low_pvalues_file)

x_train_best = x_train_small[low_pvalues_cols]
x_val_best = x_val_small[low_pvalues_cols]

# Display the shape of the reduced training matrix.
x_train_best.shape
# x_train_best.head(2)

(39481, 87)

In [9]:
# C_range = np.logspace(-2, 10, 13)
# gamma_range = np.logspace(-9, 3, 13)
# param_grid = dict(gamma=gamma_range, C=C_range)
# cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
# grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
# grid.fit(x_train_best[:10], y_train[:10])

# print("The best parameters are %s with a score of %0.2f"
#       % (grid.best_params_, grid.best_score_))

# # Now we need to fit a classifier for all parameters in the 2d version
# # (we use a smaller set of parameters here because it takes a while to train)

# C_2d_range = [1e-2, 1, 1e2]
# gamma_2d_range = [1e-1, 1, 1e1]
# classifiers = []
# for C in C_2d_range:
#     for gamma in gamma_2d_range:
#         clf = SVC(C=C, gamma=gamma)
#         clf.fit(X_2d, y_2d)
#         classifiers.append((C, gamma, clf))

In [None]:
# Polynomial-kernel SVC with probability estimates enabled.
# random_state is fixed (same seed as the train/validation split) so the
# probability estimates are repeatable between runs.
# NOTE(review): the next cell rebinds `model` to a default-kernel SVC, so on a
# top-to-bottom run this fit is discarded -- confirm which model is intended.
model = SVC(kernel='poly', probability=True, random_state=42)

# Pass the target as a 1-D Series rather than a one-column DataFrame.
model = model.fit(x_train_best, y_train['interest_level'])

In [9]:
# Fit an SVC (default kernel) with probability estimates enabled.
# random_state is fixed (same seed as the train/validation split) so the
# probability estimates are repeatable between runs.
# NOTE(review): the validation probabilities recorded further down are nearly
# identical for every row -- it may be worth checking feature scaling and C/gamma.
model = SVC(probability=True, random_state=42)

# Pass the target as a 1-D Series rather than a one-column DataFrame.
model = model.fit(x_train_best, y_train['interest_level'])

In [None]:
# Class probabilities for the training rows.
# Columns are labelled from model.classes_, which matches the column order of
# predict_proba, instead of a hard-coded list that assumes that order.
predicted_train = pd.DataFrame(
    model.predict_proba(x_train_best),
    columns=model.classes_,
)
predicted_train.head()

In [None]:
# Multi-class log-loss on the training split.
# .values replaces the deprecated DataFrame.as_matrix(); the target is passed
# as a 1-D Series rather than a one-column DataFrame.
log_loss_train = log_loss(y_train['interest_level'], predicted_train.values)
log_loss_train

In [12]:
# Class probabilities for the validation rows.
# Columns are labelled from model.classes_, which matches the column order of
# predict_proba, instead of a hard-coded list that assumes that order.
predicted_val = pd.DataFrame(
    model.predict_proba(x_val_best),
    columns=model.classes_,
)
predicted_val.head()

Unnamed: 0,high,low,medium
0,0.065388,0.722926,0.211686
1,0.065382,0.727425,0.207193
2,0.064455,0.72469,0.210855
3,0.064657,0.729989,0.205354
4,0.065325,0.717472,0.217203


In [13]:
# Multi-class log-loss on the validation split.
# .values replaces the deprecated DataFrame.as_matrix(); the target is passed
# as a 1-D Series rather than a one-column DataFrame.
log_loss_val = log_loss(y_val['interest_level'], predicted_val.values)
log_loss_val

0.75913813476378111

## Competition Test Set

In [165]:
# Read the competition test table from JSON.
test_df = pd.read_json(path_or_buf='test_feats.json')

In [166]:
# Preview the first two test rows.
test_df.head(n=2)

Unnamed: 0,bathrooms,bedrooms,building_id,cats allowed,created,description,dishwasher,display_address,dogs allowed,doorman,elevator,features,fitness center,hardwood floors,latitude,laundry in building,laundry in unit,listing_id,longitude,manager_id,no fee,photos,price,street_address
0,1.0,1,79780be1514f645d7e6be99a3de696c5,0,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,1,Suffolk Street,0,0,1,"[Elevator, Laundry in Building, Laundry in Uni...",0,1,40.7185,1,1,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,0,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street
1,1.0,2,0,1,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,0,Thompson Street,1,0,0,"[Pre-War, Dogs Allowed, Cats Allowed]",0,0,40.7278,0,0,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,0,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street


In [169]:
# Restrict the test table to the columns the model was trained on, in the same order.
x_test = test_df[x_train_best.columns.tolist()]

In [170]:
# Class probabilities for the competition test rows.
# Columns are labelled from model.classes_, which matches the column order of
# predict_proba, instead of a hard-coded list that assumes that order.
pred_x = pd.DataFrame(
    model.predict_proba(x_test),
    columns=model.classes_,
)
pred_x.head()

Unnamed: 0,high,low,medium
0,0.083731,0.555681,0.360588
1,0.094366,0.71715,0.188484
2,0.03298,0.826921,0.140099
3,0.090088,0.506235,0.403677
4,0.019506,0.772445,0.208049


In [173]:
# Pair each listing_id with its predicted probabilities by row position.
# Both frames get a fresh 0..n-1 index first, then are joined on that index.
subm = pd.merge(
    test_df[['listing_id']].reset_index(),
    pred_x.reset_index(),
    left_index=True,
    right_index=True
)

In [174]:
# Keep only the id and the three probability columns, in this order.
subm = subm.loc[:, ['listing_id', 'high', 'medium', 'low']]

In [176]:
# Display (rows, columns) of the submission table; the value recorded from a
# previous run is kept in the comment below for comparison.
subm.shape
# (74659, 4)

(74659, 4)

In [177]:
# Write the submission without the row index.
# The previous filename ('Submission_Logistic_regression+10feats.csv') was left
# over from a logistic-regression run; this notebook trains an SVC, so the file
# is named after the model that actually produced the predictions.
subm.to_csv('Submission_SVM+feats.csv', index=False)

In [178]:
# Display the fitted estimator and its hyper-parameters.
# NOTE(review): the output saved below this cell shows a LogisticRegression,
# while the cells above fit an SVC -- the saved output looks stale; re-run to refresh.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)