# Full Dataset: 

The first 50 are the primary variables including all of the dummy variables. The next 560 are interactions with the dummy variables. The final variables are the adjacency variables that aren't in the primary variable section. 

In [1]:
import numpy as np 
import pandas as pd
import re
from IPython.display import display, HTML

In [2]:
from IPython.display import display, Markdown
import bokeh

from bokeh.io import show, output_notebook, output_file
from bokeh.layouts import gridplot
from bokeh.charts import Bar
from bokeh.charts.attributes import cat
from bokeh.models import HoverTool
output_notebook ()

In [3]:
combined = pd.read_csv("new data/Final_Project_Variables - combined.csv")
print(combined.shape)

(124011, 825)


In [5]:
train_all = pd.read_csv("new data/Final_Project_Variables - train.csv")
print(train_all.shape)
train_all.head()

(49352, 825)


Unnamed: 0,observation,listing_id,interest_code,days_between_adj_5,log_price_adj_3,interest_adj_13,feature_count_adj_25,bathrooms_adj_4,descript_len_adj_11,bedrooms_adj_3,...,bathrooms_adj_21,bathrooms_adj_22,bathrooms_adj_23,bathrooms_adj_24,bathrooms_adj_25,bathrooms_adj_26,bathrooms_adj_27,bathrooms_adj_28,bathrooms_adj_29,bathrooms_adj_30
0,4,7170325,2,1318,23.485804,19,130,4.0,5398,4,...,21.0,22.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0
1,6,7092344,1,1173,24.941265,18,168,6.0,7174,7,...,23.0,24.0,26.0,27.0,28.0,29.0,30.0,31.0,33.0,34.0
2,9,7158677,2,1170,24.302001,21,127,4.0,6583,5,...,25.0,26.0,27.0,28.0,31.0,34.0,35.0,36.0,37.0,38.0
3,10,7211212,2,1189,23.65446,15,46,4.5,5388,5,...,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0
4,15,7225292,1,1207,25.326354,17,195,6.0,6230,5,...,24.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0


In [6]:
holdout = pd.read_csv("new data/Final_Project_Variables - test.csv")
print(holdout.shape)

(74659, 825)


In [None]:
#list(train_all.columns)

### Class Distribution: 

In [7]:
interest = train_all["interest_code"]
print("percentage of Lows: ", len(interest[interest== 1])/len(interest) * 100)
print("percentage of Mediums: ", len(interest[interest== 2])/len(interest) * 100)
print("percentage of Highs: ", len(interest[interest== 3])/len(interest) * 100)

percentage of Lows:  69.46830928837737
percentage of Mediums:  22.752877289674178
percentage of Highs:  7.778813421948453


### Data prep:

In [108]:
#convert from pandas to numpy arrays: 

mat_train = train_all.as_matrix()
X_train_all = mat_train[:,3:]
y_train_all = mat_train[:, 2]
mat_holdout = holdout.as_matrix()
X_holdout = mat_holdout[:,3:]

In [109]:
#Because MLP is sensitive to scaling, scale all the predictors in the train and test data:

from sklearn.preprocessing import StandardScaler  

scaler = StandardScaler()  
scaler.fit(X_train_all)  
X_train_all_scale = scaler.transform(X_train_all)  
X_holdout_scale = scaler.transform(X_holdout)  

In [141]:
# test/train split:   

from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X_train_all_scale, y_train_all)    # could also try stratified K_fold function

In [111]:
print(X_train.shape)
print(X_test.shape)
print(train_all.shape)

(37014, 822)
(12338, 822)
(49352, 825)


#### Baseline (Random) Classifier Performance:

In [112]:
per_y1 = len(y_train[y_train== 1])/len(y_train)
per_y2 = len(y_train[y_train== 2])/len(y_train)
per_y3 = len(y_train[y_train== 3])/len(y_train)

y1 = len(y_train[y_train== 1])
y2 = len(y_train[y_train== 2])
y3 = len(y_train[y_train== 3])

print("baseline log loss:")
- (1/X_train.shape[0])*(np.log(per_y1)*y1  + np.log(per_y2)*y2 + np.log(per_y3)*y3)


baseline log loss:


0.78391113651128785

# Multi Layer Perceptron Neural Net:

##### The advantages of Multi-layer Perceptron are:
- Capability to learn non-linear models.

##### The disadvantages of Multi-layer Perceptron are:
- MLP with hidden layers have a non-convex loss function where there exists more than one local minimum. Therefore different random weight initializations can lead to different validation accuracy.
- MLP requires tuning a number of hyperparameters such as the number of hidden neurons, layers, and iterations.
- MLP is sensitive to feature scaling.

##### How it works: 
- MLP trains using Backpropagation. 
- More precisely, it trains using some form of gradient descent and the gradients are calculated using Backpropagation. 
- For classification, it minimizes the Cross-Entropy loss function. 

##### About the parameters: 
- Alpha: 
 - (L2 regularization) term which helps in avoiding overfitting by penalizing weights with large magnitudes.
- Activation
 - Softmax:  Notice that no matter what values are plugged into predict_proba(), the output probability vector always sums up to 1. This can only be achieved by the Softmax activation function. Using an activation other that Softmax there is no guarantee that the sum of the activations in the final layer will be exactly one, specially for an unseen sample
- Solvers: 
  - Empirically, we observed that L-BFGS converges faster and with better solutions on small datasets. 
  - For relatively large datasets, however, Adam is very robust. It usually converges quickly and gives pretty good performance. 
  - SGD with momentum or nesterov’s momentum, on the other hand, can perform better than those two algorithms if learning rate is correctly tuned.

### Fit the MLP classifier: 

In [131]:
from sklearn.neural_network import MLPClassifier

#### Grid search:

In [168]:
from sklearn.grid_search import GridSearchCV

gs = GridSearchCV(  MLPClassifier(), 
        param_grid={
        'alpha': 10.0 ** -np.arange(1, 7),     
        'solver': ['adam', 'lbfgs'],
         }, 
        scoring = 'neg_log_loss')         #use scoring = neg_log_loss (so it knows that lower is better)

gs.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'solver': ['adam', 'lbfgs'], 'alpha': array([  1.00000e-01,   1.00000e-02,   1.00000e-03,   1.00000e-04,
         1.00000e-05,   1.00000e-06])},
       pre_dispatch='2*n_jobs', refit=True, scoring='neg_log_loss',
       verbose=0)

In [170]:
y_pred2 = gs.predict(X_test)

In [171]:
print("percentage of Lows: ", len(y_pred2[y_pred2 == 1])/len(y_pred2) * 100)
print("percentage of Mediums: ", len(y_pred2[y_pred2 == 2])/len(y_pred2) * 100)
print("percentage of Highs: ", len(y_pred2[y_pred2 == 3])/len(y_pred2)* 100)

percentage of Lows:  78.35143459231642
percentage of Mediums:  17.020586804992703
percentage of Highs:  4.627978602690874


In [172]:
y_pred_proba2 = gs.predict_proba(X_test)
y_pred_proba2

array([[  3.61349172e-01,   4.95775261e-01,   1.42875567e-01],
       [  9.41301132e-01,   5.73203208e-02,   1.37854721e-03],
       [  2.05067442e-01,   3.04701411e-01,   4.90231146e-01],
       ..., 
       [  9.98763774e-01,   1.23229413e-03,   3.93160251e-06],
       [  4.48459735e-01,   4.30502389e-01,   1.21037876e-01],
       [  5.21456940e-01,   3.32636886e-01,   1.45906174e-01]])

In [176]:
from sklearn.metrics import log_loss

print("log loss from grid search parameters:")
log_loss(y_test, y_pred_proba2)   

log loss from grid search parameters:


0.66355456222180464

In [None]:
#Other things to try in grid search: 
        #"random_state" = 1,
    #'hidden0__units': [4, 8, 12],
    #'activation': ["relu", "logistic", "Tanh"],
        #'learning_rate': [0.005, 0.001],
        #hidden_layer_sizes=(5, 2)

In [174]:
from sklearn.metrics import classification_report

print("Best parameters set found on development set:")
print()
print(gs.best_params_)
print()

Best parameters set found on development set:

{'solver': 'adam', 'alpha': 0.10000000000000001}



#### Without grid search:

In [151]:
clf = MLPClassifier(solver='adam', alpha=1e-2,
                   hidden_layer_sizes=(5, 2), random_state=1)
clf

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [152]:
clf.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

#### Using cross validation:

In [164]:
from sklearn.model_selection import cross_val_score

#Stratigied K-fold perserves the number of examples from each class 
cv_scores = cross_val_score(clf, X_train_all_scale, y_train_all, scoring='neg_log_loss', cv = 5) 
cv_scores

array([-0.62389776, -0.61783595, -0.6190104 , -0.63598702, -0.6368969 ])

In [166]:
print("mean log loss:", -1*np.mean(cv_scores))

mean log loss: 0.626725605282


### Results from MLP classifier: 

In [153]:
y_pred = clf.predict(X_test)

In [154]:
print("percentage of Lows: ", len(y_pred[y_pred== 1])/len(y_pred) * 100)
print("percentage of Mediums: ", len(y_pred[y_pred== 2])/len(y_pred) * 100)
print("percentage of Highs: ", len(y_pred[y_pred== 3])/len(y_pred)* 100)

percentage of Lows:  79.32403955260172
percentage of Mediums:  20.675960447398282
percentage of Highs:  0.0


In [147]:
#clf.coefs_ contains the weight matrices that constitute the model parameters:
[coef.shape for coef in clf.coefs_]

[(822, 5), (5, 2), (2, 3)]

In [155]:
y_pred_proba = clf.predict_proba(X_test)
y_pred_proba

array([[ 0.32654734,  0.39491991,  0.27853275],
       [ 0.91528705,  0.08025899,  0.00445396],
       [ 0.38119487,  0.4153941 ,  0.20341103],
       ..., 
       [ 0.97597012,  0.02242089,  0.00160899],
       [ 0.37515921,  0.4136875 ,  0.2111533 ],
       [ 0.74149368,  0.19757999,  0.06092633]])

In [156]:
#log loss of acutal vs. predicting on the testing data (not the holdout data yet)
from sklearn.metrics import log_loss
log_loss(y_test, y_pred_proba)   

0.64433181807869266

In [175]:
#percent accuracy 
clf.score(X_test,y_test)

0.69962716809855729

### Results: 

logloss = 0.62709157338049504    
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                   hidden_layer_sizes=(5, 2), random_state=1)
       
logloss = 0.61944758672494526    
clf = MLPClassifier(solver='lbfgs', alpha=1e-2,
                   hidden_layer_sizes=(5, 2), random_state=1)

### Make a kaggle submission: 

In [187]:
X_holdout.shape

(74659, 822)

In [189]:
y_pred_proba_holdout = gs.predict_proba(X_holdout)
y_pred_holdout = gs.predict(X_holdout)

print("percentage predicted low label:")
print(len(y_pred_holdout[y_pred_holdout== 1])/X_holdout.shape[0]) 

y_pred_proba_holdout

percentage predicted low label:
1.0


array([[ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.]])

In [191]:
submission = pd.DataFrame({
        "listing_id": holdout["listing_id"],
        "high": y_pred_proba_holdout[:,2],
        "medium":y_pred_proba_holdout[:,1],
        "low": y_pred_proba_holdout[:,0]
    })

In [192]:
columnsTitles=["listing_id","high","medium","low"]
submission=submission.reindex(columns=columnsTitles)
submission.head()   

Unnamed: 0,listing_id,high,medium,low
0,7142618,0.0,0.0,1.0
1,7210040,0.0,0.0,1.0
2,7174566,0.0,0.0,1.0
3,7191391,0.0,0.0,1.0
4,7171695,0.0,0.0,1.0


In [None]:
submission.to_csv('submission.csv', index=False)

# Other types of Neural Nets:

# Naive Bayes

In [67]:
#subset to only the integer predictors: 

ind = list(range(3,17))
ind.extend(list(range(20,81)))
ind.extend(list(range(110, 441)))
ind.extend(list(range(441, 560)))
ind.extend(list(range(560, 739)))
ind.extend(list(range(739, 767)))
ind.extend(list(range(767,825)))
X_train_all_ints = mat_train[:,ind]

In [71]:
# test/train split:   

from sklearn.model_selection import train_test_split 
X_train_int, X_test_int, y_train_int, y_test_int = train_test_split(X_train_all_ints, y_train_all)    # could also try stratified K_fold function

In [84]:
from sklearn.naive_bayes import MultinomialNB
gnb = MultinomialNB()
gnb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [85]:
y_pred = gnb.fit(X_train_int, y_train_int).predict(X_test_int)
y_pred_proba = gnb.fit(X_train_int, y_train_int).predict_proba(X_test_int)

In [86]:
y_pred_proba

array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       ..., 
       [ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  1.,  0.]])

In [87]:
log_loss(y_test_int, y_pred_proba)   

22.328615662322395

In [79]:
print("Number of mislabeled points out of a total %d points : %d"
       % (X_test_int.shape[0],(y_test_int != y_pred).sum()))

Number of mislabeled points out of a total 12338 points : 10159
