In [None]:
from ProjectData import ProjectData
from sklearn.ensemble import GradientBoostingClassifier
import time

# In the cell below, we import the data, add custom features, hash the text and create numpy matrices from the data.

In [None]:
stime = time.time()

# ----------------------------------------------------------------
# Construct the data wrapper from the train / test JSON files
# ----------------------------------------------------------------
projd = ProjectData('../train.json', '../test.json')

load_seconds = time.time() - stime
print('Loading ProjectData took :' + str(load_seconds) + ' seconds.')

# ----------------------------------------------------------------
# Data options -- set on the wrapper before any processing call
# ----------------------------------------------------------------

# Work with the log of the price? Default = True
projd.log_price = True

# Features to include.
# By default, ProjectData includes all hand-crafted features; the
# instance attribute below can be set to choose them explicitly:
#   dense_matrix_columns = [ 'feature1', 'feature2', 'etc' ]

# Number of neighborhood clusters to use. Default = 50
projd.num_neighborhood_clusters = 100

# Run the tf-idf analysis? Default = False
projd.tfidf = False

# Number of slots for text hashing. Default = -1, i.e. no hashing
projd.feature_hash_n = 1000

# ----------------------------------------------------------------
# Process the data
# ----------------------------------------------------------------
stime = time.time()

projd.add_handcrafted_features()   # create hand-crafted features
projd.add_text_features()          # hash the description text
projd.build_matrices()             # build the (sparse) matrices

process_seconds = time.time() - stime
print('Data processing took: ' + str(process_seconds) + ' seconds.')

# If projd.feature_hash_n is small enough for this machine's memory,
# the data matrices can be switched to a dense representation to
# speed things up:
projd.make_training_matrix_dense()
projd.make_test_matrix_dense()

# ...and switched back to the sparse representation with:
projd.make_training_matrix_sparse()
projd.make_test_matrix_sparse()


# In the cell below, we describe the relevant instance variables

In [None]:
# Print the name of each main ProjectData instance variable together with
# the Python type it currently holds.
print('###################\n#Training Data\n###################')
print('Training data             : projd.train_data ('  + str(type(projd.train_data))   + ')')
print('Number of training entries: projd.train_n    ('  + str(type(projd.train_n))      + ')')
print('Training data matrix      : projd.train_matrix ('+ str(type(projd.train_matrix)) + ')')
print('Training data labels      : projd.train_labels ('+ str(type(projd.train_labels)) + ')')
print('')
print('###################\n#Test Data\n###################')
# Report the type of the test data itself; the label names projd.test_data,
# so the value inspected must be projd.test_data rather than projd.train_data.
print('Test data                 : projd.test_data ('  + str(type(projd.test_data))  + ')')
# NOTE(review): this line sits under the "Test Data" heading but still
# describes projd.train_n. It presumably should describe a test-set count
# (e.g. a projd.test_n attribute) -- confirm the attribute name in
# ProjectData before changing it.
print('Number of training entries: projd.train_n   ('  + str(type(projd.train_n))     + ')')
print('Test data matrix          : projd.test_matrix ('+ str(type(projd.test_matrix)) + ')')

# Here, we train a Gradient Boosting Classifier (GBC)

In [None]:
GBCclf = GradientBoostingClassifier()

stime  = time.time()

# Switch the training matrix to its dense representation for fitting.
projd.make_training_matrix_dense()
try:
    GBCclf.fit(projd.train_matrix, projd.train_labels)
finally:
    # Always restore the sparse representation, even if fitting fails or is
    # interrupted, so later cells find the matrix in the state they expect
    # and the dense copy does not linger in kernel memory.
    projd.make_training_matrix_sparse()

print("Training took: " + str(time.time() - stime) + ' seconds')

# Here, we use our trained GBC to make predictions on our test set

### Note the use of the ProjectData instance method "sparse_prediction_proba". GBCclf.predict_proba requires dense numpy arrays as input, while our matrix is sparse. To get around this, ProjectData.sparse_prediction_proba converts batches of the test data to dense matrices. The batchsize is the number of rows of the test data that get converted to dense arrays per iteration. If, for example, your laptop has less memory available then you should decrease the "batchsize" argument. "verbose = True" just outputs progress.

In [None]:
# Rows of the test matrix converted to a dense array per prediction batch;
# lower this on machines with less memory.
PREDICTION_BATCH_SIZE = 10000

# Name of the CSV file the predictions are written to.
SUBMISSION_FILE = 'gbc_neighborhood.csv'

# Column order of the predicted probabilities, i.e. which interest level
# each probability column refers to.
classes = GBCclf.classes_

# Predict class probabilities for the test set, batch by batch.
preds = projd.sparse_prediction_proba(
    GBCclf.predict_proba,
    batchsize=PREDICTION_BATCH_SIZE,
    verbose=True,
)

# Write the predictions to SUBMISSION_FILE.
projd.kaggle_output_from_probs(preds, classes, SUBMISSION_FILE)

## Below, we show how to use ProjectData to make the test set a holdout set from the training data, train a model, and compute the multi-class loss on the holdout set. 

## The test set now becomes a subset of the training set, and there is a new instance variable called self.test_labels. 

## We also need to shuffle the dataset, because by default it just partitions the training set by drawing a line through it, i.e. without randomizing which rows end up in the holdout set.


In [None]:
stime = time.time()

# ----------------------------------------------------------------
# Construct the data wrapper in hold-out mode
# ----------------------------------------------------------------
projd = ProjectData(
    '../train.json',
    '../test.json',
    holdout_set=True,        # HERE IS WHERE THE MAGIC HAPPENS
    holdout_set_ratio=0.2,   # let the test set be 0.2 * "size of training set"
)

load_seconds = time.time() - stime
print('Loading ProjectData took : ' + str(load_seconds) + ' seconds.')

# ----------------------------------------------------------------
# Data options
# ----------------------------------------------------------------
projd.num_neighborhood_clusters = 90
projd.feature_hash_n = 1000

# ----------------------------------------------------------------
# Process the data
# ----------------------------------------------------------------
stime = time.time()

# create hand-crafted features
projd.add_handcrafted_features()

# hashing of the description text is switched off for this run
#projd.add_text_features()

# build the (sparse) matrices
projd.build_matrices()

process_seconds = time.time() - stime
print('Data processing took     : ' + str(process_seconds) + ' seconds.')

# NOTE(review): the matrices are not converted to dense in this cell, although
# later cells call len() / np.concatenate on slices of projd.train_matrix and
# pass projd.test_matrix straight to predict_proba -- confirm which
# representation build_matrices() leaves them in for this configuration.

# shuffle the dataset (fixed seed)
projd.shuffle_matrices(seed = 1)


## Train a model on the training set (after oversampling the medium- and high-interest classes with SMOTE)

In [None]:
from imblearn.over_sampling import SMOTE
import numpy as np

In [None]:
# Build an oversampled training set: label-0 rows are kept as they are, while
# label-1 and label-2 rows are each oversampled with SMOTE in a one-vs-rest
# setting. The three parts are then stacked and shuffled.
#
# NOTE(review): np.concatenate below needs array-like (dense) inputs, so this
# cell presumably expects projd.train_matrix to be dense -- confirm.

# One seed for both SMOTE samplers and the final shuffle, so that re-running
# this cell reproduces the same oversampled training set.
OVERSAMPLING_SEED = 1

# Rows with label 0 are used unchanged.
low_X = projd.train_matrix[projd.train_labels == 0]
low_num = low_X.shape[0]

# Label 1 (medium interest): relabel as 1 vs. 0 (everything else) for the
# purpose of oversampling, then keep only the rows labelled 1 afterwards.
newlabels_med = (projd.train_labels == 1).astype(int)
sm_med = SMOTE(ratio=0.45, random_state=OVERSAMPLING_SEED)
med_X, med_Y = sm_med.fit_sample(projd.train_matrix, newlabels_med)
med_X = med_X[med_Y == 1]
med_num = med_X.shape[0]

# Label 2 (high interest): same procedure with its own target ratio.
newlabels_high = (projd.train_labels == 2).astype(int)
sm_high = SMOTE(ratio=0.16, random_state=OVERSAMPLING_SEED)
high_X, high_Y = sm_high.fit_sample(projd.train_matrix, newlabels_high)
high_X = high_X[high_Y == 1]
high_num = high_X.shape[0]

# Stack the three groups in the order low, medium, high.
new_data = np.concatenate((low_X, med_X, high_X), axis=0)

# Matching integer labels: low_num zeros, med_num ones, high_num twos.
# np.repeat keeps an integer dtype even if one of the groups is empty.
new_labs = np.repeat([0, 1, 2], [low_num, med_num, high_num])

# Shuffle rows and labels with the same seeded permutation.
rng = np.random.RandomState(OVERSAMPLING_SEED)
permutation = rng.permutation(new_labs.shape[0])
shuff_data = new_data[permutation]
shuff_labels = new_labs[permutation]

In [None]:
# Fit a fresh gradient boosting classifier on the shuffled, oversampled
# training data and report how long the fit took.
GBCclf = GradientBoostingClassifier()

stime = time.time()
GBCclf.fit(shuff_data, shuff_labels)
fit_seconds = time.time() - stime

print("Training took: " + str(fit_seconds) + ' seconds')

## Compute the log-loss on the holdout set

In [None]:
# Predicted class probabilities for the holdout (test) matrix.
y = GBCclf.predict_proba(projd.test_matrix)

# Score the probabilities with ProjectData's log-loss helper, passing the
# classifier's class order so columns can be matched to labels.
holdout_log_loss = projd.get_log_loss(y, GBCclf.classes_)
print(holdout_log_loss)