# Imports

In [1]:
import time
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
#
import xgboost as xgb
#
# pd.set_option('display.max_columns', 500)  # force pandas to display all columns for better visual inspection
# plot plots inline
# %matplotlib inline  

OSError: /home/jingchu/anaconda2/lib/python2.7/site-packages/xgboost/./wrapper/libxgboostwrapper.so: invalid ELF header

# Dataset IO

## Method 1: read from raw training set and sub sample

In [None]:
# Load the complete training set from disk.
trn = pd.read_csv('../data/train.csv')
# Draw two random sub-samples (10% and 1% of the rows) ...
trn_10 = trn.sample(frac=0.1)
trn_1 = trn.sample(frac=0.01)
# ... and cache each one as a CSV file so it can be reloaded later without
# reading the full training set again.
trn_10.to_csv('../data/train_10.csv')
trn_1.to_csv('../data/train_1.csv')

## Method 2: read from temp file

In [None]:
# Reload the cached 10% sub-sample; the first CSV column is the row index
# that to_csv wrote out, so restore it as the DataFrame index.
trn_10 = pd.read_csv(
    '../data/train_10.csv',
    index_col=0,
)
# trn_1 = pd.read_csv('../data/train_1.csv', index_col=0)

# ML

## Cleaning

In [None]:
t = time.time()  # start of timed section
# Keep only the observations whose 'Ref' value is present.
# (Sub-sample variants, kept for quick experiments:)
# trn_1_withRef = trn_1[trn_1['Ref'].notnull()]
# trn_10_withRef = trn_10[trn_10['Ref'].notnull()]
has_ref = trn['Ref'].notnull()
trn_withRef = trn[has_ref]
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # end of timed section
# Sanity check: frame shapes and number of distinct Ids before/after filtering.
# print trn_1.shape, trn_1_withRef.shape, trn_1_withRef['Id'].nunique(), trn_1['Id'].nunique()
# print trn_10.shape, trn_10_withRef.shape, trn_10_withRef['Id'].nunique(), trn_10['Id'].nunique()
print(' '.join(map(str, (
    trn.shape,
    trn_withRef.shape,
    trn_withRef['Id'].nunique(),
    trn['Id'].nunique(),
))))

In [None]:
# 92nd percentile of the target column over the full training set.
np.percentile(trn['Expected'], q=[92])

In [None]:
# Keep only rows whose target is below 69.
# NOTE(review): the cut-off presumably comes from the percentile computed in
# the previous cell -- confirm. The filtered frame is not consumed by the
# later cells visible here, which keep using trn_withRef.
below_cutoff = trn_withRef['Expected'] < 69
trn_withRef_normal = trn_withRef.loc[below_cutoff]
trn_withRef_normal.shape

## Preprocessing 

### Combine observation

In [None]:
# Collapse all observations sharing an 'Id' into one row holding the
# column-wise mean, and report how long the aggregation took.
t = time.time()
# trn_1_withRef_comb = trn_1_withRef.groupby('Id').agg(np.mean)
# trn_10_withRef_comb = trn_10_withRef.groupby('Id').agg(np.mean)
grouped_by_id = trn_withRef.groupby('Id')
trn_withRef_comb = grouped_by_id.mean()
print(time.time() - t)

### Extract and split 

In [None]:
# Pull plain NumPy arrays out of the aggregated frame.
# Features: every column from 'minutes_past' through 'Kdp_5x5_90th'
# (label-based slices include both end points).
feature_frame = trn_withRef_comb.loc[:, 'minutes_past':'Kdp_5x5_90th']
X = feature_frame.values
# Target: base-10 logarithm of 'Expected'.
expected = trn_withRef_comb['Expected'].values
y = np.log10(expected)

In [None]:
# Hold out 30% of the samples as a validation set; the rest is for training.
X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=0.3)

### Missing values

In [None]:
# Deal with missing values by filling them with per-column means.
imp = Imputer(strategy='mean')  # imputer configured for column-mean filling
# Learn the column means from the training split only ...
X_trn = imp.fit_transform(X_trn)
# ... and reuse those same means for the validation split. Refitting here
# (fit_transform) would fill validation gaps with the validation set's own
# statistics and overwrite the fitted state that later cells rely on.
X_val = imp.transform(X_val)

In [None]:
# Show the shapes of the four split arrays on one line.
print(' '.join(map(str, (X_trn.shape, X_val.shape, y_trn.shape, y_val.shape))))

### Standardization

In [None]:
# Standardize features to zero mean / unit variance using statistics learned
# from the training split only.
ss = StandardScaler(copy=False, with_mean=True, with_std=True)
# copy=False only *tries* to scale in place, so bind the returned arrays
# instead of relying on the inputs having been modified. The target is not
# used by the scaler and is therefore no longer passed.
X_trn = ss.fit_transform(X_trn)
X_val = ss.transform(X_val)

In [None]:
# Per-feature mean and standard deviation of each split, followed by the
# mean of its target, as a check on the standardization step.
print('{} {}'.format(X_trn.mean(axis=0), X_trn.std(axis=0)))
print(y_trn.mean(axis=0))
# Report the validation split's own std (the training std was printed here
# before by mistake).
print('{} {}'.format(X_val.mean(axis=0), X_val.std(axis=0)))
print(y_val.mean(axis=0))

In [None]:
# Histogram of the training features, drawn semi-transparent.
plt.hist(x=X_trn, alpha=0.3)
plt.show()

In [None]:
# Histogram of the (log10-transformed) training target.
plt.hist(x=y_trn)
plt.show()

## Training and validation

In [None]:
# Fit a random forest on the training split, predict the validation split,
# and print the wall-clock time for both steps together.
t = time.time()
# NOTE: n_jobs=-1 uses every available core; set it to a preferred number.
clf = RandomForestRegressor(n_jobs=-1).fit(X_trn, y_trn)
y_val_predict = clf.predict(X_val)
print(time.time() - t)

## Performance evaluation 

In [None]:
# Predicted vs. actual validation targets (both in log10 space).
plt.scatter(x=y_val_predict, y=y_val, alpha=0.5)
plt.xlabel('Predict')
plt.ylabel('Real')

In [None]:
# Results with log10(y) + Standardization
# MAE is computed on the original scale by undoing the log10 transform.
print('MAE = {}'.format(abs(10**y_val_predict - 10**y_val).mean()))
# Distribution of per-sample absolute errors measured in log10 space. These
# are individual absolute errors, not a mean squared error, so the axis is
# labelled accordingly.
plt.hist(np.log10(1 + abs(y_val_predict - y_val)))
plt.xlabel('log10(1 + absolute error)')

In [None]:
# Results with log10(1+y) + Standardization
# The MAE expression is valid for either target transform: with log10(1+y)
# the "-1" offsets of prediction and truth cancel in the difference.
print('MAE = {}'.format(abs(10**y_val_predict - 10**y_val).mean()))
# Distribution of per-sample absolute errors measured in log space. These
# are individual absolute errors, not a mean squared error, so the axis is
# labelled accordingly.
plt.hist(np.log10(1 + abs(y_val_predict - y_val)))
plt.xlabel('log10(1 + absolute error)')

## Generate submission

In [None]:
# Build the test feature matrix with the same steps used for training:
# drop rows without 'Ref', average the observations of each Id, then take
# the same feature column range.
test = pd.read_csv('../data/test.csv')
test_withRef = test[test['Ref'].notnull()]
test_withRef_comb = test_withRef.groupby('Id').agg(np.mean)
test_X = test_withRef_comb.loc[:, 'minutes_past':'Kdp_5x5_90th'].values
# Apply the imputer and scaler exactly as fitted on the training split;
# refitting on the test data would feed the model features prepared with
# different statistics than it was trained on.
test_X = imp.transform(test_X)
# Bind the scaler's return value rather than relying on in-place scaling.
test_X = ss.transform(test_X)
# The model was trained on log10(Expected), so map its predictions back to
# the original scale before they are written out as 'Expected'.
test_y_predict = 10 ** clf.predict(test_X)

In [None]:
# Pair each Id that had valid 'Ref' data with its predicted value.
test_result_withRef = pd.DataFrame(
    {
        'Id': test_withRef_comb.index.values,
        'Expected': test_y_predict,
    },
    columns=['Id', 'Expected'],
)

In [None]:
# Start from every distinct Id in the test set and left-join the predictions
# so that Ids without any valid 'Ref' row are still present in the output.
test_result = pd.DataFrame({'Id': test['Id'].unique()})
test_result = test_result.merge(test_result_withRef, how='left', on=['Id'], sort=True)
# Ids that received no prediction get the placeholder value -1.
test_result['Expected'] = test_result['Expected'].fillna(-1)
# Write the submission file without the DataFrame index column.
test_result.to_csv('../data/result.csv', index=False)

# Test code below

In [None]:
# print trn[trn['Id']==8]
# Number of distinct Ids having at least one row with a missing 'Ref',
# followed by the total number of distinct Ids.
ref_missing = trn['Ref'].isnull()
print(trn.loc[ref_missing, 'Id'].nunique())
print(trn['Id'].nunique())


In [None]:
# First 20 rows of Id 4 whose 'Ref' value is missing.
ref_missing = trn['Ref'].isnull()
is_id_4 = trn['Id'] == 4
trn[ref_missing & is_id_4].head(20)

In [None]:
# Histogram of log10(Expected) restricted to rows with a missing 'Ref'.
expected_no_ref = trn.loc[trn['Ref'].isnull(), 'Expected']
np.log10(expected_no_ref).hist()
# plt.show()

In [None]:
# Histogram of log10(Expected) over the whole training set.
expected_all = trn['Expected']
np.log10(expected_all).hist()

In [None]:
# Histogram of log10(Expected) restricted to rows that do have a 'Ref' value.
expected_with_ref = trn.loc[trn['Ref'].notnull(), 'Expected']
np.log10(expected_with_ref).hist()

In [None]:
# Rebind both target arrays to their transposes.
y_trn, y_val = y_trn.T, y_val.T

In [None]:
# Shapes of the training and validation target arrays, one per line.
for target in (y_trn, y_val):
    print(target.shape)