In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [3]:
pml = pd.read_csv('pml_train.csv')
print('data loaded')
pml.head()

data loaded


Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,0,A,A,A,A,B,A,A,A,A,...,0.669748,0.51594,0.55421,0.53774,0.47225,0.4922,0.481306,0.756454,0.344502,2152.8
1,1,B,A,A,A,A,B,A,A,A,...,0.484775,0.698278,0.35533,0.40657,0.40666,0.468839,0.458493,0.30435,0.470455,1019.89
2,2,A,A,A,B,A,A,A,A,A,...,0.350956,0.363768,0.58354,0.44352,0.39599,0.341813,0.352251,0.339244,0.283969,4477.83
3,3,B,A,A,A,A,B,A,A,A,...,0.748243,0.538694,0.54829,0.5042,0.51111,0.711942,0.698722,0.709578,0.776114,907.11
4,4,A,B,A,A,A,A,A,A,B,...,0.321058,0.32643,0.3128,0.39648,0.38016,0.24541,0.241676,0.26115,0.342082,974.62


In [4]:
pml_train_y = pml.loss
pml_train_X = pml.drop(columns=['id', 'loss'])
pml_train_X.shape

(131822, 130)

In [5]:
pml_train_y.describe()

count    131822.000000
mean       3039.973828
std        2913.957535
min           5.250000
25%        1204.890000
50%        2116.585000
75%        3865.105000
max      121012.250000
Name: loss, dtype: float64

In [6]:
pml_train_d_X = pd.get_dummies(pml_train_X)
pml_train_d_X.shape

(131822, 1111)

In [7]:
pml_test = pd.read_csv('pml_test_features.csv')
pml_test_X = pml_test.drop(columns=['id'])
pml_test_X.shape

(56496, 130)

In [8]:
# # one-hot encoded test
# pml_test_d_X = pd.get_dummies(pml_test)
# pml_test_d_X.shape

In [9]:
# cDrop = [c for c in pml_test_d_X.columns if c not in pml_train_d_X.columns]
# print(cDrop)
# pml_test_d_X.drop(columns = cDrop, inplace=True)

# for c in pml_train_d_X.columns:
#     if c not in pml_test_d_X.columns:
#         pml_test_d_X[c] = 0
# print(pml_test_d_X.shape)
# pml_test_d_X.head()

In [10]:
# pml_train_cont = pml_train_X.filter(regex=("cont\d*"))
# pml_train_cont.head()

In [11]:
#Generate a correlation matrix between features
# x_corr = pml_train_cont.corr()
# mask = np.zeros_like(x_corr, dtype=np.bool)
# mask[np.triu_indices_from(mask)] = True
# f, ax = plt.subplots(figsize=(11, 9))
# cmap = sns.diverging_palette(220, 10, as_cmap=True)
# sns.heatmap(x_corr, mask=mask, vmax=1, cmap=cmap, center=0,
#             square=True, linewidths=.5)

In [12]:
def col_op(col):
    return col.astype('category').cat.codes

In [13]:
def digit_op(code):
    num = 0
    for alpha in code:
        num *= 26
        num += ord(alpha) - ord('A') + 1
    return num

def to_digit(col):
    return col.apply(digit_op)

In [14]:
to_digit(pd.Series(['AA','BB','A']))

0    27
1    54
2     1
dtype: int64

In [15]:
def encode_X_to_digit(orig_train_X):
    c_X = orig_train_X.copy()
    tmp = orig_train_X.select_dtypes(exclude=['float64','int64'])
    # tmp = pd.Categorical(tmp)

    c_X.loc[:, tmp.columns] = tmp.apply(lambda col: col.astype('category').cat.codes)
    return c_X

In [16]:
pml_train_c_X = encode_X_to_digit(pml_train_X)
pml_test_c_X = encode_X_to_digit(pml_test_X)
print('data encoded')

data encoded


In [17]:
pml_train_c_X.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,0,0,0,0,1,0,0,0,0,0,...,0.577339,0.669748,0.51594,0.55421,0.53774,0.47225,0.4922,0.481306,0.756454,0.344502
1,1,0,0,0,0,1,0,0,0,0,...,0.281143,0.484775,0.698278,0.35533,0.40657,0.40666,0.468839,0.458493,0.30435,0.470455
2,0,0,0,1,0,0,0,0,0,0,...,0.499798,0.350956,0.363768,0.58354,0.44352,0.39599,0.341813,0.352251,0.339244,0.283969
3,1,0,0,0,0,1,0,0,0,0,...,0.281143,0.748243,0.538694,0.54829,0.5042,0.51111,0.711942,0.698722,0.709578,0.776114
4,0,1,0,0,0,0,0,0,1,0,...,0.491114,0.321058,0.32643,0.3128,0.39648,0.38016,0.24541,0.241676,0.26115,0.342082


In [18]:
# #Generate a correlation matrix between features
# x_corr = pml_train_c_X.corr()
# mask = np.zeros_like(x_corr, dtype=np.bool)
# mask[np.triu_indices_from(mask)] = True
# f, ax = plt.subplots(figsize=(22, 18))
# cmap = sns.diverging_palette(220, 10, as_cmap=True)
# sns.heatmap(x_corr, mask=mask, vmax=1, cmap=cmap, center=0,
#             square=True, linewidths=.5)

In [19]:
# # standardization
# scaler = preprocessing.StandardScaler()
# pml_train_c_X = scaler.transform(pml_train_c_X)
# pml_test_c_X = scaler.transform(pml_test_c_X)

# pml_test_c_X.shape

In [20]:
# normalize
normalizer = MinMaxScaler().fit(pml_train_c_X)
pml_train_c_X = normalizer.transform(pml_train_c_X)
pml_test_c_X = normalizer.transform(pml_test_c_X)

print(pml_test_c_X.shape)

(56496, 130)


In [21]:
kBest = SelectKBest( k=50)
pml_train_sfs_X = kBest.fit_transform(pml_train_c_X, pml_train_y)
pml_test_sfs_X = kBest.transform(pml_test_c_X)

In [22]:
pml_test_sfs_X.shape

(56496, 50)

In [23]:
# test/train split
split_train_X = pml_train_c_X
split_test_X = pml_test_c_X
val_train_X, val_test_X, val_train_y, val_test_y = train_test_split(split_train_X, pml_train_y, random_state=2018, test_size=0.05)
print('data prep finished')

data prep finished


In [24]:
def mse_score(clf, X, y):
    prd_y = clf.predict(X)
    return np.sqrt(np.sum((prd_y-y)**2)/len(y))

In [25]:
# pipeline:
def try_clf(clf, clf_name='', train_X = val_train_X, train_y = val_train_y, test_X = val_test_X, test_y = val_test_y):
    print('start training ' + clf_name )
    clf.fit(train_X, train_y)
    print(clf_name + ' train :' + str(clf.score(train_X, train_y)))
    print(clf_name + ' test  :' + str(clf.score(test_X, test_y)))
    print(mse_score(clf, test_X, test_y))
    
    return clf

In [26]:
def use_clf(clf, clf_name='clf', pml_X = split_test_X):
    ans = clf.predict(pml_X)
    filename = clf_name + '.csv'
    pd.DataFrame({'id':pml_test['id'], 
          'loss':ans}).to_csv(filename,index = False)
    print('exported as ' + filename)

### Tuning

In [None]:
def tune_para_my(clf, train_X, train_y, test_X, test_y)
    clf = 
    clf = try_clf(clf, 'svr rbf',svr_train_X,  svr_train_y, svr_test_X, svr_test_y)

In [27]:
def tune_para_cv(estimator, X, y, parameter_name ,parameters, k_fold = 10):
    
    from sklearn.model_selection import KFold
    from sklearn.model_selection import GridSearchCV
    
    tuned_parameters = [{parameter_name: parameters}]
    n_folds = k_fold

    clf = GridSearchCV(estimator, tuned_parameters, cv=n_folds, refit=False)
    clf.fit(X, y)
    scores = clf.cv_results_['mean_test_score']
    scores_std = clf.cv_results_['std_test_score']
    plt.figure().set_size_inches(8, 6)

    plt.plot(parameters, scores)

    # plot error lines showing +/- std. errors of the scores
    std_error = scores_std / np.sqrt(n_folds)

    plt.plot(parameters, scores + std_error, 'b--')
    plt.plot(parameters, scores - std_error, 'b--')

    # alpha=0.2 controls the translucency of the fill color
    #plt.fill_between(cc, scores + std_error, scores - std_error, alpha=0.2)

    plt.ylabel('CV score +/- std error')
    plt.xlabel(parameter_name)
    plt.axhline(np.max(scores), linestyle='--', color='.5')
    plt.xlim([parameters[0], parameters[-1]])

    plt.show()
    
def print_cv(scores):
    print(scores)
    print('Mean:\t %f' % np.mean(scores))
    print('Var :\t %f' % np.var(scores))

## Linear

In [28]:
lin = LinearRegression()
lin = try_clf(lin)

start training 
 train :0.48288038512837705
 test  :0.45985514773794783
2122.736654367221


In [29]:
use_clf(lin, 'lin_default')

exported as lin_default.csv


## SVR

In [30]:
var_keep = np.array([False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False,  True, False, False,  True,
       False, False,  True,  True, False, False, False, False, False,
       False,  True,  True,  True, False, False, False, False, False,
       False, False,  True, False, False,  True, False, False,  True,
       False, False, False, False, False, False, False, False,  True,
       False,  True, False, False, False,  True, False, False, False,
        True, False, False, False,  True,  True,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [32]:
svr_X = pml_train_c_X[:,var_keep==False]
svr_y = pml_train_y
pml_svr_test_X = pml_test_c_X[:,var_keep]

# svr_train_X, svr_test_X, svr_train_y, svr_test_y = train_test_split(svr_X, svr_y, random_state=2018, test_size=0.05)

In [36]:
idx.shape

(10000,)

In [47]:
idx = np.random.choice(svr_X.shape[0], 10000, replace=False)
svr_X = svr_X[idx, :]
svr_y = svr_y.reshape(-1,1)[idx, :].squeeze()

In [48]:
svr_train_X, svr_test_X, svr_train_y, svr_test_y = train_test_split(svr_X, svr_y, random_state=2018, test_size=0.1)

In [49]:
svr_X.shape

(10000, 111)

In [50]:
svr = SVR(kernel='poly', gamma = 0.01, C=0.1, degree=2, tol = 0.1, cache_size=300)
svr = try_clf(svr, 'svr rbf',svr_train_X,  svr_train_y, svr_test_X, svr_test_y)

start training svr rbf
svr rbf train :-0.10531409562258331
svr rbf test  :-0.09077491843494645
3036.266110219221


In [None]:
use_clf(svr,'svr_default',pml_svr_test_X)

In [None]:
pml_train_y.shape

In [None]:
dgr = np.arange(2,5)
svr = SVR( gamma = 0.01, C=0.1, tol = 0.1, cache_size=300)
tune_para_cv(svr, split_train_X, pml_train_y, 'degree', dgr)

## tree

In [44]:
#. ec2-spotter/fast_ai/create_vpc.sh

In [55]:
rf = RandomForestRegressor(n_estimators=20, max_leaf_nodes = 20, 
                            random_state = 2018)
rf = try_clf(rf, 'rf',,,,,)

start training rf
rf train :0.37759226016599257
rf test  :0.35921548261220154
187717.86349091865


In [58]:
use_clf(rf,'csv/rf_20tree_20maxnode')

exported as rf_20tree_20maxnodes.csv


In [61]:
rf2 = RandomForestRegressor(n_estimators=10, max_leaf_nodes = 100, 
                            random_state = 2018)
rf2 = try_clf(rf2, 'rf2')

start training rf
rf train :0.5312358252664292
rf test  :0.4477747364011039
174264.02201728785


In [83]:
use_clf(rf2,'csv/rf_10tree_100maxnode')

exported as rf_10tree_100maxnodes.csv


In [85]:
rf3 = RandomForestRegressor(n_estimators=30, max_leaf_nodes = 120, 
                            min_samples_split=10, random_state = 2018)
rf3 = try_clf(rf3, 'rf3')

start training rf3
rf3 train :0.542120449658501
rf3 test  :0.45601904833664086
172958.3143448042


In [87]:
use_clf(rf3,'csv/rf_30tree_120maxnode')

exported as csv/rf_30tree_120maxnodes.csv


In [None]:
temp_prd_y = rf3.predict(val_train_y)
temp_y = val_test_y
plt.scatter(temp_prd_y,temp_y)

In [90]:
rf3.feature_importances_ == 0

array([False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False,  True, False, False,  True,
       False, False,  True,  True, False, False, False, False, False,
       False,  True,  True,  True, False, False, False, False, False,
       False, False,  True, False, False,  True, False, False,  True,
       False, False, False, False, False, False, False, False,  True,
       False,  True, False, False, False,  True, False, False, False,
        True, False, False, False,  True,  True,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [88]:
rf4 = RandomForestRegressor(n_estimators=50, max_leaf_nodes = 100, 
                             min_impurity_decrease=0.1, random_state = 2018)
rf4 = try_clf(rf4, 'rf4')

start training rf4
rf4 train :0.5357728740110539
rf4 test  :0.44944416723412933
174000.4140706748


In [None]:
use_clf(rf4,'csv/rf_50tree_100maxnode')

In [None]:
c = 'AB'

In [None]:
for a in reversed(c):
    print(a)

In [30]:
temp = pd.DataFrame({'a':['A','B','C','B'],
             'b':['DC','BD','BF','CC']})
temp

Unnamed: 0,a,b
0,A,DC
1,B,BD
2,C,BF
3,B,CC


In [31]:
def op(col):
    return col.astype('category').cat.codes

In [39]:
# uniques = np.sort(pd.unique(temp.values.ravel()))
# temp.apply(lambda x: x.astype('category', categories=uniques))

le = LabelEncoder()
le.fit(temp.values.flat)

# Convert to digits.
temp = temp.apply(le.transform)
temp

NameError: name 'LabelEncoder' is not defined