In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn.linear_model import Lasso, ElasticNet, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.decomposition import PCA

# Seed NumPy's global random generator so code that draws from it is repeatable across runs.
np.random.seed(42)
%matplotlib inline

In [None]:
def extract_model_weight(model):
    """Collect a fitted linear model's parameters into one flat vector.

    Parameters
    ----------
    model : object
        Any object exposing ``coef_`` and, optionally, ``intercept_``.

    Returns
    -------
    numpy.ndarray
        When ``model`` has an ``intercept_`` attribute: a 1-D float array
        holding the intercept value(s) first, followed by the flattened
        coefficients. Otherwise ``model.coef_`` is returned unchanged.
    """
    if not hasattr(model, "intercept_"):
        return model.coef_

    # intercept_ can be a plain scalar or an array (for example shape (1,)).
    # Normalising it to 1-D and concatenating avoids storing an array into a
    # single scalar slot, which NumPy deprecates for 1-element arrays and
    # rejects outright for longer ones.
    intercept = np.atleast_1d(model.intercept_).ravel()
    coef = np.asarray(model.coef_).ravel()
    return np.concatenate([intercept, coef]).astype(float)

def print_model_weight(ws, column=4):
    """Print weights as ``w[index]: value`` entries, ``column`` entries per line.

    Parameters
    ----------
    ws : array-like
        Weights to print; flattened before printing.
    column : int, optional
        Number of entries per printed line (default 4). Must be >= 1.

    Raises
    ------
    ValueError
        If ``column`` is smaller than 1.
    """
    if column < 1:
        raise ValueError('column must be >= 1, got {}'.format(column))

    w_format = 'w[{:4d}]: {: .4e} '
    entries = [w_format.format(i, w) for i, w in enumerate(np.asarray(ws).ravel())]

    # Emit every chunk, including the final partial one, so no trailing
    # weights are dropped and no leading blank line is printed.
    for start in range(0, len(entries), column):
        print(''.join(entries[start:start + column]))

### Read and check data

In [None]:
# Read the training table first, then the test table, from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Print the column-by-column summary of the training table.
train.info()

In [None]:
# Print the column-by-column summary of the test table.
test.info()

In [None]:
# The two lists are index-aligned: name_cols[i] is reported next to code_cols[i].
name_cols = [
    'worker_group_name',
    'department_name',
    'union_name',
    'job_group',
    'job',
]
code_cols = [
    'worker_group_code',
    'department_code',
    'union_code',
    'job_group_code',
    'job_code',
]

# Report how many distinct values each paired column holds in the test table.
report = 'Unique value {} in {}'.format
for name_col, code_col in zip(name_cols, code_cols):
    n_names = len(test[name_col].unique())
    n_codes = len(test[code_col].unique())
    print(report(n_names, name_col))
    print(report(n_codes, code_col))

In [None]:
# Count the training rows whose salary is strictly positive.
n_positive_salary = (train['salary'] > 0).sum()
print('Salary > 0 have {} samples'.format(n_positive_salary))

### One-hot encoding for all categorical columns

In [None]:
# Stack train on top of test and encode them together, so every row gets the
# same set of indicator columns. dummy_na=True also adds a NaN indicator per column.
dummy_cols = name_cols + code_cols
enc = pd.concat([train, test])
enc_dummy = pd.get_dummies(data=enc, columns=dummy_cols, dummy_na=True)

In [None]:
# Undo the concatenation by position: the first len(train) rows are the
# training rows, everything after them is the test set.
n_train_rows = len(train)
train_full_enc = enc_dummy.iloc[:n_train_rows]
test_full_enc = enc_dummy.iloc[n_train_rows:]

# Keep only training rows with a strictly positive salary.
# (An alternative the author left disabled: mark non-positive salaries as
# missing and median-impute them instead of dropping the rows.)
has_positive_salary = train_full_enc['salary'] > 0
train_full_enc = train_full_enc.loc[has_positive_salary]

# Regression target as a column vector of shape (n_samples, 1).
y_train = train_full_enc['salary'].values.reshape(-1, 1)

# 'id' and 'salary' are removed from both frames before they are used as features.
non_feature_cols = ['id', 'salary']
train_full_enc = train_full_enc.drop(non_feature_cols, axis=1)
test_full_enc = test_full_enc.drop(non_feature_cols, axis=1)

In [None]:
# Feature matrix for the rows to be predicted.
X_test = test_full_enc.values

# Hold out 20% of the labelled rows for validation, with a fixed seed so the
# split is repeatable. The feature matrix is materialised once, directly in
# the call; a separate `X_train = ...` assignment would be overwritten here.
#
# NOTE: this rebinds y_train to the training portion only. Re-running this
# cell without first re-running the cell that builds y_train passes arrays of
# different lengths to train_test_split.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_full_enc.values, y_train, test_size=0.2, random_state=42)

In [None]:
# pca = PCA(n_components=0.995)
# X_train_pc = pca.fit_transform(X_train)
# X_valid_pc = pca.transform(X_valid)
# X_test_pc = pca.transform(X_test)

In [None]:
# n_iter = 15000
# l1_ratio = 0.001

# lasso = Lasso(alpha=l1_ratio, max_iter=n_iter, random_state=42, selection='cyclic', tol=1e-12)
# lasso.fit(X_train, y_train)
# train_mse = mean_squared_error(y_train, lasso.predict(X_train))
# valid_mse = mean_squared_error(y_valid, lasso.predict(X_valid))
# train_r2 = lasso.score(X_train, y_train)
# valid_r2 = lasso.score(X_valid, y_valid)
# print('Lasso train mse: {:4e}, valid mse: {:4e}'.format(train_mse, valid_mse))

# y_test = lasso.predict(X_test)
# result = pd.DataFrame()
# result['id'] = test['id']
# result['salary'] = y_test
# fn = 'lasso_l1_{:}_demo'.format(str(float(l1_ratio)).replace('.', 'p'))

# result.to_csv('{}.csv'.format(fn), index=False)
# joblib.dump(lasso, '{}.pkl'.format(fn))

In [None]:
# print_model_weight(extract_model_weight(lasso))

## ElasticNet (hybrid L1 and L2 regularization)

In [None]:
# n_iter = 15000
# l1_ratio = 0.001
# l2_ratio = 0.001

# elsnet = ElasticNet(alpha=l2_ratio, l1_ratio=l1_ratio, max_iter=n_iter, random_state=42, selection='cyclic', tol=1e-12)
# elsnet.fit(X_train, y_train)
# train_mse = mean_squared_error(y_train, elsnet.predict(X_train))
# valid_mse = mean_squared_error(y_valid, elsnet.predict(X_valid))
# train_r2 = elsnet.score(X_train, y_train)
# valid_r2 = elsnet.score(X_valid, y_valid)
# print('ElasticNet train mse: {:4e}, valid mse: {:4e}'.format(train_mse, valid_mse))

# y_test = elsnet.predict(X_test)
# result = pd.DataFrame()
# result['id'] = test['id']
# result['salary'] = y_test
# fn = 'elsnet_l1_{}_l2_{:}_demo'.format(str(float(l1_ratio)).replace('.', 'p'),
#                                   str(float(l2_ratio)).replace('.', 'p'))

# result.to_csv('{}.csv'.format(fn), index=False)
# joblib.dump(elsnet, '{}.pkl'.format(fn))

In [None]:
# Linear regressor trained by SGD with a Huber loss and an elastic-net penalty.
n_iter = 100
l1_ratio = 0.0001
l2_ratio = 0.0001

# NOTE(review): per the scikit-learn docs, SGDRegressor's `alpha` scales the
# whole penalty and `l1_ratio` is the L1 share of the elastic-net mix, so the
# values below give an almost pure L2 penalty. Confirm that is intended.
# random_state is pinned so that re-running this cell reproduces the same
# model instead of depending on the state of NumPy's global generator.
sgd = SGDRegressor(loss='huber', penalty='elasticnet',
                   alpha=l2_ratio, l1_ratio=l1_ratio, max_iter=n_iter,
                   learning_rate='optimal', random_state=42)

# The targets are stored as (n, 1) column vectors; the estimator works on 1-D targets.
y_train_flat = y_train.ravel()
y_valid_flat = y_valid.ravel()
sgd.fit(X_train, y_train_flat)

train_mse = mean_squared_error(y_train_flat, sgd.predict(X_train))
valid_mse = mean_squared_error(y_valid_flat, sgd.predict(X_valid))
train_r2 = sgd.score(X_train, y_train_flat)
valid_r2 = sgd.score(X_valid, y_valid_flat)
# '{:.4e}' gives four decimals; the previous '{:4e}' only set a minimum width.
print('SGD train mse: {:.4e}, valid mse: {:.4e}'.format(train_mse, valid_mse))
print('SGD train r2: {:.4f}, valid r2: {:.4f}'.format(train_r2, valid_r2))

# Predict the test rows and write the submission file plus the fitted model.
y_test = sgd.predict(X_test)
result = pd.DataFrame()
result['id'] = test['id']
result['salary'] = y_test
fn = 'sgd_l1_{}_l2_{:}_demo'.format(str(float(l1_ratio)).replace('.', 'p'),
                                    str(float(l2_ratio)).replace('.', 'p'))

result.to_csv('{}.csv'.format(fn), index=False)
joblib.dump(sgd, '{}.pkl'.format(fn))

In [None]:
import keras as K
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [None]:
# Rebuild the complete training matrices from enc_dummy: the earlier
# train/validation split rebound X_train and y_train to the 80% portion.
n_train_rows = len(train)
train_full_enc = enc_dummy.iloc[:n_train_rows]
test_full_enc = enc_dummy.iloc[n_train_rows:]

# Keep only training rows with a strictly positive salary.
# (An alternative the author left disabled: mark non-positive salaries as
# missing and median-impute them instead of dropping the rows.)
has_positive_salary = train_full_enc['salary'] > 0
train_full_enc = train_full_enc.loc[has_positive_salary]

# Regression target as a column vector of shape (n_samples, 1).
y_train = train_full_enc['salary'].values.reshape(-1, 1)

# 'id' and 'salary' are removed from both frames before they are used as features.
non_feature_cols = ['id', 'salary']
train_full_enc = train_full_enc.drop(non_feature_cols, axis=1)
test_full_enc = test_full_enc.drop(non_feature_cols, axis=1)

# All remaining labelled rows; no separate validation arrays are built here.
X_train = train_full_enc.values

In [None]:
# Epoch budget and penalty strengths for the Keras linear model.
n_iter = 100
l1_ratio = 0.0001
l2_ratio = 0.0001

# A single Dense unit with linear activation and a combined L1 + L2 penalty
# on its kernel. (An L2-only regularizer was left disabled by the author.)
n_features = X_train.shape[1]
weight_penalty = regularizers.l1_l2(l1=l1_ratio, l2=l2_ratio)

model = Sequential()
model.add(
    Dense(
        1,
        input_dim=n_features,
        kernel_regularizer=weight_penalty,
        activation='linear',
    )
)

# Adam with its default settings, minimising mean squared error.
adam = optimizers.Adam()
model.compile(optimizer=adam, loss='mse')

In [None]:
# Stop training once the monitored quantity has not improved by at least 1e-4
# for 3 consecutive epochs (Keras monitors `val_loss` unless told otherwise).
callbacks = [EarlyStopping(patience=3, min_delta=1e-4)]

In [None]:
# Train on the labelled rows; Keras sets aside the last 20% of them for the
# validation loss used by the callbacks.
model.fit(X_train, y_train, epochs=n_iter, batch_size=16,
          verbose=1, callbacks=callbacks, validation_split=0.2)

# The model has a single output unit, so predict() returns an (n_samples, 1)
# array. It is flattened explicitly for the 1-D 'salary' column rather than
# relying on pandas to accept a 2-D value.
y_test = model.predict(X_test)
result = pd.DataFrame()
result['id'] = test['id']
result['salary'] = y_test.ravel()
fn = 'keras_l1_{:}_l2_{:}_demo'.format(
            str(float(l1_ratio)).replace('.', 'p'),
            str(float(l2_ratio)).replace('.', 'p'))
result.to_csv('{}.csv'.format(fn), index=False)

# Persist the trained model. The previous bare `model.get_weights()` call was
# dropped: its result was discarded because it was not the cell's last expression.
model.save('{}.hdf5'.format(fn))

In [None]:
# ws = np.zeros(X_train.shape[1] + 1)
# ws[0] = model.get_weights()[1]
# ws[1:] = model.get_weights()[0].ravel()
# print_model_weight(ws)