In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from keras.models import Sequential, load_model
from keras.layers import Activation,Dense,Dropout,BatchNormalization,PReLU
from keras.optimizers import SGD
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
import math
from sklearn.metrics import r2_score, mean_squared_error

## Read files

In [None]:
# Load the training feature matrix; the device_id column becomes the row index.
df_train_x = pd.read_csv(filepath_or_buffer='../model/train_x.csv', index_col='device_id')
print(df_train_x.shape)

# Load the test feature matrix the same way.
df_test_x = pd.read_csv(filepath_or_buffer='../model/test_x.csv', index_col='device_id')
print(df_test_x.shape)

## Preprocessing

### save index

In [None]:
# Keep the device_id index of both frames; the test ids are needed later to
# label the rows of each submission file.
df_train_id, df_test_id = df_train_x.index, df_test_x.index
print(df_train_id.shape)

print(df_test_id.shape)

### normalization

In [None]:
# Row-wise scaling: each sample (row) is rescaled to unit L2 norm. The keyword
# values below are scikit-learn's defaults, spelled out for the reader.
# The transform is stateless, so train and test are handled independently.
array_train_x = preprocessing.normalize(df_train_x, norm='l2', axis=1)
array_test_x = preprocessing.normalize(df_test_x, norm='l2', axis=1)

### prepare train_y

In [None]:
# Read the training labels: the file has no header row, so the single label
# column is named 0.
# NOTE(review): labels are read from '../matrix_for_model/' while the features
# come from '../model/' - confirm both directories hold the same data version.
df_train_y = pd.read_csv('../matrix_for_model/train_y.csv', header=None)
print(df_train_y.shape)

# Labels carry no device_id, so they can only be matched to the features by
# row position; fail early if the row counts disagree.
assert len(df_train_y) == len(df_train_x), "train_y and train_x have different numbers of rows"

# One-hot encode the labels (get_dummies orders the columns by sorted label).
df_train_y_onehot = pd.get_dummies(df_train_y)
# pandas >= 2.0 returns boolean dummies; cast to uint8 (the historic default)
# so every model below receives a numeric 0/1 matrix on any pandas version.
array_train_y = df_train_y_onehot.values.astype(np.uint8)
array_train_y.shape

### Feature selection according to extra-trees feature importances

In [None]:
# Fit an extra-trees ensemble on the one-hot labels to obtain a per-feature
# importance score (seeded so the selected columns are repeatable).
clf = ExtraTreesClassifier(random_state=1)
clf.fit(array_train_x, array_train_y)
print("clf.feature_importances_ :",clf.feature_importances_)

# Keep only the features whose importance reaches the mean importance.
# NOTE: this cell overwrites array_train_x / array_test_x, so it must be run
# exactly once after the normalization cell.
col_filter = SelectFromModel(clf, threshold=clf.feature_importances_.mean(), prefit=True)
array_train_x = col_filter.transform(array_train_x)
array_train_x.shape   # (74645, 1581)

array_test_x = col_filter.transform(array_test_x)
array_test_x.shape  # (112071, 1581)

## Model

### Define a function that builds the submission DataFrame

In [None]:
def submit_df(df_test_id, array_test_y):
    """Build the submission table from test ids and class probabilities.

    Parameters
    ----------
    df_test_id : array-like of length n_samples
        Device ids of the test rows, in the same order as ``array_test_y``.
    array_test_y : array-like of shape (n_samples, 12)
        One score per class; column k is written under the k-th label of
        ``class_cols`` below, so the model's class order must match it.

    Returns
    -------
    pandas.DataFrame
        A frame with a default integer index, a ``device_id`` column and
        one column per class label.

    Raises
    ------
    ValueError
        If ``array_test_y`` is not 2-D with 12 columns, or its row count
        differs from the number of ids.
    """
    # Single source of truth for the class columns (sorted label order).
    class_cols = ['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+',
                  'M22-', 'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+']

    probs = np.asarray(array_test_y)
    if probs.ndim != 2 or probs.shape[1] != len(class_cols):
        raise ValueError(
            "array_test_y must have shape (n_samples, %d), got %r"
            % (len(class_cols), probs.shape)
        )
    if len(df_test_id) != probs.shape[0]:
        raise ValueError(
            "got %d device ids for %d prediction rows"
            % (len(df_test_id), probs.shape[0])
        )

    # Build the numeric block in one go, then put the ids in front.
    df_result = pd.DataFrame(probs, columns=class_cols)
    df_result.insert(0, 'device_id', np.asarray(df_test_id))
    return df_result

### 1. Backpropagation Neural Network

#### Define model

In [None]:
def nn_modeling(train_x):
    """Build and compile the feed-forward classifier.

    Architecture: Dense(200) -> PReLU -> Dropout(0.4) -> Dense(50) -> PReLU
    -> Dropout(0.2) -> Dense(12, softmax). Only ``train_x.shape[1]`` is
    read, to size the input layer; no training happens here.

    Parameters
    ----------
    train_x : array with a ``shape`` attribute, (n_samples, n_features)

    Returns
    -------
    A compiled Keras ``Sequential`` model (categorical cross-entropy loss,
    Adam optimizer, accuracy metric).
    """
    # `kernel_initializer` is the Keras 2 name of the old Keras 1 `init`
    # argument; the rest of the notebook already uses the Keras 2 API
    # (e.g. `epochs=`), and newer Keras releases reject `init` outright.
    model = Sequential()
    model.add(Dense(200, input_dim=train_x.shape[1], kernel_initializer='normal'))
    model.add(PReLU())
    model.add(Dropout(0.4))
    model.add(Dense(50, kernel_initializer='normal'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    # 12 output units: one per class column of the submission file.
    model.add(Dense(12, kernel_initializer='normal', activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return model

#### Train model

In [None]:
# Instantiate the network; the input width is taken from the selected features.
nn_model = nn_modeling(train_x=array_train_x)

In [None]:
# Train for 7 epochs; Keras holds out the last 10% of the rows for validation.
# NOTE(review): validation_split takes the tail of the arrays before any
# shuffling - confirm the training rows are not ordered by class.
history = nn_model.fit(array_train_x, array_train_y, validation_split=0.1, batch_size=512, epochs=7)

# The accuracy metric is logged as 'acc' by older Keras releases and as
# 'accuracy' by newer ones; pick whichever key is present.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key],ms=5,marker='o',label='train accuracy')
plt.plot(history.history['val_' + acc_key],ms=5,marker='o',label='val accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

#### Predict test data

In [None]:
# Score the test set with the network and write the submission file.
array_test_y = nn_model.predict(array_test_x)
df_result = submit_df(df_test_id=df_test_id, array_test_y=array_test_y)
df_result.to_csv(path_or_buf='../submit/nn_result.csv', index=False)

### 2. Random Forest

#### Define model

In [None]:
def rf_modeling(train_x, train_y, param_grid=None, cv=3, random_state=None):
    """Fit a random-forest regressor through a cross-validated grid search.

    Parameters
    ----------
    train_x : array-like, (n_samples, n_features)
        Training features.
    train_y : array-like
        Training targets (the notebook passes the one-hot label matrix).
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. Defaults to the single
        configuration used by this notebook.
    cv : int, optional
        Number of cross-validation folds (default 3).
    random_state : int or None, optional
        Seed for the forest; ``None`` (default) leaves it unseeded.

    Returns
    -------
    The fitted ``GridSearchCV`` object; its ``predict`` uses the estimator
    refit on the full training data.
    """
    if param_grid is None:
        # One candidate only: the search then amounts to a 3-fold CV score
        # followed by a refit on all rows.
        param_grid = {
            'bootstrap': [True],
            'max_depth': [100],
            'max_features': [3],
            'min_samples_leaf': [4],
            'min_samples_split': [10],
            'n_estimators': [500]
        }
    rf = RandomForestRegressor(random_state=random_state)
    grid_rf = GridSearchCV(rf, param_grid, n_jobs=-1, cv=cv)
    model = grid_rf.fit(train_x, train_y)
    return model

#### Train model

In [None]:
# Optional, disabled: %env JOBLIB_TEMP_FOLDER=/tmp
# (presumably relocates joblib's temp folder for the parallel search - confirm before enabling)
forest_model = rf_modeling(train_x=array_train_x, train_y=array_train_y)

#### Predict test data

In [None]:
# Score the test set with the fitted forest and write the submission file.
array_test_y = forest_model.predict(array_test_x)
df_result = submit_df(df_test_id=df_test_id, array_test_y=array_test_y)
df_result.to_csv(path_or_buf='../submit/forest_result.csv', index=False)

### 3. XGBoost

#### Factorize train_y

In [None]:
# Encode the string labels as integer class ids for XGBoost.
# sort=True numbers the classes in sorted label order. Without it, ids follow
# order of first appearance in the file, and the k-th probability column
# returned by the booster would not correspond to the k-th class column that
# submit_df writes (those are listed in sorted order).
# NOTE: this rebinds array_train_y from the one-hot matrix to a 1-D id vector;
# re-run the "prepare train_y" cell before re-running the NN / forest cells.
factor = pd.factorize(df_train_y[0], sort=True)
array_train_y = factor[0]
definitions = factor[1]
print(array_train_y)
print(definitions)

#### Split the training data: 90% for training and 10% for validation

In [None]:
# Hold out 10% of the rows for validation. The split is seeded (same seed as
# the feature-selection step) so the validation set, and therefore the
# reported validation loss, is identical on every run.
train_x, val_x, train_y, val_y = train_test_split(
    array_train_x, array_train_y, test_size=0.1, random_state=1
)
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)

#### Define model

In [None]:
def xgb_modeling(train_x, train_y, val_x, val_y):
    """Train a 12-class XGBoost model and plot its learning curves.

    Parameters
    ----------
    train_x, train_y : training features and integer class ids
    val_x, val_y : validation features and integer class ids

    Returns
    -------
    The trained ``xgb.Booster``. With the ``multi:softprob`` objective its
    ``predict`` yields one probability per class for every row.

    Side effects: shows a matplotlib figure with the train/validation
    mlogloss per boosting round and prints the final validation score.
    """
    # Save data to xgb.DMatrix
    data_val = xgb.DMatrix(val_x, label=val_y)
    data_train = xgb.DMatrix(train_x, label=train_y)

    # Set parameters
    param = {}
    param['booster'] = 'gbtree'
    param['objective'] = 'multi:softprob'
    # The history lookup below reads 'mlogloss'. The default metric for this
    # objective is not the same in every XGBoost release, so request it
    # explicitly instead of relying on the default.
    param['eval_metric'] = 'mlogloss'
    param['tree_method'] = 'hist'
    # NOTE(review): newer XGBoost releases replace 'silent' with 'verbosity';
    # kept as-is here - confirm against the installed version.
    param['silent'] = 1
    param['max_depth'] = 6
    param['num_class'] = 12

    eval_list = [(data_train, 'train'), (data_val, 'validation')]
    num_round = 20
    eval_history = {}

    # Train model; per-round metrics for both sets are recorded in eval_history.
    xgb_model = xgb.train(param, data_train, num_round, eval_list,
                          evals_result=eval_history, verbose_eval=False)

    # Show process
    mlogloss_train = eval_history['train']['mlogloss']
    mlogloss_validation = eval_history['validation']['mlogloss']
    plt.plot(mlogloss_train, ms=10, marker='.', label='train_eval')
    plt.plot(mlogloss_validation, ms=10, marker='v', label='validation_eval')
    plt.legend()
    plt.show()

    # Evaluate result
    print("mlogloss:", xgb_model.eval(data_val))

    return xgb_model

#### Train model

In [None]:
# Train the booster on the 90% split and monitor it on the 10% hold-out.
xgb_model = xgb_modeling(train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y)

#### Predict test data

In [None]:
# Wrap the test features for XGBoost, score them, and write the submission file.
x = xgb.DMatrix(data=array_test_x)
array_test_y = xgb_model.predict(x)
df_result = submit_df(df_test_id=df_test_id, array_test_y=array_test_y)
df_result.to_csv(path_or_buf='../submit/xgb_result.csv', index=False)