## Column Descriptions


> - ACTION:	ACTION is 1 if the resource was approved, 0 if the resource was not
- RESOURCE:	An ID for each resource
- MGR_ID :	The EMPLOYEE ID of the manager of the current EMPLOYEE ID record; an employee may have only one manager at a time
- ROLE_ROLLUP_1	:Company role grouping category id 1 (e.g. US Engineering)
- ROLE_ROLLUP_2	:Company role grouping category id 2 (e.g. US Retail)
- ROLE_DEPTNAME	:Company role department description (e.g. Retail)
- ROLE_TITLE:	Company role business title description (e.g. Senior Engineering Retail Manager)
- ROLE_FAMILY_DESC:	Company role family extended description (e.g. Retail Manager, Software Engineering)
- ROLE_FAMILY:	Company role family description (e.g. Retail Manager)
- ROLE_CODE : 	Company role code; this code is unique to each role (e.g. Manager)


##  ref :

- http://www.chioka.in/kaggle-competition-solutions/
- https://github.com/codelibra/Amazon-Employee-Access-Challenge/blob/master/Amazon-Employee-Access-Challenge.ipynb

### In V3, we do 2 approaches :

    - 1. Normalization, regularization  <---- seems would make predict accuracy WORSE
    - 2. Resampling : Oversampling, Undersampling

In [88]:
# Load basics library 

import pandas as pd, numpy as np
%matplotlib inline
%pylab inline
import seaborn  as sns 
import pylab as pl
import matplotlib.pyplot as plt
import pickle

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [2]:
# load CSVs

df_train = pd.read_csv('~/Kaggle.com_mini_project/Amazon_access/train.csv')
df_test = pd.read_csv('~/Kaggle.com_mini_project/Amazon_access/test.csv')
sampleSubmission = pd.read_csv('~/Kaggle.com_mini_project/Amazon_access/sampleSubmission.csv')

In [114]:
# help function 

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_absolute_error
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix

def sample_split(data):
    #data =  data[selected_feature]
    relevent_cols = list(data)
    data_=data.values.astype(float)             
    Y = data_[:,0]
    X = data_[:,1:]
    test_size = .3
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state = 3)
    return X_train, X_test, y_train, y_test


def reg_analysis(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    #Calculate Variance score
    Variance_score = explained_variance_score(y_test, prediction)
    print ('Variance score : %.2f' %Variance_score)
    #Mean Absolute Error
    MAE = mean_absolute_error(y_test, prediction)
    print ('Mean Absolute Error : %.2f' %MAE)
    #Root Mean Squared Error
    RMSE = mean_squared_error(y_test, prediction)**0.5
    print ('Mean Squared Error : %.2f' %RMSE)
    #R² score, the coefficient of determination
    r2s = r2_score(y_test, prediction)
    print ('R2  score : %.2f' %r2s)
    return model



def normalize_(df):
    result = df.copy()
    feature_names = list(df.columns[1:])
    for feature_name in feature_names:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result


def oversampling(df):
    y = df['ACTION']
    X = df[df.columns.difference(['ACTION'])].as_matrix()
    # Apply random over-sampling
    ros = RandomOverSampler()
    X_oversampled, y_oversampled = ros.fit_sample(X, y)
    return X_oversampled, y_oversampled



def undersampling(df):
    y = df['ACTION']
    X = df[df.columns.difference(['ACTION'])].as_matrix()
    # Apply random under-sampling
    ros = RandomUnderSampler()
    X_undersampled, y_undersampled = ros.fit_sample(X, y)
    return X_undersampled, y_undersampled


def test_data_predict(df):
    X_train_, X_test_, y_train_, y_test_ = sample_split(df)
    df_predict=pd.DataFrame()
    # submit prediction from TEST data 
    df_predict['Action'] = clf_svr.predict(df_test.iloc[:,1:])
    df_predict.index.name = 'ID'
    # make index feat submission form 
    # https://www.kaggle.com/c/amazon-employee-access-challenge/submit
    df_predict.index = df_predict.index + 1
    print (df_predict.head())
    return df_predict

def save_model(model):
    try:
        with open('/Users/yennanliu/Kaggle.com_mini_project/Amazon_access/final_tuned_model.pkl', 'wb') as fid:
            pickle.dump(model, fid)
            print ('model save success')
    except:
        print ('saving fail')
    
def load_model():
    with open('/Users/yennanliu/Kaggle.com_mini_project/Amazon_access/final_tuned_model.pkl', 'rb') as fid:
        loaded_model = pickle.load(fid)
        return loaded_model
    
    
    

## 1) Normalization 

In [22]:
#df_train.head(3)

In [33]:
df_train_norm = normalize_(df_train)

In [65]:
#X_train, X_test, y_train, y_test = sample_split(df_train_norm)

X_train, X_test, y_train, y_test = sample_split(df_train)

In [81]:
#X_train

## 1') Selected features

In [38]:
selescted_feature = ['ACTION','ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_FAMILY']

In [39]:
#df_train[selescted_feature]

df_train_feautre = df_train[selescted_feature] 
df_train_norm = normalize_(df_train_feautre)

In [41]:
#df_train_norm

## 2) Resampling 

In [68]:
# oversample 

X_oversampled, y_oversampled = oversampling(df_train)
#X_oversampled, y_oversampled = oversampling(df_train_norm)

pd.DataFrame(y_oversampled)[0].value_counts()

1    30872
0    30872
Name: 0, dtype: int64

In [70]:
# train / test split for oversample data 

X_train_overs, X_test_overs, y_train_overs, y_test_overs = \
 train_test_split(X_oversampled, y_oversampled)

In [132]:
len(X_oversampled) + len(y_oversampled )

123488

In [131]:
len(X_train_overs) + len(X_test_overs) + len(y_train_overs) + len(y_test_overs)

123488

In [48]:
##############

In [71]:
# undersample 

X_undersampled, y_undersampled = undersampling(df_train)
#X_undersampled, y_undersampled = undersampling(df_train_norm)

pd.DataFrame(y_undersampled)[0].value_counts()

1    1897
0    1897
Name: 0, dtype: int64

In [72]:
X_train_unders, X_test_unders, y_train_unders, y_test_unders = \
train_test_split(X_undersampled, y_undersampled)

In [24]:
############## ML ##############

In [128]:

from sklearn import svm

# for oversample dataset 

clf_svr = svm.SVC()
clf_svr.fit(X_train_overs,y_train_overs)
y_test_predict = clf_svr.predict(X_test_overs)

print ('SVM classifier score = ', clf_svr.score(X_test_overs,y_test_overs))
print (confusion_matrix(y_test_predict,y_test_overs ))



SVM classifier score =  0.999805649132
[[7780    3]
 [   0 7653]]


In [86]:
scores = cross_val_score(clf_svr, X_oversampled, y_oversampled, cv=5)

In [87]:
scores

array([ 0.99967611,  0.99967611,  0.99951409,  0.99967606,  0.99959508])

In [91]:
clf_svr

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [115]:
### save model (pickle)

save_model(clf_svr)

model save success


In [116]:
### load model 

loaded_model = load_model()

In [117]:
loaded_model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [118]:

# for unsersample dataset 

clf_svr = svm.SVC()
clf_svr.fit(X_train_unders,y_train_unders)
y_test_predict = clf_svr.predict(X_test_unders)

print ('SVM classifier score = ', clf_svr.score(X_test_unders,y_test_unders))
print (confusion_matrix(y_test_predict,y_test_unders ))





SVM classifier score =  0.481559536354
[[439 492]
 [  0  18]]


In [77]:
# SVM grid search 

from sklearn.model_selection import GridSearchCV

#parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
#svr = svm.SVC()
#clf = GridSearchCV(svr, parameters)
#clf.fit(X_train_unders,y_train_unders)


In [78]:
# Random forest 

from sklearn.ensemble import RandomForestClassifier

clf_forest = RandomForestClassifier(random_state=0)
clf_forest.fit( X_train_overs, y_train_overs)
y_test_predict = clf_forest.predict(X_test_overs)
print ('clf_forest score : = ', clf_forest.score(X_test_overs,y_test_overs))

print (confusion_matrix(y_test_predict,y_test_overs ))



clf_forest score : =  0.98127753304
[[7780  289]
 [   0 7367]]


In [80]:
# AdaBoostClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(RandomForestClassifier(n_estimators=1000, bootstrap=True, oob_score=True, n_jobs=-1, class_weight='balanced_subsample',max_depth=10),
                         algorithm="SAMME",
                         n_estimators=1)

bdt.fit(X_train_overs, y_train_overs)
y_test_predict = bdt.predict(X_test_overs)
print ('bdt score = ',bdt.score(X_test_overs,y_test_overs))
print (confusion_matrix(y_test_predict,y_test_overs ))




bdt score =  0.873736719357
[[6970 1139]
 [ 810 6517]]
