<h1>Reference from: </h1>
<ul>
<li><a href="https://github.com/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class04_training.ipynb">Part 4.3: Cross-Validation for Neural Networks from <i>jeffheaton</i></a>
    </li>
</ul>

In [1]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low
        
# This function submits an assignment.  You can submit an assignment as much as you like, only the final
# submission counts.  The paramaters are as follows:
# data - Pandas dataframe output.
# key - Your student key that was emailed to you.
# no - The assignment class number, should be 1 through 1.
# source_file - The full path to your Python or IPYNB file.  This must have "_class1" as part of its name.  
# .             The number must match your assignment number.  For example "_class2" for class assignment #2.
def submit(data,key,no,source_file=None):
    if source_file is None and '__file__' not in globals(): raise Exception('Must specify a filename when a Jupyter notebook.')
    if source_file is None: source_file = __file__
    suffix = '_class{}'.format(no)
    if suffix not in source_file: raise Exception('{} must be part of the filename.'.format(suffix))
    with open(source_file, "rb") as image_file:
        encoded_python = base64.b64encode(image_file.read()).decode('ascii')
    ext = os.path.splitext(source_file)[-1].lower()
    if ext not in ['.ipynb','.py']: raise Exception("Source file is {} must be .py or .ipynb".format(ext))
    r = requests.post("https://api.heatonresearch.com/assignment-submit",
        headers={'x-api-key':key}, json={'csv':base64.b64encode(data.to_csv(index=False).encode('ascii')).decode("ascii"),
        'assignment': no, 'ext':ext, 'py':encoded_python})
    if r.status_code == 200:
        print("Success: {}".format(r.text))
    else: print("Failure: {}".format(r.text))

In [2]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.layers.core import Dense,Dropout,Activation,Flatten
from sklearn.metrics import confusion_matrix
'''
path = "./data/"

#filename_read = os.path.join(path,"iris.csv")
#filename_write = os.path.join(path,"iris-out-of-sample.csv")
df = pd.read_csv("iris.csv")

# Shuffle
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Encode to a 2D matrix for training
species = encode_text_index(df,"species")
x,y = to_xy(df,"species")

'''




def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train','validation'], loc='upper left')
    plt.show()

cols=['genre__Action',
      'genre__Adventure', 
      'genre__Animation', 
      'genre__Biography', 
      'genre__Comedy', 
      'genre__Crime', 
      'genre__Documentary', 
      'genre__Drama', 
      'genre__Family', 
      'genre__Horror', 
      'genre__Music', 
      'genre__Mystery',
      'genre__Sci-Fi',
      'star', 
      'runtime',
      'user',
      'critc',
      'dir_popu',
      'ac1_popu',
      'ac2_popu',
      'ac3_popu',
      #'n_of_posters', 
      #'opening_box_office',
      'budget_mixed',
      #'year_encode'
     ] 

target = "target_paper_label"
###################################################    

all_df = pd.read_excel('6cate/O_H_all_6cate_3436_movie_0830.xlsx')
y_df = pd.read_excel('6cate/O_H_all_6cate_3436_movie_0830.xlsx')
all_df = all_df[cols]
#print(train_df)
y_all=y_df[target]

y_all = np_utils.to_categorical(y_all, num_classes=6)
print(y_all.shape)

all_df = all_df.values


minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))  
all_df = minmax_scale.fit_transform(all_df)


# Cross-validate
kf = KFold(5)
    
oos_y = []
oos_pred = []
fold = 0

for train, test in kf.split(all_df):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = all_df[train]
    y_train = y_all[train]
    x_test = all_df[test]
    y_test = y_all[test]
    
    model = Sequential()  
            
    model.add(Dense(units=72, input_dim=len(cols), kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units=108, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units=128, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units=192, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units=256, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units=192, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units=128, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units=108, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units=72, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(units=6,activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=25, verbose=1, mode='auto')

    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    
    pred = model.predict(x_test)
    
    oos_y.append(y_test)
    pred = np.argmax(pred,axis=1) # raw probabilities to chosen class (highest probability)
    oos_pred.append(pred)        

    # Measure this fold's accuracy
    y_compare = np.argmax(y_test,axis=1) # For accuracy calculation
    score = metrics.accuracy_score(y_compare, pred)
    print("Fold score (bingo-accuracy): {}".format(score))

    CM = confusion_matrix(y_compare, pred)    
    bingo = 0
    one_away = 0
    for i in range(0, 6):
        #print(CM[i][i]) ## bingo
        bingo = bingo + CM[i][i]
       # print('BINGO: ', bingo)
    one_away = CM[0][1] + CM[1][2] + CM[2][3] +  CM[3][4] +  CM[4][5] +  CM[1][0] +  CM[2][1] +  CM[3][2] +  CM[4][3] +  CM[5][4] 
    #print('1-AWAY: ',one_away)
    #print('total: ',bingo+one_away)
    score_1away = (bingo+one_away)/len(y_compare)
    print("Fold score (1 away-accuracy): {}".format(score_1away))

print()
# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
oos_y_compare = np.argmax(oos_y,axis=1) # For accuracy calculation

score = metrics.accuracy_score(oos_y_compare, oos_pred)
print("Final score (accuracy): {}".format(score))    
    
# Write the cross-validated prediction
#oos_y = pd.DataFrame(oos_y)
#oos_pred = pd.DataFrame(oos_pred)
#oosDF = pd.concat( [all_df, oos_y, oos_pred],axis=1 )
#oosDF.to_csv('dadada.csv',index=False)

Using TensorFlow backend.


(3436, 6)
Fold #1
Epoch 00094: early stopping
Fold score (bingo-accuracy): 0.39680232558139533
Fold score (1 away-accuracy): 0.8183139534883721
Fold #2
Epoch 00107: early stopping
Fold score (bingo-accuracy): 0.5109170305676856
Fold score (1 away-accuracy): 0.9068413391557496
Fold #3
Epoch 00142: early stopping
Fold score (bingo-accuracy): 0.4512372634643377
Fold score (1 away-accuracy): 0.87627365356623
Fold #4
Epoch 00086: early stopping
Fold score (bingo-accuracy): 0.4963609898107715
Fold score (1 away-accuracy): 0.8748180494905385
Fold #5
Epoch 00047: early stopping
Fold score (bingo-accuracy): 0.42212518195050946
Fold score (1 away-accuracy): 0.8049490538573508

Final score (accuracy): 0.45547147846332947


In [3]:
oos_y

array([[ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  1.]])

In [4]:
oos_pred

array([2, 1, 0, ..., 5, 5, 5])

In [5]:
oos_y_compare

array([0, 0, 0, ..., 5, 5, 5])

In [6]:
#y_compare
from sklearn.metrics import confusion_matrix
CM = confusion_matrix(y_compare, pred)
CM

array([[ 20,  48,   7,  10,  10,  10],
       [ 13,  68,  17,   5,   7,   4],
       [  7,  39,  30,  20,   6,   2],
       [  0,  22,  28,  25,  25,   4],
       [  0,   7,  17,  19,  36,  16],
       [  0,   2,   7,   7,  38, 111]])

In [7]:
bingo = 0
one_away = 0

for i in range(0, 6):
    #print(CM[i][i]) ## bingo
    bingo = bingo + CM[i][i]
print('BINGO: ', bingo)

one_away = CM[0][1] + CM[1][2] + CM[2][3] +  CM[3][4] +  CM[4][5] +  CM[1][0] +  CM[2][1] +  CM[3][2] +  CM[4][3] +  CM[5][4] 
print('1-AWAY: ',one_away)
print('total: ',bingo+one_away)

(bingo+one_away)/len(y_compare)

BINGO:  290
1-AWAY:  263
total:  553


0.80494905385735083