In [None]:
# Mount Google Drive in the Colab VM and make the Drive root the working
# directory, so the relative ./data and ./save paths used in later cells
# resolve inside Drive.
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/My\ Drive/
# Echo the working directory as a sanity check.
%pwd

In [None]:
import pickle
import pandas as pd
import numpy as np
import string
import re
from itertools import *
import itertools
import time
import os
%matplotlib inline
import matplotlib.pyplot as plt
from scipy import interpolate
from sklearn import preprocessing
%tensorflow_version 1.x
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout

##import .py files
import evaluate
import importlib
importlib.reload(evaluate)

# Wall-clock start of the whole run; the last cell reports elapsed minutes from it.
startTotal = time.time()

def getModel(x_size, x_train, y_train):
  """Build, compile and fit a small feed-forward binary classifier.

  Architecture: Dense(64, relu) -> Dense(64, relu) -> Dense(1, sigmoid),
  optimised with Adam on binary cross-entropy. Training runs for at most
  100 epochs in batches of 32, holds out 10% of the data through
  `validation_split`, and uses an EarlyStopping callback (default settings)
  that monitors `val_loss`.

  Args:
    x_size: input dimensionality handed to the first layer (`input_dim`).
    x_train: training features.
    y_train: training targets (expected to be 0/1 given the sigmoid output
      and binary cross-entropy loss).

  Returns:
    The fitted Sequential model.
  """
  layers = [
      Dense(64, input_dim=x_size, activation='relu'),
      # Dropout(0.5),  # disabled
      Dense(64, activation='relu'),
      # Dropout(0.5),  # disabled
      Dense(1, activation='sigmoid'),
  ]
  model = Sequential(layers)
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

  early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss')
  model.fit(
      x_train,
      y_train,
      batch_size=32,
      epochs=100,
      validation_split=0.1,
      callbacks=[early_stop],
      verbose=0,  # shuffling is left at the fit() default
  )
  return model

## set up
dataset = 'sod'
frac = 100     # percentage of the shuffled dataset to keep (100 = everything)
version = 0    # run id; also used as the shuffle seed below
direction = 'Backward' #Forward Backward Two Bi
# Column holding the raw text differs between the pku/msr corpora and the rest.
if dataset == 'pku' or dataset == 'msr':
  nameData = 'nameOri'
else:
  nameData = 'nameNon'
labelSeg = 'labelSeg'

# LM sizes per direction. Only Forward and Backward are configured; fail
# loudly for anything else instead of hitting a NameError further down.
if direction == 'Forward':
  rnnSize = 128
  embSize = 128
elif direction == 'Backward':
  rnnSize = 16
  embSize = 32
else:
  raise ValueError('no rnnSize/embSize configured for direction {!r}'.format(direction))
cellSize = str(rnnSize)+'_'+str(embSize)

savePath = [direction, cellSize, 'V'+str(version)]

# Run metadata. info['dataset'] keeps the base dataset name (before any
# frac suffix is appended below); info['frac'] records the fraction.
info = {}
info['dataset'] = dataset
info['frac'] = frac
info['version'] = version
info['direction'] = direction
info['rnnSize'] = rnnSize
info['embSize'] = embSize

## data
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./data/'+dataset+'/'+dataset+'.p', "rb") as f:
  d = pickle.load(f)
# Deterministic shuffle, seeded by the run version.
d = d.sample(frac=1, random_state=version)

if frac != 100:
  # Subsampled runs live under their own dataset name, e.g. 'sod50'.
  dataset = dataset + str(frac)
  d = d[:int(len(d)*(frac/100))]

# Train and test are the same frame.
train = d
test = d
info['sampleLen'] = len(d)
print('num of samples = {}'.format(len(d)))

# Persist the info only now, so that (a) it already contains sampleLen and
# (b) it is written under the final dataset name: the LM cell passes
# './data/<dataset>/info/...' to train.py and reloads from that same path,
# which uses the renamed dataset when frac != 100.
infoDir = './data/'+dataset+'/info/'
os.makedirs(infoDir, exist_ok=True)
with open(infoDir+''.join(savePath)+'.p', "wb") as f:
  pickle.dump(info, f)

# LM

### load data

In [None]:
def toFile(inputFile, filename, direction='Forward', padding=False, maxLength=30):
  """Join samples into one marker-delimited text stream and write it to ./data/<filename>.

  Every sample is framed by a start marker '^' and an end marker '$'.
  With padding=True each sample is first padded with '$' to exactly
  maxLength characters: on the right for the forward layout, on the left
  for direction='Backward'. For direction='Backward' the complete stream
  is reversed before it is written; any other direction value produces
  the forward layout.

  The file is written as UTF-8 explicitly, so the result does not depend
  on the platform's default encoding when samples contain non-ASCII text.

  Args:
    inputFile: samples to serialise. Must be a pandas Series of str when
      padding=True (Series.apply is used); any iterable of str works
      otherwise.
    filename: output path relative to './data/'.
    direction: 'Backward' for the reversed layout, anything else for forward.
    padding: whether to pad every sample to maxLength with '$'.
    maxLength: target sample length when padding.

  Returns:
    Length of the written stream in characters (not bytes).

  Raises:
    AssertionError: if padding=True and some sample is longer than maxLength.
  """
  backward = direction == 'Backward'

  if padding:
    if backward:
      samples = inputFile.apply(lambda x: '$'*(maxLength-len(x))+x).values.tolist()
    else:
      samples = inputFile.apply(lambda x: x+'$'*(maxLength-len(x))).values.tolist()
    # A negative pad count yields '', so only over-long samples can miss the target.
    badLengths = [len(s) for s in samples if len(s) != maxLength]
    assert not badLengths, (
        'samples longer than maxLength={}: lengths {}'.format(maxLength, badLengths[:5]))
  else:
    samples = inputFile

  if backward:
    # Built as '$s1^$s2^...' and then reversed, so the file starts with '^'.
    text = ('$' + '^$'.join(samples) + '^')[::-1]
  else:
    text = '^' + '$^'.join(samples) + '$'

  with open('./data/'+filename, "w", encoding='utf-8') as f: #w,r,a
    f.write(text)

  return len(text)

# Length of the longest sample; padded training lines are brought up to it.
maxLength = max(len(sample) for sample in d[nameData])

# (frame, file suffix, pad?, info key): the training stream is padded,
# the test stream is written unpadded.
corpora = (
    (train, '/input.txt', True, 'inputFileLen'),
    (test, '/test.txt', False, 'outputFileLen'),
)
for frame, suffix, pad, infoKey in corpora:
  fileLen = toFile(frame[nameData], dataset+suffix, direction=direction, padding=pad, maxLength=maxLength)
  info[infoKey] = fileLen
  print('length of {} = {}'.format(dataset+suffix, fileLen))

# The padded training stream can never be shorter than the unpadded test stream.
assert info['outputFileLen'] <= info['inputFileLen']

# Two extra positions per sample, matching the '^' / '$' markers toFile puts
# around each one; this value is passed on as --seq_length in the LM cell.
maxLength = maxLength + 2

### LM-Train

In [None]:
if direction == 'Forward':
  rnnSizeList = [rnnSize]
  embSizeList = [embSize]
elif direction == 'Backward':
  rnnSizeList = [rnnSize]
  embSizeList = [embSize]
save_dir = './save/'+'/'.join([dataset,direction,cellSize])
temp = ''.join(savePath)
start = time.time()
for rnnSize in rnnSizeList:
  for embSize in embSizeList:
    cellSize = str(rnnSize) + '_' + str(embSize)
    #learn
    !python train.py --num_epochs=100 --data_dir=./data/$dataset --info_dir='./data/'$dataset'/info/'$temp'.p' --save_dir=$save_dir --rnn_size=$rnnSize --emb_size=$embSize --seq_length=$maxLength 
    #sample
    !python sample.py --task='corpus_all' --save_path=$temp --save_dir=$save_dir --dataset=$dataset --seq_length=$maxLength
end = time.time()
info = pickle.load(open('./data/'+dataset+'/info/'+''.join(savePath)+'.p' , 'rb' ))
info['LmRuntime'] = (end-start)/60
print('time=',(end-start)/60)

# Ensemble

### Threshold

In [None]:
showFig = False
showGT = False
# Load the per-run test pickle (presumably written by sample.py in the LM
# cell; confirm). Opened via a context manager so the handle is closed.
with open( './data/'+dataset+'/test/'+''.join(savePath)+'.p', "rb" ) as f:
  dTest = pickle.load(f)
eva = evaluate.EVA(test=test, dTest=dTest,nameData=nameData, labelSeg=labelSeg, direction=direction)
# From here on `test` is the frame held by the evaluator.
test = eva.test
split = 0
# With split = 0 this is all of eva.dTest.
eDev = eva.dTest[int(len(eva.dTest)*split):]
precision = 3
# Candidate thresholds: a grid from 10**-precision up to (but excluding) 1.
thresholds = np.arange(round(0.1**precision,precision),1,round(0.1**precision,precision))
# Histogram of nextP using the candidate thresholds as bin edges.
thresholdValues, _ = np.histogram(eDev['nextP'].values.tolist(),bins=thresholds, density=False)
# Constant rescaling; it does not move the argmax below, only the plotted curve.
thresholdValues = thresholdValues/1000
mid = int((len(thresholds)+1)/2)
offset = int((len(thresholds)+1)*0.05)
# t1 / t0: position of the histogram peak inside an upper and a lower window,
# mapped back to a threshold value.
# NOTE(review): the upper bound 150 in the t0 window is hard-coded and looks
# tied to precision = 3; confirm before changing precision.
t1 = (2+np.argmax(thresholdValues[mid+offset-2:-offset-2])-2+mid+offset)/(len(thresholds)+1) #[550,950]
t0 = (2+np.argmax(thresholdValues[offset-2:150-2])-2+offset)/(len(thresholds)+1) #[50,150]
info['t1'] = t1
info['t0'] = t0
print('t1=', t1,'t0=', t0)

# Fixed thresholds for pku/msr.
# NOTE(review): info['t1'] / info['t0'] above still hold the histogram
# estimates, not these overrides; confirm that is intended.
if dataset == 'pku' or dataset == 'msr':
  t1=0.3
  t0 = 0.01
if showFig:
  fig, ax1 = plt.subplots()
  ax2 = ax1.twinx()
  ax2.plot(thresholds[1:], thresholdValues,color='red',lw=1,label='Probability Density Function', alpha=0.7)
  if showGT:
    # Grid-search statistics pickle for this direction/version.
    with open('./data/'+dataset+'/'+direction+'V'+str(version)+'GS.p', 'rb' ) as f:
      dStats = pickle.load(f)
    ax1.plot(thresholds, dStats['F1List'],color='green',lw=1)
    ax1.vlines(dStats['bestThreshold'], 0, 1,lw=2, colors='black',linestyles='dotted',label='The Best Threshold')
  plt.show()

In [None]:
def FW(method='Stats'):
  """Evaluate threshold-based predictions and record the outcome in `info`.

  Args:
    method: which thresholds to evaluate.
      'Stats'   - the single threshold `t1`, with split=0.
      'GS'      - the full grid from 10**-precision up to (excluding) 1,
                  with split=0.
      'GSSplit' - the same grid, with split=0.2.

  Reads the notebook globals t1, precision, test, dTest, direction, dataset,
  version and nameData.

  Side effects:
    'Stats' stores the results in info['Stats'].
    'GS' stores the results in info['GS'] and their last element in
    info['bestThreshold'].
    'GSSplit' stores nothing.
    The method name and results are printed in every case.

  Raises:
    ValueError: if `method` is not one of the three names above. Without
      this check an unknown method only failed later on unbound locals.
  """
  if method not in ('Stats', 'GS', 'GSSplit'):
    raise ValueError(
        "unknown method {!r}; expected 'Stats', 'GS' or 'GSSplit'".format(method))

  if method == 'Stats':
    thresholds = [t1]
  else:
    step = round(0.1**precision,precision)
    thresholds = np.arange(step,1,step)
  split = 0.2 if method == 'GSSplit' else 0

  eva = evaluate.EVA(test=test, dTest=dTest, direction=direction, savePath=dataset+'/'+direction+'V'+str(version),nameData=nameData) #nameData=nameData, labelSeg=labelSeg
  results = eva.reportByThresholds(thresholds, split=split, verbose=2, ignore=2)

  if method == 'GS':
    info['bestThreshold'] = results[-1]
    info['GS'] = results
  elif method == 'Stats':
    info['Stats'] = results

  print(method, results)
  

# Evaluate the histogram-derived threshold t1 first, then the full grid search;
# each call prints its results and records them in `info`.
FW(method='Stats')
FW(method='GS')

### Ensemble-Train

In [None]:
# Pseudo-labelled ensemble: rows with nextP >= t1 are labelled 1, rows with
# nextP <= t0 are labelled 0; T small classifiers are trained on balanced
# random subsets of those rows and their predictions over all of eDev are
# averaged.
thresholds1 = [t1]
thresholds0 = [t0]
boost=True

start = time.time()
split = 0

x_test = np.stack(eDev['hidden'].values)
x_size = len(eDev.iloc[0]['hidden'])

# T = number of models to average; sampleFrac = share of the smaller class
# drawn for each model. (Kept in its own name so the dataset-level `frac`
# from the set-up cell is not overwritten.)
if boost:
  T=100
  sampleFrac = 0.1
else:
  T=1
  sampleFrac = 1

# [results, params] of the best configuration so far. The sentinel has a
# params slot too, so best[1] below is valid even if nothing beats it.
best = [((0,),), None]

for t1 in thresholds1:

  # .assign() returns a new frame, so no column is written onto a filtered
  # slice of eDev (which triggers pandas' SettingWithCopyWarning).
  dTest1 = eDev[eDev['nextP']>=t1].assign(next=1)
  len1 = len(dTest1)

  for t0 in thresholds0:

    dTest0 = eDev[eDev['nextP']<=t0].assign(next=0) #& (eDev['nextP']>0)
    len0 = len(dTest0)

    # Balanced classes: the same number of rows from each side.
    n0 = int(min(len1,len0)*sampleFrac)
    n1 = n0

    predList = []
    for i in range(T):

      # Named boostTrain so the dataset-level `train` frame from the set-up
      # cell is not clobbered. random_state=i makes each draw reproducible.
      boostTrain = pd.concat([dTest0.sample(n=n0, random_state=i),
                              dTest1.sample(n=n1, random_state=i)])

      x_train = np.stack(boostTrain['hidden'].values)
      y_train = np.stack(boostTrain['next'].values)

      model = getModel(x_size, x_train, y_train)
      predList.append(model.predict(x_test))
      del model  # drop the reference before building the next model

    # Average over the T models. predict() yields one column per output
    # unit, so flatten to 1-D before storing it as a DataFrame column.
    eDev = eDev.assign(next=np.mean(predList, axis=0).ravel())
    eva.dTest = eDev.copy()

    thresholds = [0.5]

    results = eva.reportByThresholds(thresholds, split=split, verbose=2, ignore=2)
    end = time.time()
    print('time=',(end-start)/60)
    params = [t1, t0, len1, len0, n1, n0, sampleFrac, boost]
    print('results={}, params={}'.format(results, params))

    if results[0][0] > best[0][0][0]:
      best = [results,params]

info['bestRuntime'] = (end-start)/60
info['best'] = [best[0], best[1]]
print('best: results={}, params={}'.format(best[0], best[1]))

### save and final report

In [None]:
## hyperparameters and results
# Saved under info/ (the path the run's info pickle is loaded from earlier).
# Previously this was dumped to the test/ path below and overwritten by
# dEval a few lines later, so the final info never reached disk.
with open('./data/'+dataset+'/info/'+''.join(savePath)+'.p', "wb" ) as f:
  pickle.dump(info, f)
print(info)
## error analysis
dEval = eva.showSeg('predIgnore')
# NOTE(review): this is the same path dTest is loaded from in the Threshold
# cell, so that file is replaced by dEval here; confirm this is intended.
with open('./data/'+dataset+'/test/'+''.join(savePath)+'.p', "wb" ) as f:
  pickle.dump(dEval, f)
dEval

In [None]:
# Minutes elapsed since startTotal was recorded in the set-up cell.
endTotal = time.time()
totalMinutes = (endTotal - startTotal) / 60
print('totalTime=', totalMinutes)