In [9]:
import sys
import os.path
import csv
import random
import glob
import subprocess
import os
path = '../cynet'
sys.path.append(path)
from cynet import cynet as cn
from tqdm import tqdm
import pandas as pd

# Fail fast on a fresh kernel: later cells copy and read the 'models'
# directory, which this notebook does not create itself.
assert os.path.isdir('models'), "Make sure complete_notebook.ipynb is run first"


In [2]:
def pertub_file(file,newfile,theta=0.1,negative=False):
    '''
    Write a randomly perturbed copy of a split file.

    The split file is read as space-delimited rows of integer event counts
    (typically a single row with many columns). Every eligible cell is
    flipped independently with probability theta:
      - positive perturbation (negative=False): cells equal to '0' become 1;
      - negative perturbation (negative=True): cells whose value is > 0
        become 0.
    Cells that are not eligible are copied through unchanged. This includes
    the empty fields that trailing or repeated spaces produce.

    Inputs:
        file(str)- name of the original split file
        newfile(str)- name of the new file to be written out.
        theta(float)- probability that an eligible cell is flipped.
        negative(bool)- if True, turn positive events into zeros; otherwise
            turn zero events into ones.
    '''
    if negative:
        replacement = 0

        def eligible(token):
            # Empty fields (e.g. from a trailing space) are not events; without
            # this guard int('') raises ValueError. Any other non-integer
            # token still raises, so malformed files are not silently accepted.
            return token.strip() != '' and int(token) > 0
    else:
        replacement = 1

        def eligible(token):
            return token == '0'

    # newline='' is what the csv module asks for on both reader and writer
    # files, so line endings are handled by csv itself.
    with open(file, newline='') as csvfile, \
            open(newfile, 'w', newline='') as newcsvfile:
        reader = csv.reader(csvfile, delimiter=' ')
        writer = csv.writer(newcsvfile, delimiter=' ')
        for row in reader:
            for n, token in enumerate(row):
                # A random number is drawn only for eligible cells.
                if eligible(token) and random.uniform(0, 1) < theta:
                    row[n] = replacement
            writer.writerow(row)


def alter_splitfiles(globpath, new_dir, theta=0.1,negative=False):
    '''
    Perturb every split file matching a glob pattern into a new directory.

    Each matching file is passed to pertub_file and the perturbed copy is
    written to new_dir under the same base name.
    Inputs:
        globpath(str)- glob pattern matching the split files to perturb.
        new_dir(str)- directory to send files to (with or without a
            trailing separator).
        theta(float)- probability that an eligible event is flipped.
        negative(bool)- Whether to do a negative perturbation.
    '''
    # glob returns matches in arbitrary order. Sorting makes the files consume
    # the random stream in a stable order, so a seeded run is reproducible.
    for file in sorted(glob.glob(globpath)):
        # os.path.join/basename work whether or not new_dir ends with a
        # separator and do not assume '/' is the path separator.
        newfile_name = os.path.join(new_dir, os.path.basename(file))
        pertub_file(file, newfile_name, theta=theta, negative=negative)

def getEventFreq(filename, threshold):
    '''
    Load a CSV file and keep only the rows flagged as predicted events.

    Inputs:
        filename(str)- path of the CSV file; it must have a 'predictions'
            column.
        threshold- accepted for interface compatibility; not used here.
    Returns:
        DataFrame holding the rows whose 'predictions' value equals 1,
        with their original index labels.
    '''
    frame = pd.read_csv(filename)
    is_event = frame['predictions'] == 1
    return frame.loc[is_event]

def getChange(logfile, logfileP, threshold):
    '''
    Relative change in the number of predicted events between a baseline
    file and its perturbed counterpart.

    Inputs:
        logfile(str)- baseline file, read through getEventFreq.
        logfileP(str)- perturbed file, read through getEventFreq.
        threshold- forwarded to getEventFreq.
    Returns:
        0.0 when both files hold the same number of events, otherwise
        (perturbed - baseline) / baseline as a float.
    '''
    baseline_count = len(getEventFreq(logfile, threshold).index)
    perturbed_count = len(getEventFreq(logfileP, threshold).index)

    if baseline_count == perturbed_count:
        return 0.0

    # A zero baseline is replaced by 1 to avoid dividing by zero.
    # NOTE(review): the substitution also changes the numerator, so going
    # from 0 to k events is reported as k-1 rather than k. Confirm this is
    # the intended convention.
    if baseline_count == 0 and perturbed_count > 0:
        baseline_count = 1

    return (perturbed_count - baseline_count) / float(baseline_count)

In [3]:
# Pipeline parameters

# The two target variables. Their names are also the suffix used to select
# the split files to perturb and the baseline CSV files to compare against.
var1 = 'Armed_Assault-Assassination-Hijacking-Hostage_Taking_Barricade_Incident-Hostage_Taking_Kidnapping'
var2 = 'Bombing_Explosion-Facility_Infrastructure_Attack'
targets = [var1, var2]
var_dic = dict(zip(targets, ('v1', 'v2')))

# Folder layout: baseline inputs and the scratch area of the perturbation runs.
models_folder = "models"
baseline_dir = 'models'
split_path = "./split"
master_folder = "./perturbtion_temp"
MODEL_GLOB = 'models/*model.json'
suffix = "2012-01-01_2016-12-31_"

# Signed perturbation levels: the sign selects a negative perturbation and
# the magnitude is used as the flip probability.
percentages = [-0.10, 0.10]

# Settings handed to the cynet calls.
model_nums = [20]
horizon = 7
RUNLEN = 1827
FLEX_TAIL_LEN = 366
threshold = 0.85

# One (initially empty) result table per target variable.
result = dict((name, pd.DataFrame()) for name in targets)

In [4]:
# Seed the module-level RNG that pertub_file draws from, so that a fresh
# "Restart & Run All" regenerates the same perturbed split files.
PERTURB_SEED = 0
random.seed(PERTURB_SEED)

# Work on a copy of the trained models. The scratch folder is created first:
# `cp -r SRC DST` only produces DST/<SRC name> when DST already exists;
# otherwise SRC is copied *as* DST and the model glob below matches nothing.
os.makedirs(master_folder, exist_ok=True)
subprocess.call('cp -r {} {}'.format(models_folder, master_folder), shell=True)
new_model_folder = os.path.join(master_folder, MODEL_GLOB)
# Folder holding the copied models. The pipeline's log and intermediate files
# are collected from here after each run.
temp_models_dir = os.path.dirname(new_model_folder)

# Scratch directories shared by every (p1, p2) combination.
log_dir = master_folder+'/logs/'
new_dir = '{}/split/'.format(master_folder)
os.makedirs(log_dir, exist_ok=True)
os.makedirs(new_dir, exist_ok=True)

for p1 in percentages:
    for p2 in percentages:
        # The sign of the percentage selects the perturbation direction.
        neg1 = p1 < 0.0
        neg2 = p2 < 0.0

        # Copy over the split files. Might not be necessary (if there are only two variables)
        subprocess.call('cp {}/* {}'.format(split_path,new_dir), shell=True)

        # Alter the split files of both variables. The originals in split_path
        # are read and the copies in new_dir are overwritten.
        alter_splitfiles("{}/*{}".format(split_path,var1),
            new_dir, theta=abs(p1),negative=neg1)
        alter_splitfiles("{}/*{}".format(split_path,var2),
            new_dir, theta=abs(p2),negative=neg2)

        cn.run_pipeline(
            new_model_folder,
            model_nums,
            horizon,
            new_dir +'/' + suffix,
            RUNLEN,
            ['ALL'],
            new_model_folder,
            FLEX_TAIL_LEN=FLEX_TAIL_LEN,
            cores=1,
            gamma=True)

        # Move the new log files.
        subprocess.call('mv {}/*.log {}'.format(temp_models_dir, log_dir), shell=True)
        # Get rid of intermediate files
        subprocess.call('rm {}/*model_sel*'.format(temp_models_dir), shell=True)
        subprocess.call('rm {}/*.res'.format(temp_models_dir), shell=True)

        cn.peturbation_parallel(
            'models/',
            log_dir,
            f'*{model_nums[0]}models#ALL.log',
            tpr_threshold=threshold,
            fpr_threshold=None,
            FLEX_TAIL_LEN=FLEX_TAIL_LEN,
            cores=1)

        for VAR in targets:
            # Glob path for baseline files. Each one is matched with the
            # perturbed file of the same name in log_dir.
            glob_string = '*ALL#' + VAR + '.csv'
            baseline_glob = baseline_dir +"/"+ glob_string
            baseline_files = glob.glob(baseline_glob)

            # Accumulate the change of each baseline/perturbed pair. The
            # builtin `sum` is deliberately neither shadowed nor called, so
            # this cell also works in a kernel where an earlier version of
            # it rebound that name.
            total_change, n_pairs = 0, 0
            for basefile in baseline_files:
                perturb_file = log_dir + os.path.basename(basefile)
                total_change += getChange(basefile, perturb_file, threshold)
                n_pairs += 1

            if n_pairs == 0:
                # Without this check the average below fails with a bare
                # ZeroDivisionError that hides the actual cause.
                raise FileNotFoundError(
                    "No baseline files match '{}'; cannot compute the "
                    "average change for {}".format(baseline_glob, VAR))

            # Get the average change.
            result[VAR].loc[var1+"_"+str(p1), var2+"_"+str(p2)] = (total_change/n_pairs)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:   48.0s finished


43 Pairs found


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:   32.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:   46.6s finished


43 Pairs found


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:   32.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:   47.0s finished


43 Pairs found


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:   33.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:   48.1s finished


43 Pairs found


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:   32.6s finished


In [None]:
# Write one CSV per target variable into the current working directory.
for var in result:
    data = result[var]
    data.to_csv(f'{var}_perturb_results.csv')