In [24]:
import cynet.cynet as cn
import pandas as pd
import numpy as np
import os
from glob import glob

## Process Event Log File and Generate Split and Triplet

### 1. Parameter setting

#### a. File
In the following cell, we specify the event log input file as `LOGFILE`. Since the `STOREFILE` is only used internally, we can just name it after the `LOGFILE`. 

In [14]:
# Event log that the pipeline reads.
LOGFILE = '/project2/ishanu/YI_terror/data/terror.csv'

# Internal store, named after the log file: the part of the log's base name
# before its first '.', with a '.p' suffix, placed under the `ntb` directory.
# In this case, STOREFILE = '/project2/ishanu/YI_terror/ntb/terror.p'
STOREFILE = os.path.join(
    '/project2/ishanu/YI_terror/ntb',
    '{}.p'.format(os.path.basename(LOGFILE).partition('.')[0]))

#### b. Spatial Range and Discretization

In the following cell, we specify the tiles used for spatial discretization.
We cut latitude (longitude) between `lat_min` and `lat_max` (`lon_min` and `lon_max`) into `lat_eps` (`lon_eps`) equal parts. Each tile is one longitude step wide and one latitude step high.

In [3]:
# Names of the event-log columns holding coordinate 1 and coordinate 2.
coord1 = 'latitude'
coord2 = 'longitude'

# Bounding box of the study area and the number of equal parts per axis.
lat_min, lat_max, lat_eps = -4, 49, 50
lon_min, lon_max, lon_eps = -16, 84, 50

# Tile edges along each axis: `*_eps` parts need `*_eps + 1` edges.
# Edges are rounded to 5 decimals.
lat = np.linspace(lat_min, lat_max, lat_eps + 1).round(decimals=5)
lon = np.linspace(lon_min, lon_max, lon_eps + 1).round(decimals=5)

# One [lat_low, lat_high, lon_low, lon_high] entry per tile, produced by
# pairing consecutive edges; latitude is the outer loop, longitude the inner.
tiles = [
    [lat_low, lat_high, lon_low, lon_high]
    for lat_low, lat_high in zip(lat[:-1], lat[1:])
    for lon_low, lon_high in zip(lon[:-1], lon[1:])
]

#### c. Time Range and Discretization

In [4]:
# Names of the event-log columns holding the year, month, and day.
year = 'iyear'
month = 'imonth'
day = 'iday'

# First date, last date, and frequency string of the time discretization.
init_date = '2012-01-01'
end_date = '2016-12-31'
freq = 'D'

#### d. Event
If a time series has an event frequency less than `threshold`, discard the time series.

In [5]:
# Attack-type labels (values of the `attacktype1_txt` column) that are grouped
# together for each categorical event series.
befia_types = [
    'Bombing/Explosion',
    'Facility/Infrastructure Attack',
]
aahhh_types = [
    'Armed Assault',
    'Assassination',
    'Hijacking',
    'Hostage Taking (Barricade Incident)',
    'Hostage Taking (Kidnapping)',
]

# One entry per event series. Each entry gives the source column, the selection
# rule (`value_limits` or `types`), the frequency threshold, and the prefix
# of the CSV file written for that series.
event_dict = dict(
    number_of_kills=dict(
        col_name='nkill',
        value_limits=[0, 10000],
        threshold=0.025,
        csvname_prefix='NKILL',
    ),
    BEFIA=dict(
        col_name='attacktype1_txt',
        types=[befia_types],
        threshold=0.025,
        csvname_prefix='BEFIA',
    ),
    AAHHH=dict(
        col_name='attacktype1_txt',
        types=[aahhh_types],
        threshold=0.025,
        csvname_prefix='AAHHH',
    ),
)

### 2. Generating Time Series for Training and Test

#### a. Time Series for the Number of kills
Our first fit is `S0` for time series of number of kills. 
Essentially, we are looking for tiles that meet a certain number of kills (deaths in the column `nkill`). 
We are looking for tiles with number of kills that are greater than a certain `threshold`. 
Here that `threshold` is $0.025$.
A file named `NKILL.csv` is written as output. 
And, more importantly, the internal timeseries dataframe is changed.

In [None]:
# Settings for the number-of-kills series.
nkill_cfg = event_dict['number_of_kills']

# Keyword arguments grouped by concern; they are merged into one call below.
file_kwargs = dict(log_file=LOGFILE, log_store=STOREFILE)
spatial_kwargs = dict(coord1=coord1, coord2=coord2, grid=tiles)
temporal_kwargs = dict(
    year=year, month=month, day=day,
    init_date=init_date, end_date=end_date, freq=freq)
event_kwargs = dict(
    EVENT=nkill_cfg['col_name'],
    value_limits=nkill_cfg['value_limits'],
    threshold=nkill_cfg['threshold'])

S0 = cn.spatioTemporal(
    **file_kwargs, **spatial_kwargs, **temporal_kwargs, **event_kwargs)

# Fit with the series' CSV prefix ('NKILL').
S0.fit(csvPREF=nkill_cfg['csvname_prefix'])

**Note** that from now on we use the tiles selected in `S0`. 

In [8]:
# Overwrite the full grid built earlier with the grid returned by S0;
# the S1 and S2 cells below pass this `tiles` as their `grid`.
tiles = S0.getGrid()

100%|██████████| 2500/2500 [47:54<00:00,  1.15s/it]


#### b. Time Series for Bombing/Explosion and Facility/Infrastructure Attack
`S1` will be our fitting for attack types in the categories 
 - `Bombing/Explosion` and 
 - `Facility/Infrastructure Attack`.

We are counting the number of these types of events that happen in these tiles.
Output is written to `BEFIA.csv`, which contains the timeseries for those types of attacks in the selected tiles.

In [14]:
# Settings for the Bombing/Explosion + Facility/Infrastructure Attack series.
befia_cfg = event_dict['BEFIA']

# Keyword arguments grouped by concern; they are merged into one call below.
# No `log_file` is given here, only the `log_store`.
spatial_kwargs = dict(coord1=coord1, coord2=coord2, grid=tiles)
temporal_kwargs = dict(
    year=year, month=month, day=day,
    init_date=init_date, end_date=end_date, freq=freq)
event_kwargs = dict(
    EVENT=befia_cfg['col_name'],
    types=befia_cfg['types'],
    threshold=befia_cfg['threshold'])

S1 = cn.spatioTemporal(
    log_store=STOREFILE, **spatial_kwargs, **temporal_kwargs, **event_kwargs)

# Fit with the series' CSV prefix ('BEFIA').
S1.fit(csvPREF=befia_cfg['csvname_prefix'])

100%|██████████| 2500/2500 [49:36<00:00,  1.19s/it]


#### c. Time Series for Armed Assault, Assassination, Hijacking, and Hostage Taking
`S2` fits for the attack types:
 - `Armed Assault`, 
 - `Hostage Taking (Barricade Incident)`, 
 - `Hijacking`, 
 - `Assassination`,
 - `Hostage Taking (Kidnapping)`.

Output is written to `AAHHH.csv`.

In [23]:
# Settings for the Armed Assault / Assassination / Hijacking / Hostage Taking
# series.
aahhh_cfg = event_dict['AAHHH']

# Keyword arguments grouped by concern; they are merged into one call below.
# No `log_file` is given here, only the `log_store`.
spatial_kwargs = dict(coord1=coord1, coord2=coord2, grid=tiles)
temporal_kwargs = dict(
    year=year, month=month, day=day,
    init_date=init_date, end_date=end_date, freq=freq)
event_kwargs = dict(
    EVENT=aahhh_cfg['col_name'],
    types=aahhh_cfg['types'],
    threshold=aahhh_cfg['threshold'])

S2 = cn.spatioTemporal(
    log_store=STOREFILE, **spatial_kwargs, **temporal_kwargs, **event_kwargs)

# Fit with the series' CSV prefix ('AAHHH').
S2.fit(csvPREF=aahhh_cfg['csvname_prefix'])

100%|██████████| 2500/2500 [46:34<00:00,  1.12s/it]


### 3. Generate Triplet for Training and Split for Testing
Now we use the csv files created in previous steps (listed in `CSVfiles`) to generate the triplet files for training and split files for testing. 

 - The triplet files are generated with `readTS`.
    The training period is defined by `begin` and `end`. 
 - The split files are generated with `splitTS`. 
    The split files contain data from `begin` to `extended_end`. 
    The data for testing are those beyond `end` and before `extended_end`.
    Here we set the `extended_end` to be one year beyond the `end`.

In [15]:
# One CSV per event series, named '<csvname_prefix>.csv', in `event_dict` order.
CSVfiles = [cfg['csvname_prefix'] + '.csv' for cfg in event_dict.values()]

# Training runs from `begin` to `end`; the split files extend to `extended_end`.
begin, end, extended_end = init_date, '2015-12-31', end_date

# Make sure the triplet folder and split folder exist.
# `os.makedirs(..., exist_ok=True)` is used instead of `os.mkdir` so that
# re-running this cell does not raise FileExistsError and missing parent
# directories are created as well.
for out_dir in ('/project2/ishanu/YI_terror/ntb/triplet/',
                '/project2/ishanu/YI_terror/ntb/split/'):
    os.makedirs(out_dir, exist_ok=True)

In [17]:
# Triplet: training files covering `begin` .. `end`, written under the
# path prefix '<triplet dir>/TERROR_<begin>_<end>'.
triplet_fnames_prefix = '/project2/ishanu/YI_terror/ntb/triplet/TERROR_{}_{}'.format(begin, end)
cn.readTS(CSVfiles, csvNAME=triplet_fnames_prefix, BEG=begin, END=end)

# Split: files covering `begin` .. `extended_end`, written into
# `split_dirname` with the file-name prefix '<begin>_<extended_end>_'.
split_dirname = '/project2/ishanu/YI_terror/ntb/split/'
split_prefix = '_'.join([begin, extended_end, ''])
cn.splitTS(
    CSVfiles,
    BEG=begin,
    END=extended_end,
    dirname=split_dirname,
    prefix=split_prefix)

####  Optional cleanup of out-of-use files.

In [None]:
# Delete the intermediate per-series CSV files first, then the internal store.
for stale_path in CSVfiles + [STOREFILE]:
    os.remove(stale_path)

## Model Generation
Now that we have the training and testing data ready, it is time to create the models.

**Input and Output of this step**
 - Input: training data (the triplet files) produced by `readTS`;
 - Output: model JSON files, each of which represents a model.

### 1. Parameter setting

**Note:** It is highly recommended that we use absolute paths.

**Explanations:**
 - `PARTITION`: Since we work with event counts, a single partitioning point at $0.5$ maps "no event" to $0$ and "one or more events" to $1$.
 - `RUN_LOCAL`: 
     - If `False`, `xgModels` will produce a list of calls `program_calls.txt` that needs to be run to produce the models.
     - If `True`, `xgModels` will generate models locally. 
 - `NUM_RERUNS`: Since `XgenESeSS` is random, we usually run it several times to get the averaged result.
 - `XgenESeSS`: The location of the `XgenESeSS` binary.
     - it only works on Linux.

In [19]:
# File parameters
TS_PATH = triplet_fnames_prefix + '.csv' # The time series (data only)
NAME_PATH = triplet_fnames_prefix + '.coords' # The names for each time series
FILEPATH = '/project2/ishanu/YI_terror/ntb/models/' # Folder for the models; created at the end of this cell
LOG_PATH = 'log.txt'

# XgenESeSS parameters
BEG = 1  # minimum delay considered
END = 60 # maximum delay considered
NUM_RERUNS = 2 # number of reruns
PARTITION = [.5] # partitioning points. 
XgenESeSS = '/project2/ishanu/YI_terror/bin/XgenESeSS' # location of the XgenESeSS binary
RUN_LOCAL = False

# Make sure the `models` folder exists.
# `os.makedirs(..., exist_ok=True)` is used instead of `os.mkdir` so that
# re-running this cell does not raise FileExistsError and missing parent
# directories are created as well.
os.makedirs(FILEPATH, exist_ok=True)

### 2. Running `xgModels` to generate model or model generating calls

In [20]:
# Positional arguments for `cn.xgModels`, in the order the call expects:
# input files, log path, output folder, delay range, reruns, partition,
# binary location, and the local-run switch.
xg_args = (
    TS_PATH, NAME_PATH, LOG_PATH, FILEPATH,
    BEG, END, NUM_RERUNS, PARTITION,
    XgenESeSS, RUN_LOCAL,
)
XG = cn.xgModels(*xg_args)

# Run with four workers.
XG.run(workers=4)

## Model Evaluation
Here we evaluate our models by the AUC of their predictions. 

The inner working of `run_pipeline`:
1. It first selects `model_nums` number of models either by gamma or distance. 
    Then it creates a model_sel json file which is a filtered version of the models.
1. It applies the `cynet` binary to the model_sel files, which generates a log file containing predictions.
1. It applies the `flexroc` binary to the log files, once for each target type.
1. Finally, it writes test statistics (AUC, fpr, and tpr) and outputs a `res_all.csv` file.

### 1. Parameter setting

**Explanation:**
 - `RUNLEN`: number of time steps in training and testing;
 - `FLEX_TAIL_LEN`: number of time steps in testing;
 - `model_nums`: maximum number of models to use in prediction;
 - `horizon`: prediction horizon;
 - `VARNAME`: the predicting variable types;
    Here we use individual variable types and `ALL` meaning all types of predicting variables are used together.
 - `gamma`: If `gamma` is true, the models are sorted with gamma (coefficient of causal dependence) and the best `model_nums` models will be used in the prediction;
 - To sort models by distance, use `distance=True` instead of `gamma=True` in `run_pipeline`;
 - `cores`: Number of cores running in parallel. 

In [28]:
# File parameters
MODEL_GLOB = '/project2/ishanu/YI_terror/ntb/models/*model.json'
RESPATH = '/project2/ishanu/YI_terror/ntb/models/*model*res'
DATA_PATH = os.path.join(split_dirname, split_prefix) # the split files path prefix 

# Prediction parameters
# Number of time steps from `begin` through `extended_end`.
RUNLEN = len(pd.date_range(start=begin, end=extended_end, freq=freq))
# NOTE(review): pd.date_range includes both endpoints by default, so this
# count includes `end` itself (the last training day) -- confirm that this
# is the tail length run_pipeline expects.
FLEX_TAIL_LEN = len(pd.date_range(start=end, end=extended_end, freq=freq))
model_nums = [20]
horizon = 7
# Variable types are the text after the last '#' in each split file name.
# They are de-duplicated and sorted so the order is the same in every kernel
# session (a plain `list(set(...))` order can change between sessions);
# 'ALL' is appended last.
VARNAME = sorted({fname.split('#')[-1] for fname in glob(DATA_PATH + "*")}) + ['ALL']

# Running parameters
# Make sure you have multi-core access when using cores greater than 1. 
cores = 4

### 2. Run prediction pipeline

In [29]:
# Positional arguments for `cn.run_pipeline`, in the order the call expects.
pipeline_args = (
    MODEL_GLOB, model_nums, horizon, DATA_PATH,
    RUNLEN, VARNAME, RESPATH,
)
# Keyword options: test-tail length, core count, and model selection by gamma.
pipeline_options = dict(FLEX_TAIL_LEN=FLEX_TAIL_LEN, cores=cores, gamma=True)

cn.run_pipeline(*pipeline_args, **pipeline_options)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 305 out of 305 | elapsed: 34.5min finished


Let us see a summary of the AUCs.

In [35]:
# Load the pipeline results from the working directory.
res = pd.read_csv('res_all.csv')

# Keep rows whose source variable is 'ALL' and whose AUC is below 0.999,
# then summarize the AUC column.
uses_all_sources = res['varsrc'] == 'ALL'
auc_below_cap = res['auc'] < .999
res.loc[uses_all_sources & auc_below_cap, 'auc'].describe()

count    301.000000
mean       0.789952
std        0.057548
min        0.318740
25%        0.761022
50%        0.789685
75%        0.818124
max        0.933031
Name: auc, dtype: float64