# Terror5 - Preprocessing
### Step 1. Load data
Load the CSV file into a DataFrame. The file is read with `encoding='latin1'`, and `low_memory=False` is set because some columns contain mixed types.
Columns (4,61,62,66,116,117,123) have mixed types.

In [6]:
%matplotlib inline
import pandas as pd
gtd = pd.read_csv('gtd.csv', encoding='latin1', low_memory=False)
gtd.columns

Index(['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
       'resolution', 'country', 'country_txt', 'region',
       ...
       'addnotes', 'scite1', 'scite2', 'scite3', 'dbsource', 'INT_LOG',
       'INT_IDEO', 'INT_MISC', 'INT_ANY', 'related'],
      dtype='object', length=137)

**SLOW!** Save the code-to-text mappings of the `_txt` columns to a CSV file. The code is commented out because iterating over every row takes a long time.

In [7]:
import csv
from tqdm import tqdm_notebook

# NOTE: the code below is disabled (commented out); running this cell only
# performs the two imports above.
# When enabled, it builds one dict per column whose name ends in '_txt', keyed
# by the column name without that suffix, and fills it with
# int(code) -> text label pairs by walking the DataFrame row by row
# (rows with a null code are skipped). The (name, dict) pairs are then
# written as rows to 'mapping.csv'.

# # Mapping for _txt columns
# txt_maps = {col[:-4]:{} for col in gtd.columns if col[-4:]=='_txt'}
# for index, row in tqdm_notebook(gtd.iterrows(), desc='Rows'):
#     if index%5000 == 0:
#         print('.', end='')
#     for key in txt_maps:
#         if pd.notnull(row[key]):
#             txt_maps[key][int(row[key])] = row[key+'_txt']

# # Save maps to CSV
# with open('mapping.csv', 'w', newline='') as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerows(txt_maps.items())

### Step 2. Preprocessing
**a)** Columns to keep

In [8]:
# Columns selected for the cleaned dataset. The selection itself is currently
# disabled (see the commented-out line below), so all columns are still kept.
keep_cols = [
    'gname',
    'iyear',
    'country',
    'region',
    'attacktype1',
]
# gtd = gtd[keep_cols]

**b)** Rows to keep: all groups with at least *n_attacks* attacks, excluding the 'Unknown' group

In [9]:
from collections import Counter

# Minimum number of rows (attacks) a group needs in order to be kept.
n_attacks = 5

# Number of rows per group name.
group_count = Counter(gtd['gname'])

# Keep groups with at least n_attacks rows and drop the 'Unknown' placeholder.
# 'Unknown' is excluded inside the comprehension rather than with
# list.remove(), which raises ValueError when the value is absent. That
# happens if 'Unknown' has fewer than n_attacks rows, and always when this
# cell is re-run, because gtd has already been overwritten without those rows.
keep_groups = [
    group
    for group, counter in group_count.items()
    if counter >= n_attacks and group != 'Unknown'
]

# Restrict the frame to rows whose group survived the filter above.
gtd = gtd[gtd['gname'].isin(keep_groups)]

In [10]:
# Persist the filtered frame as UTF-8. `index` is left at its default, so the
# DataFrame index is written as the first (unnamed) column of the file.
gtd.to_csv(
    path_or_buf='gtd_clean.csv',
    encoding='utf-8',
)