In [1]:
!pip install datasets transformers rouge-score nltk wandb accelerate scikit-learn --upgrade
!apt install git-lfs

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |################################| 325 kB 21.8 MB/s eta 0:00:01
[?25hCollecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |################################| 4.0 MB 32.1 MB/s eta 0:00:01
[?25hCollecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Collecting nltk
  Downloading nltk-3.6.7-py3-none-any.whl (1.5 MB)
[K     |################################| 1.5 MB 29.5 MB/s eta 0:00:01
[?25hCollecting wandb
  Downloading wandb-0.12.14-py2.py3-none-any.whl (1.8 MB)
[K     |################################| 1.8 MB 21.4 MB/s eta 0:00:01
[?25hCollecting accelerate
  Downloading accelerate-0.6.2-py3-none-any.whl (65 kB)
[K     |################################| 65 kB 3.8 MB/s eta 0:00:01
[?25hCollecting scikit-learn
  Downloading scikit_learn-0.24.2-cp36-cp36m-manylinux2010_x86_64.whl (22.2 MB)
[K     |################################| 22

In [2]:
import os 
import math

import torch
import nltk
import transformers
import wandb

from tqdm.auto import tqdm
from tqdm.notebook import tqdm as tqdm_n
import numpy as np
import pandas as pd
import torch.nn.functional as F
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate import meteor
from nltk import word_tokenize
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score

from transformers import *
from torch.utils.data import *
from accelerate import Accelerator
from huggingface_hub import *
from datasets import *

HF_TOKEN = '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'
WANDB_TOKEN = '3a3cacf27e02d09574765f66bf5fb73d99dcf716'

os.environ['WANDB_CONSOLE'] = 'off'
os.environ['WANDB_API_KEY'] = WANDB_TOKEN
transformers.logging.set_verbosity_error()

In [3]:
def generate_target_text(df, cols, is_eval=False):
    def to_str(row):
        res = ''
        for col in cols:
            res += f'{col}: {row[col]}, '

        return res[:-2]
    df.columns = [col.lower() for col in df.columns]
    df = df.dropna()
    df.loc[:, 'target_text'] = df.apply(to_str, axis=1)
    df = df[cols + ['notes', 'target_text']]
    
    return df

cols_to_include = [
    # 'event_type',
    # 'sub_event_type',
    'actor1',
    'actor2',
    # 'location',
    # 'fatalities',
]

train = generate_target_text(
    pd.read_csv('data/task_1_information_extraction_train.tsv', sep='\t'),
    cols=cols_to_include,
)
valid = generate_target_text(
    pd.read_csv('data/task_1_information_extraction_valid.tsv', sep='\t'),
    cols=cols_to_include,
)
acled_df = pd.concat([train, valid])
acled_df = acled_df.reset_index(drop=True)
acled = Dataset.from_pandas(acled_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [4]:
acled_df.head(2)

Unnamed: 0,actor1,actor2,notes,target_text
0,Unidentified Armed Group (Pakistan),Civilians (Pakistan),Three people were killed while 27 others injur...,"actor1: Unidentified Armed Group (Pakistan), a..."
1,Military Forces of Somalia (2012-2017),Civilians (Somalia),Government security forces opened fire at a pr...,actor1: Military Forces of Somalia (2012-2017)...


In [6]:
ds = Dataset.from_pandas(acled_df)
ds.push_to_hub('vinaykudari/acled-ie-actors', token=HF_TOKEN)
ds[0]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

{'actor1': 'Unidentified Armed Group (Pakistan)',
 'actor2': 'Civilians (Pakistan)',
 'notes': 'Three people were killed while 27 others injured when a Peshawar-bound train hit a bomb planted by unidentified militants on railway tracks in Tul town in Jacobabad district in Sindh.',
 'target_text': 'actor1: Unidentified Armed Group (Pakistan), actor2: Civilians (Pakistan)'}

In [7]:
model_checkpoint = 't5-base'
proj_name = 't5-acled-ie-a'
ds_identifier = 'vinaykudari/acled-ie-actors'
model_checkpoint = 't5-base'
label_pad_token_id = -100
repo_name = f'vinaykudari/{proj_name}'

create_repo(proj_name, exist_ok=True, token=HF_TOKEN)

'https://huggingface.co/vinaykudari/t5-acled-ie-a'

In [8]:
def optimizer(params, lr):
    return torch.optim.Adam(params, lr)
    
def preprocess_function(samples, tokenizer, max_len=250, summary_len=50):
    l = len(samples['notes'])
    targets = samples['target_text']
    inputs = samples['notes']
    
    inputs = ['summarize: ' + inp for inp in inputs]
    model_inputs = tokenizer(
        inputs,
        max_length=max_len,
        padding='max_length',
        truncation=True,
    )
        
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=summary_len,
            padding='max_length',
            truncation=True,
        )
        
    labels['input_ids'] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

def get_cols(lis, t=True):
    actor1_list = []
    actor2_list = []

    for text in lis:
        actor1 = ''
        actor2 = ''
        
        cols = [txt.strip() for txt in text.split(',')]
        n = len(cols)
        
        for col in cols:
            if 'actor1' in col:
                actor1 = col.split(':')[1].strip()
            elif 'actor2' in col:
                actor2 = col.split(':')[1].strip()

        actor1_list.append(actor1)
        actor2_list.append(actor2)
        
    res = (
        np.array(actor1_list),
        np.array(actor2_list, dtype=object),
    )
        
    return res
        
    
def score(preds, labels):
    pred_actor1, pred_actor2 = get_cols(preds)
    actual_actor1, actual_actor2 = get_cols(labels)
    
    actor1_acc = (actual_actor1 == pred_actor1).mean() * 100
    actor1_f1 = f1_score(
        actual_actor1,
        pred_actor1, 
        average='macro',
    )
    
    actor2_acc = (actual_actor2 == pred_actor2).mean() * 100
    actor2_f1 = f1_score(
        actual_actor2,
        pred_actor2, 
        average='macro',
    )

    res = { 
        'actor1_acc': actor1_acc,
        'actor1_f1': actor1_f1,

        'actor2_acc': actor2_acc,
        'actor2_f1': actor2_f1,
    }
    
    return res

def evaluate(
    model,
    dataloader,
    tokenizer,
    accelerator,
    metric,
    progress_bar,
):
    step_counter = 0
    for step, batch in enumerate(dataloader):
        with torch.no_grad():
            generated_tokens = model.generate(
                batch['input_ids'],
                max_length=wandb_config.SUMMARY_LEN,
                num_beams=wandb_config.NUM_BEAMS,
            )
            generated_tokens = generated_tokens.cpu().numpy()
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]

            
            decoded_preds = tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True,
            )
            
            labels = batch['labels'].cpu().numpy()
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            
            scores = score(decoded_preds, decoded_labels)
            wandb.log(scores)
            
        progress_bar.update(1)
        step_counter += 1
        
        if step_counter > wandb_config.MAX_EVAL_STEPS:
            break
                

def train_one_epoch(
    model, 
    dataloader,
    optimizer,
    accelerator,
    lr_scheduler,
    progress_bar,
    n_epoch,
    tot_steps,
):
    step_count = 0
    
    for step, batch in enumerate(dataloader):
        print(f'.', end='')
        tot_steps += 1
        
        preds = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels'],
        )
        loss = preds.loss
        loss /= wandb_config.GRAD_ACCUM_STEPS
        accelerator.backward(loss)
        
        wandb.log(
            {
                'loss': loss,
                'perplixity': torch.exp(loss),
            },
        )

        if step % wandb_config.GRAD_ACCUM_STEPS == 0 or step == len(dataloader) - 1:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            step_count += 1

        if step_count >= wandb_config.MAX_TRAIN_STEPS:
            break
            
    return tot_steps

def save_push_to_hub(accelerater, commit_msg):
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(proj_name, save_function=accelerator.save)
    if accelerator.is_main_process:
        model.push_to_hub(repo_name)
        tokenizer.push_to_hub(repo_name)

def run(optim, model, dataset, config, tokenizer):         
    torch.manual_seed(wandb_config.SEED)
    np.random.seed(wandb_config.SEED)
    torch.backends.cudnn.deterministic = True
    
    if accelerator.is_main_process:
        repo = Repository(repo_name, clone_from=repo_name)
        
    accelerator.wait_for_everyone()
    
    remove_columns = dataset['train'].column_names
    
    # when trainer model is different from the tokenizer model
    model.resize_token_embeddings(len(tokenizer))
    
    with accelerator.main_process_first():
        tokenized_ds = dataset.map(
            preprocess_function,
            num_proc=8,
            batched=True,
            remove_columns=remove_columns,
            fn_kwargs={
                'tokenizer': tokenizer,
                'max_len': wandb_config.MAX_TEXT_LEN,
                'summary_len': wandb_config.SUMMARY_LEN
            },
            load_from_cache_file=True,
        )
    
    train_dataset = tokenized_ds['train']
    eval_dataset = tokenized_ds['test']
    
    # form a batch by using a list of dataset elements
    data_collator = DataCollatorForSeq2Seq(
        tokenizer, 
        model=model,
        label_pad_token_id=label_pad_token_id,
    )
    
    train_dataloader = DataLoader(
        train_dataset, 
        collate_fn=data_collator,
        batch_size=wandb_config.TRAIN_BATCH_SIZE
    )
    eval_dataloader = DataLoader(
        eval_dataset, 
        collate_fn=data_collator,
        batch_size=wandb_config.EVAL_BATCH_SIZE,
    )
    
    no_decay = ['bias', 'LayerNorm.weight']
    model_params = [
        {
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': wandb_config.WEIGHT_DECAY,
        },
        {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    
    optimizer = optim(model_params, wandb_config.LEARNING_RATE)
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )
    
    # log metrics with wandb
    wandb.watch(model, log='all')
    
    n_update_steps = math.ceil(len(train_dataloader) / wandb_config.GRAD_ACCUM_STEPS)
    if wandb_config.MAX_TRAIN_STEPS is None:
        wandb_config.MAX_TRAIN_STEPS = wandb_config.TRAIN_EPOCHS * n_update_steps
    
    lr_scheduler = get_scheduler(
        name='linear',
        optimizer=optimizer,
        num_warmup_steps=wandb_config.LR_WARM_UP_STEPS,
        num_training_steps=wandb_config.MAX_TRAIN_STEPS,
    )
    
    print('Training')
    metric = load_metric('rouge')
    train_p_bar = tqdm(
        range(wandb_config.MAX_TRAIN_STEPS),
        disable=not accelerator.is_local_main_process,
    )
    eval_p_bar = tqdm(
        range(wandb_config.MAX_EVAL_STEPS),
        disable=not accelerator.is_local_main_process,
    )
    tot_steps = 0
    
    for n_epoch in range(wandb_config.TRAIN_EPOCHS):
        # set model to traning mode
        model.train()
        tot_steps = train_one_epoch(
            model,
            train_dataloader,
            optimizer,
            accelerator,
            lr_scheduler,
            train_p_bar,
            n_epoch, 
            tot_steps,
        )
        print('\nValidation')
#         set model to evaluation mode
        model.eval()
        evaluate(
            model,
            eval_dataloader,
            tokenizer,
            accelerator,
            metric,
            eval_p_bar,
        )
#         save_push_to_hub(accelerator, f'Traning {n_epoch}')

In [10]:
accelerator = Accelerator(fp16=True)

config = AutoConfig.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, config=config)
block_size = tokenizer.model_max_length

acled = load_dataset(ds_identifier, split='train')
acled = acled.train_test_split(0.1)

torch.cuda.empty_cache()

TRAIN_BATCH_SIZE = 20
EVAL_BATCH_SIZE = 20
TRAIN_EPOCHS = 2
SEED = 42             
MAX_TEXT_LEN = 250
SUMMARY_LEN = 50 
WEIGHT_DECAY = 0.0
GRAD_ACCUM_STEPS = 5
NUM_BEAMS = 2
LR_WARM_UP_STEPS = 5
LEARNING_RATE = 1e-4
MAX_TRAIN_STEPS = 150
MAX_EVAL_STEPS = 100

n_epochs = TRAIN_EPOCHS
config = {'n_epochs': n_epochs}
w = wandb.init(project=proj_name, config=config)
wandb_config = wandb.config 


wandb_config.TRAIN_BATCH_SIZE = TRAIN_BATCH_SIZE
wandb_config.EVAL_BATCH_SIZE = EVAL_BATCH_SIZE
wandb_config.TRAIN_EPOCHS = TRAIN_EPOCHS
wandb_config.SEED = SEED          
wandb_config.MAX_TEXT_LEN = MAX_TEXT_LEN
wandb_config.SUMMARY_LEN = SUMMARY_LEN
wandb_config.WEIGHT_DECAY = WEIGHT_DECAY
wandb_config.GRAD_ACCUM_STEPS = GRAD_ACCUM_STEPS
wandb_config.NUM_BEAMS = NUM_BEAMS
wandb_config.LR_WARM_UP_STEPS = LR_WARM_UP_STEPS
wandb_config.LEARNING_RATE = LEARNING_RATE
wandb_config.MAX_TRAIN_STEPS = MAX_TRAIN_STEPS
wandb_config.MAX_EVAL_STEPS = MAX_EVAL_STEPS

run(
    optim=optimizer, 
    model=model,
    dataset=acled,
    config=config,
    tokenizer=tokenizer,
)

w.finish()

Using custom data configuration vinaykudari--acled-ie-actors-7e1252d290873b4b
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/vinaykudari--acled-ie-actors-7e1252d290873b4b/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)
[34m[1mwandb[0m: Currently logged in as: [33mvkudari[0m. Use [1m`wandb login --relogin`[0m to force relogin


Cloning https://huggingface.co/vinaykudari/t5-acled-ie-a into local empty directory.


               

#0:   0%|          | 0/10 [00:00<?, ?ba/s]

#1:   0%|          | 0/10 [00:00<?, ?ba/s]

#2:   0%|          | 0/10 [00:00<?, ?ba/s]

#3:   0%|          | 0/10 [00:00<?, ?ba/s]

#4:   0%|          | 0/10 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/10 [00:00<?, ?ba/s]

#5:   0%|          | 0/10 [00:00<?, ?ba/s]

#7:   0%|          | 0/10 [00:00<?, ?ba/s]

            

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

    

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#6:   0%|          | 0/2 [00:00<?, ?ba/s]

#5:   0%|          | 0/2 [00:00<?, ?ba/s]

#4:   0%|          | 0/2 [00:00<?, ?ba/s]

#7:   0%|          | 0/2 [00:00<?, ?ba/s]

Training


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
Validation
..................................................................................................................................................................................................................................................

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
actor1_acc,▃▆▃▂▄▃▅▅▃▃▂▃▅▅▃█▂▅▃▄▁▄▂▄▃▆▁▄▂█▃▂▁▄▅▂▅▆▅▄
actor1_f1,▃▆▄▂▃▃▄▄▃▃▂▃▅▄▂▅▂▄▃▄▁▄▂▄▃▆▁▃▂█▃▂▂▃▄▂▅▅▅▄
actor2_acc,▃▅▂▆▆▅▃▃▇▂▃▅▆▆▅▇▅▇▅▅▂▃▁▃▁█▃▃▃█▅▅▂▂▆▅▃▇▅▅
actor2_f1,▄▄▃▅▅▄▃▃█▁▄▅▇▅▅▄▅▆▄▄▃▃▁▄▁█▂▃▃█▄▄▃▃▆▅▄▆▅▃
loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▂▂▁▂▂▁▁▁▂▁▁▂▁▁▁▂▁▁▂▁▂▁▂
perplixity,█▄▃▃▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂

0,1
actor1_acc,20.0
actor1_f1,0.12963
actor2_acc,15.0
actor2_f1,0.04021
loss,0.13062
perplixity,1.13953


In [6]:
model.push_to_hub(repo_name)

Upload file pytorch_model.bin:   0%|          | 3.34k/850M [00:00<?, ?B/s]

To https://huggingface.co/vinaykudari/t5-acled-ie
   ca5966f..bb512ba  main -> main



'https://huggingface.co/vinaykudari/t5-acled-ie/commit/bb512ba2c45897b6eebefd14a54f2ec88fe37a02'

In [7]:
tokenizer.push_to_hub(repo_name)

### Eval

In [None]:
# remove_columns = acled['train'].column_names
# acled_ds = acled.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=remove_columns,
#     fn_kwargs={
#         'tokenizer': tokenizer,
#         'max_len': 250,
#         'summary_len': 50,
#     },
# )

In [13]:
model_eval = 'vinaykudari/t5-acled-ie'
model = AutoModelForSeq2SeqLM.from_pretrained(model_eval)
# eval_tokenizer = AutoTokenizer.from_pretrained(model_eval)
# eval_dataset = acled_ds['test']
# eval_data_collator = DataCollatorForSeq2Seq(
#     eval_tokenizer, 
#     model=eval_model,
#     padding='longest',
#     label_pad_token_id=label_pad_token_id,
# )


# eval_model.eval()

In [None]:
# n = 20
# acled_ds['test'][n]['notes']

# test = torch.tensor([acled_ds['test'][n]['input_ids']])
# res = eval_model.generate(test, max_length=50)
# eval_tokenizer.decode(res[0], skip_special_tokens=True,)

In [None]:
# eval_ds[0]

In [11]:
def eval_preprocess_function(samples, tokenizer, max_len=250, summary_len=50):
    l = len(samples['notes'])
    inputs = samples['notes']
    
    inputs = ['summarize: ' + inp for inp in inputs]
    model_inputs = tokenizer(
        inputs,
        max_length=max_len,
        padding='max_length',
        truncation=True,
    )
    return model_inputs

In [12]:
valid_df = pd.read_csv('data/task_1_information_extraction_valid.tsv', sep='\t')
valid_df.columns = [col.lower() for col in valid_df.columns]
valid_df = valid_df.dropna()

model.eval()
eval_data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model,
    label_pad_token_id=label_pad_token_id,
)
eval_ds = Dataset.from_pandas(valid_df)
remove_columns = eval_ds.column_names
eval_tds = eval_ds.map(
    eval_preprocess_function,
    batched=True,
    remove_columns=remove_columns,
    num_proc=4,
    fn_kwargs={
        'tokenizer': tokenizer,
        'max_len': 250,
        'summary_len': 50,
    },
)
eval_dataloader = DataLoader(
    eval_tds, 
    collate_fn=eval_data_collator,
    batch_size=EVAL_BATCH_SIZE,
)

# ed_lis = list(eval_dataloader)

      

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

In [16]:
device = 'cuda'
_ = model.to(device)

actor1_list = []
actor2_list = []

for step, batch in enumerate(eval_dataloader):
    print('.', end='')
    with torch.no_grad():
        try:
            generated_tokens = model.generate(
                torch.tensor(batch['input_ids'], device=device),
                max_length=SUMMARY_LEN,
            )

            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]

            decoded_preds = tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True,
                max_length=MAX_TEXT_LEN,
                padding='max_length',
                truncation=True,
            )

            pred_actor1, pred_actor2 = get_cols(decoded_preds)
            actor1_list += pred_actor1.tolist()
            actor2_list += pred_actor2.tolist()
        except Exception as e:
            print(e)

.

  if sys.path[0] == '':


..............................'NoneType' object has no attribute '_log'
................................'NoneType' object has no attribute '_log'
.............................'NoneType' object has no attribute '_log'
...........................'NoneType' object has no attribute '_log'
..........................'NoneType' object has no attribute '_log'
..........................'NoneType' object has no attribute '_log'
.............................'NoneType' object has no attribute '_log'
.........................'NoneType' object has no attribute '_log'
...........................'NoneType' object has no attribute '_log'
.....................'NoneType' object has no attribute '_log'
...............................'NoneType' object has no attribute '_log'
..............................'NoneType' object has no attribute '_log'
................................'NoneType' object has no attribute '_log'
...............................'NoneType' object has no attribute '_log'
................

In [None]:
valid = valid_df.head(100)

In [16]:
valid_df['pred_location'] = location_list
valid_df['pred_fatalities'] = fatality_list

In [17]:
valid_df.head()

Unnamed: 0,notes,event_date,source,fatalities,event_type,sub_event_type,actor1,inter1,actor2,inter2,interaction,location,pred_location,pred_fatalities
0,Al Shabaab members attack and kill two men tra...,25-February-2013,Undisclosed Source,2,Violence against civilians,Attack,Al Shabaab,2,Civilians (Somalia),7,27,Baadhaade,Badhaade,2
1,Arrests: Al Shabaab militias arrested a group ...,16-December-2012,Undisclosed Source,0,Strategic developments,Arrests,Al Shabaab,2,Civilians (Somalia),7,27,Baadhaade,Mogadishu - Al Shabaab,0
2,AMISOM/Somali forces clash with al Shabaab in ...,26-September-2013,Shabelle Media Network,0,Battles,Armed clash,Military Forces of Somalia (2012-2017),1,Al Shabaab,2,12,Baadhaade,Kulbiyow,0
3,AS forces took back Diif village from Militia ...,12-March-2011,Undisclosed Source,0,Battles,Armed clash,Al Shabaab,2,Militia (Ahmed Madoobe),3,23,Baadhaade,Diif,0
4,Unidentified armed men picked a civilian man f...,06-November-2013,Undisclosed Source,1,Violence against civilians,Attack,Al Shabaab,2,Civilians (Somalia),7,27,Mogadishu - Karan,Mogadishu - Kaaraan,1


In [17]:
valid_df.to_csv('task1_valid_actor.csv')