In [1]:
import pandas as pd
import numpy as np
import os

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import string

from sklearn import model_selection
from sklearn.utils import shuffle
from sklearn import metrics

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup



In [2]:
os.listdir('../input/zindi-swahili-news')

['Test.csv', 'SampleSubmission.csv', 'Train.csv', 'VariableDefinitions.csv']

In [3]:
MODEL_TYPE = 'xlm-roberta-base'

# Here we can specify 10 folds (90/10 split), for example, but
# only train on 3 folds i.e. we will only have 3 fold-models.
NUM_FOLDS = 10
NUM_FOLDS_TO_TRAIN = 10

L_RATE = 2e-5
MAX_LEN = 128
NUM_EPOCHS = 4
BATCH_SIZE = 4
ACCUMULATION_STEPS = 2

NUM_CORES = os.cpu_count()

NUM_CORES

2

## Resources

Training Sentiment Model Using BERT and Serving it with Flask API<br>
https://www.youtube.com/watch?v=hinZO--TEk4

Code from above video on Github<br>
https://github.com/abhishekkrthakur/bert-sentiment

Bare Bert Model docs<br>
https://huggingface.co/transformers/model_doc/bert.html


## Load the data

In [4]:
path = '../input/zindi-swahili-news/Train.csv'
df_data = pd.read_csv(path)

print(df_data.shape)

df_data.head()

(5151, 3)


Unnamed: 0,id,content,category
0,SW0,SERIKALI imesema haitakuwa tayari kuona amani...,Kitaifa
1,SW1,"Mkuu wa Mkoa wa Tabora, Aggrey Mwanri amesiti...",Biashara
2,SW10,SERIKALI imetoa miezi sita kwa taasisi zote z...,Kitaifa
3,SW100,KAMPUNI ya mchezo wa kubahatisha ya M-bet ime...,michezo
4,SW1000,WATANZANIA wamekumbushwa kusherehekea sikukuu...,Kitaifa


In [5]:
# Drop rows with NaN

df_data = df_data.dropna()

df_data.shape

(5151, 3)

In [6]:
df_data['category'].value_counts()

Kitaifa      2000
michezo      1720
Biashara     1360
Kimataifa      54
Burudani       17
Name: category, dtype: int64

In [7]:
path = '../input/zindi-swahili-news/Test.csv'
df_test = pd.read_csv(path)

# Create a dummy column so that the dataloader will work with the test set.
df_test['targets'] = 0

df_test.head()

Unnamed: 0,swahili_id,content,targets
0,001dd47ac202d9db6624a5ff734a5e7dddafeaf2,"MKUU wa Wilaya ya Bahi, Mkoani Dodoma, Mwanah...",0
1,0043d97f7690e9bc02f0ed8bb2b260d1d44bad92,"MWISHONI mwa wiki hii, Timu ya Soka ya Taifa,...",0
2,00579c2307b5c11003d21c40c3ecff5e922c3fd8,THAMANI ya mauzo ya bidhaa za Afrika Masharik...,0
3,00868eeee349e286303706ef0ffd851f39708d37,MENEJA Mawasiliano na Utetezi wa asasi ya AGP...,0
4,00a5cb12d3058dcf2e42f277eee599992db32412,"WAZIRI wa Kilimo, Japhet Hasunga amesema seri...",0


In [8]:
path = '../input/zindi-swahili-news/SampleSubmission.csv'
df_sample = pd.read_csv(path)

df_sample.head()

Unnamed: 0,swahili_id,kitaifa,michezo,biashara,kimataifa,burudani
0,001dd47ac202d9db6624a5ff734a5e7dddafeaf2,0,0,0,0,0
1,0043d97f7690e9bc02f0ed8bb2b260d1d44bad92,0,0,0,0,0
2,00579c2307b5c11003d21c40c3ecff5e922c3fd8,0,0,0,0,0
3,00868eeee349e286303706ef0ffd851f39708d37,0,0,0,0,0
4,00a5cb12d3058dcf2e42f277eee599992db32412,0,0,0,0,0


In [9]:
# Encode the classes.
# Create a target column.

def adjust_target(x):
    
    if x == 'Kitaifa':
        return 0
    elif x == 'michezo':
        return 1
    elif x == 'Biashara':
        return 2
    elif x == 'Kimataifa':
        return 3
    else:
        return 4

df_data['targets'] = df_data['category'].apply(adjust_target)

df_data.head()

Unnamed: 0,id,content,category,targets
0,SW0,SERIKALI imesema haitakuwa tayari kuona amani...,Kitaifa,0
1,SW1,"Mkuu wa Mkoa wa Tabora, Aggrey Mwanri amesiti...",Biashara,2
2,SW10,SERIKALI imetoa miezi sita kwa taasisi zote z...,Kitaifa,0
3,SW100,KAMPUNI ya mchezo wa kubahatisha ya M-bet ime...,michezo,1
4,SW1000,WATANZANIA wamekumbushwa kusherehekea sikukuu...,Kitaifa,0


In [10]:
df_data['targets'].value_counts()

0    2000
1    1720
2    1360
3      54
4      17
Name: targets, dtype: int64

## Create a holdout set

In [11]:
# train test split
df_excl_holdout, df_holdout = model_selection.train_test_split(
                                            df_data,
                                            test_size=0.03,
                                            random_state=1024,
                                            shuffle=True,
                                            stratify=df_data.targets.values
                                        )

print(df_holdout.shape)

df_holdout['targets'].value_counts()

(155, 4)


0    60
1    52
2    41
3     2
Name: targets, dtype: int64

In [12]:
df_excl_holdout.shape

(4996, 4)

## Create the folds

In [13]:
from sklearn.model_selection import KFold, StratifiedKFold

# shuffle
df = shuffle(df_excl_holdout)

# initialize kfold
kf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=1024)

# for stratification
y = df['targets']

# Note:
# Each fold is a tuple ([train_index_values], [val_index_values])
# fold_0, fold_1, fold_2, fold_3, fold_5 = kf.split(df, y)

# Put the folds into a list. This is a list of tuples.
fold_list = list(kf.split(df, y))

train_df_list = []
val_df_list = []

for i, fold in enumerate(fold_list):

    # map the train and val index values to dataframe rows
    df_train = df[df.index.isin(fold[0])]
    df_val = df[df.index.isin(fold[1])]
    
    train_df_list.append(df_train)
    val_df_list.append(df_val)
    

print(len(train_df_list))
print(len(val_df_list))

5
5


## Helper Functions

In [14]:
# https://discuss.pytorch.org/t/meaning-of-parameters/10655
class AverageMeter:
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

## Define the device

In [15]:
# For GPU

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

cuda:0


In [16]:
# For TPU

#device = xm.xla_device()

#print(device)

## Instantiate the tokenizer

In [17]:
from transformers import XLMRobertaTokenizer

# Instantiate the Bert tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




## Test the tokenizer

In [18]:
text = "This is a happy tweet."
selected_text = "happy tweet"

# remove any spaces
text = " ".join(text.split())

print(text)

This is a happy tweet.


In [19]:
max_len = 15 # This value could be set as 256, 512 etc.


encoded_dict = tokenizer.encode_plus(
            text,                      # Sentence to encode.
            add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
            max_length = max_len,           # Pad or truncate.
            pad_to_max_length = True,
            return_attention_mask = True,   # Construct attn. masks.
            return_tensors = 'pt',          # Return pytorch tensors.
           )


encoded_dict

{'input_ids': tensor([[    0,  3293,    83,    10, 17723, 89406,     5,     2,     1,     1,
             1,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}

In [20]:
# These have already been converted to torch tensors.
input_ids = encoded_dict['input_ids'][0]
att_mask = encoded_dict['attention_mask'][0]

print(input_ids)
print(att_mask)

tensor([    0,  3293,    83,    10, 17723, 89406,     5,     2,     1,     1,
            1,     1,     1,     1,     1])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0])


# Check the text lengths

In [21]:
encoded = tokenizer.encode(text)

# remove the CLS and SEP token
encoded = encoded[1:-1]

len(encoded)

6

In [22]:
def get_num_tokens(x):
    
    # convert to type string
    x = str(x)
    # remove any spaces
    x = " ".join(x.split())
    
    # get a list of tokens
    token_list = tokenizer.encode(x)

    # remove the CLS and SEP token
    token_list = token_list[1:-1]
    
    return len(token_list)

df_data['num_tokens'] = df_data['content'].apply(get_num_tokens)

df_data.head()

Unnamed: 0,id,content,category,targets,num_tokens
0,SW0,SERIKALI imesema haitakuwa tayari kuona amani...,Kitaifa,0,369
1,SW1,"Mkuu wa Mkoa wa Tabora, Aggrey Mwanri amesiti...",Biashara,2,338
2,SW10,SERIKALI imetoa miezi sita kwa taasisi zote z...,Kitaifa,0,589
3,SW100,KAMPUNI ya mchezo wa kubahatisha ya M-bet ime...,michezo,1,322
4,SW1000,WATANZANIA wamekumbushwa kusherehekea sikukuu...,Kitaifa,0,557


In [23]:
# Check the min and max number of tokens

print(df_data['num_tokens'].min())
print(df_data['num_tokens'].max())

0
4163


In [24]:
print(len(df_data))
print(len(df_data[df_data['num_tokens'] > 512]))

5151
1840


## Create the dataloader

In [25]:
df_train.head()

Unnamed: 0,id,content,category,targets
133,SW115,"Katibu na Mshauri wa Sheria PAP/IPTL, Joseph ...",Biashara,2
665,SW1748,IKIWA imesalia siku moja kabla ya Yanga kuuma...,michezo,1
3653,SW5094,SERIKALI ya Mkoa wa Pwani ikishirikiana na Mk...,Biashara,2
704,SW1794,Hali hiyo inatokana na vyama vya msingi vya m...,Biashara,2
493,SW1562,UKOMA ni ugonjwa unaosababishwa na vimelea ai...,Kitaifa,0


In [26]:
class CompDataset(Dataset):

    def __init__(self, df):
        self.df_data = df
        
        
    def __len__(self):
        return len(self.df_data)



    def __getitem__(self, index):
        
         # get the sentences from the dataframe
        sentence1 = self.df_data.loc[index, 'content']
        # make sure the text is a string
        sentence1 = str(sentence1)
        # remove any spaces
        sentence1 = " ".join(sentence1.split())


        # Process the sentence
        # ---------------------

        encoded_dict = tokenizer.encode_plus(
                    sentence1,                      # Sentence to encode.
                    add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
                    max_length = MAX_LEN,           # Pad or truncate all sentences.
                    pad_to_max_length = True,
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',          # Return pytorch tensors.
               )  
        
        # These are torch tensors already.
        input_ids = encoded_dict['input_ids'][0]
        attention_mask = encoded_dict['attention_mask'][0]
        
        # Convert the target to a torch tensor.
        # The dtype depends on the loss function that we will use.
        targets = torch.tensor(self.df_data.loc[index, 'targets'], dtype=torch.long)



        return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "targets": targets
                }



# Test the dataloader

In [27]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [28]:
train_data = CompDataset(df_train)
val_data = CompDataset(df_val)
test_data = CompDataset(df_test)



train_dataloader = torch.utils.data.DataLoader(train_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

val_dataloader = torch.utils.data.DataLoader(val_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

test_dataloader = torch.utils.data.DataLoader(test_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=False,
                                       num_workers=NUM_CORES)



print(len(train_dataloader))
print(len(val_dataloader))
print(len(test_dataloader))

485
122
161


In [29]:
# Get one train batch

data = next(iter(train_dataloader))

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(len(data['targets']))

torch.Size([8, 128])
torch.Size([8, 128])
8


In [30]:
# Get one val batch

data = next(iter(val_dataloader))

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(len(data['targets']))

torch.Size([8, 128])
torch.Size([8, 128])
8


In [31]:
# Get one test batch

data = next(iter(test_dataloader))

print(data['input_ids'].shape)
print(data['attention_mask'].shape)

torch.Size([8, 128])
torch.Size([8, 128])


## Define the Model

In [32]:
from transformers import XLMRobertaModel

In [33]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        
        # This will load the bert model and pre-trined weights without internet.
        # The path points to a kaggle dataset folder that contains a config file and pytorch model.
        self.bert = transformers.XLMRobertaModel.from_pretrained(MODEL_TYPE)
        self.bert_drop = nn.Dropout(0.3)
        self.linear_out = nn.Linear(768, 5)
    
    def forward(self, input_ids, attention_mask):
        # not using sentiment at all
        sequence_output, pooled_output = self.bert(
                                            input_ids, 
                                            attention_mask=attention_mask
                                            )
        # Here we use the pooled_output only
        x = self.bert_drop(pooled_output)
        logits = self.linear_out(x)
        
        # shape: [batch_size, 1]
        return logits
    
    
model = Model()

print(model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1115590446.0, style=ProgressStyle(descr…


Model(
  (bert): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

## Test the model

In [34]:
# Create an input batch

data = next(iter(train_dataloader))

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(len(data['targets']))

torch.Size([8, 128])
torch.Size([8, 128])
8


In [35]:
model = Model()

model.eval()

input_ids = data['input_ids']
attention_mask = data['attention_mask']
targets = data['targets']

outputs = model(
    input_ids, 
    attention_mask=attention_mask
)

outputs.shape

torch.Size([8, 5])

In [36]:
outputs

tensor([[-0.1468,  0.0996,  0.0915, -0.0746,  0.3818],
        [-0.1465,  0.0972,  0.0750, -0.0802,  0.3729],
        [-0.1439,  0.1110,  0.0875, -0.0764,  0.3795],
        [-0.1526,  0.0987,  0.0823, -0.0702,  0.3747],
        [-0.1448,  0.1063,  0.0812, -0.0834,  0.3749],
        [-0.1582,  0.1039,  0.0842, -0.0717,  0.3718],
        [-0.1579,  0.0933,  0.0808, -0.0780,  0.3729],
        [-0.1522,  0.1076,  0.0788, -0.0717,  0.3671]],
       grad_fn=<AddmmBackward>)

In [37]:
targets

tensor([0, 0, 1, 0, 0, 2, 2, 1])

In [38]:
# Calculate the loss

# This loss fuctions takes the raw output as input.
# targets need to be a torch tensor of type torch.float

def loss_fn(outputs, targets):
    return nn.CrossEntropyLoss()(outputs, targets)

In [39]:
loss = loss_fn(outputs, targets)

loss.item()

1.7188395261764526

In [40]:
outputs = outputs.detach().numpy()
preds = np.argmax(outputs, axis=1)

preds

array([4, 4, 4, 4, 4, 4, 4, 4])

In [41]:
# Take the sofmax to convert the preds to probabilities

torch_outputs = torch.tensor(outputs)

softmax = torch.nn.Softmax(dim=1)
pred_probas = softmax(torch_outputs)

pred_probas

tensor([[0.1582, 0.2024, 0.2008, 0.1701, 0.2684],
        [0.1594, 0.2034, 0.1989, 0.1703, 0.2680],
        [0.1585, 0.2045, 0.1998, 0.1696, 0.2676],
        [0.1580, 0.2031, 0.1998, 0.1715, 0.2676],
        [0.1591, 0.2046, 0.1995, 0.1692, 0.2676],
        [0.1571, 0.2042, 0.2003, 0.1713, 0.2670],
        [0.1578, 0.2028, 0.2003, 0.1709, 0.2682],
        [0.1582, 0.2051, 0.1993, 0.1715, 0.2659]])

In [42]:
# Calculate the logloss

score = metrics.log_loss(targets, pred_probas, labels=[0,1,2,3,4])

score

1.7188394516706467

## Engine - Training and Evaluation

In [43]:
def loss_fn(outputs, targets):
    return nn.CrossEntropyLoss()(outputs, targets)

In [44]:
def train_fn(df, model, optimizer, device, scheduler):
    
    data = CompDataset(df)

    data_loader = torch.utils.data.DataLoader(data,
                                            batch_size=BATCH_SIZE,
                                            shuffle=True,
                                           num_workers=NUM_CORES)
    
    model.train()
    
    # This turns gradient calculations on and off.
    torch.set_grad_enabled(True)
    

    for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
        
        input_ids = data['input_ids']
        attention_mask = data['attention_mask']
        targets = data['targets']


        input_ids = input_ids.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.long)
        
        

        optimizer.zero_grad()
        
        outputs = model(
            input_ids, 
            attention_mask 
        )
        

        loss = loss_fn(outputs, targets)
        loss.backward()
        
        #if (i + 1) % ACCUMULATION_STEPS == 0:
        optimizer.step()
        scheduler.step()
            

In [45]:
def eval_fn(df, model, device):
    
    data = CompDataset(df)

    data_loader = torch.utils.data.DataLoader(data,
                                            batch_size=BATCH_SIZE,
                                            shuffle=False,
                                           num_workers=1)
    
    model.eval()
    
    # This turns gradient calculations on and off.
    torch.set_grad_enabled(False)
    
    fin_targets = []
    

    for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
        
        input_ids = data['input_ids']
        attention_mask = data['attention_mask']
        targets = data['targets']


        input_ids = input_ids.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)


        outputs = model(
            input_ids, 
            attention_mask
        )
        
        # convert to numpy so we can stack the batches.
        outputs = outputs.cpu().detach().numpy()
        
        # stack the preds from each batch
        if i == 0:
            fin_outputs = outputs
        else:
            fin_outputs = np.vstack((fin_outputs, outputs))
        
        fin_targets.extend(targets.cpu().detach().numpy().tolist())
        

    return fin_outputs, fin_targets

In [46]:
def inference_fn(df, model, device):
    
    data = CompDataset(df)

    data_loader = torch.utils.data.DataLoader(data,
                                            batch_size=BATCH_SIZE,
                                            shuffle=False,
                                           num_workers=1)
    
    model.eval()
    
    # This turns gradient calculations on and off.
    torch.set_grad_enabled(False)

    

    for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
        
        input_ids = data['input_ids']
        attention_mask = data['attention_mask']


        input_ids = input_ids.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)


        outputs = model(
            input_ids, 
            attention_mask
        )
        
        # convert to numpy so we can stack the batches.
        outputs = outputs.cpu().detach().numpy()
        
        # stack the preds from each batch
        if i == 0:
            fin_outputs = outputs
        else:
            fin_outputs = np.vstack((fin_outputs, outputs))
        
    return fin_outputs

In [47]:
def get_model(): 
        
    model = Model() # as defined above


    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]



    num_train_steps = int(len(df_train) / BATCH_SIZE * NUM_EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=L_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )
        
        
    return model, optimizer, scheduler

## Training

In [48]:
NUM_FOLDS_TO_TRAIN

2

In [49]:
NUM_FOLDS

5

In [50]:
len(train_df_list)

5

In [53]:
fold_scores_list = []
val_preds_list = []
test_preds_list = []

holdout_preds_list = []
holdout_targets_list = []


# reset the indices
df_holdout = df_holdout.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


# For each fold...
for fold_index in range(0, NUM_FOLDS_TO_TRAIN):

    print('\n== Fold Model', fold_index)
    
    

    # Get df_train and df_val
    # ........................

    # Intialize the fold dataframes
    df_train = train_df_list[fold_index]
    df_val = val_df_list[fold_index]

    # Reset the indices or the dataloader won't work.
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    
    # Get the model
    model, optimizer, scheduler = get_model()
    model.to(device)



    # Training for the specified number of epochs
    # ............................................

    for epoch in range(0, NUM_EPOCHS):  

        # Train the model.
        train_fn(df_train, model, optimizer, device, scheduler)
        
        
        
    # Evaluate on the val set
    # ........................
    
    print('Val set prediction...')
  
    # Use the trained model to predict on the val set.
    # Generate a dictionary of outputs.
    outputs, targets = eval_fn(df_val, model, device)
    
    val_preds_list.append(outputs)


    #preds = np.argmax(outputs, axis=1)
    torch_outputs = torch.tensor(outputs)

    softmax = torch.nn.Softmax(dim=1)
    pred_probas = softmax(torch_outputs)

    # claculate the score
    score = metrics.log_loss(targets, pred_probas, labels=[0,1,2,3,4])

    print(f"Logloss Score = {score}")

    # save the fold score
    fold_scores_list.append(score)
    
    
    # Make out of fold predictions on the test set
    # Note that no models have been saved.
    # .............................................
    preds = inference_fn(df_test, model, device)
    print('Test set prediction complete.')
    
    test_preds_list.append(preds)
    
    
    # Make out of fold predictions on the holdout set
    # Note that no models have been saved.
    # .............................................
    outputs, targets = eval_fn(df_holdout, model, device)
    print('Holdout set prediction complete.')
    
    holdout_preds_list.append(outputs)
    
    if fold_index == 0:
        
        holdout_targets_list.extend(targets)


# Print the average val score for all folds
# ............................................

cv_score = sum(fold_scores_list)/NUM_FOLDS_TO_TRAIN
print('--------------')
print("\nCV Logloss:", cv_score)
print('--------------')




== Fold Model 0


100%|██████████| 485/485 [01:13<00:00,  6.61it/s]
100%|██████████| 485/485 [01:13<00:00,  6.61it/s]
100%|██████████| 485/485 [01:13<00:00,  6.61it/s]

Val set prediction...



100%|██████████| 121/121 [00:05<00:00, 20.89it/s]


Logloss Score = 0.30206513119737116


100%|██████████| 161/161 [00:07<00:00, 22.12it/s]

Test set prediction complete.



100%|██████████| 20/20 [00:00<00:00, 20.67it/s]


Holdout set prediction complete.

== Fold Model 1


100%|██████████| 484/484 [01:13<00:00,  6.62it/s]
100%|██████████| 484/484 [01:13<00:00,  6.61it/s]
100%|██████████| 484/484 [01:13<00:00,  6.62it/s]

Val set prediction...



100%|██████████| 122/122 [00:05<00:00, 22.05it/s]

Logloss Score = 0.3148861541597025



100%|██████████| 161/161 [00:08<00:00, 19.76it/s]

Test set prediction complete.



100%|██████████| 20/20 [00:01<00:00, 19.34it/s]

Holdout set prediction complete.
--------------

CV Logloss: 0.30847564267853683
--------------





In [54]:
df_holdout.head()

Unnamed: 0,id,content,category,targets
0,SW2661,TANZANIA imeendelea kung’ara kwa kuongoza kuw...,Kitaifa,0
1,SW4381,"WAZIRI Mkuu, Kassimu Majaliwa amewataka viong...",Kitaifa,0
2,SW5277,MABINGWA wa Afrika Mashariki na Kati Azam FC ...,michezo,1
3,SW689,"NAIBU Waziri wa Madini, Stanslaus Nyongo ames...",Biashara,2
4,SW1742,MAHAKAMA ya Hakimu Mkazi Kisutu imeshindwa ku...,michezo,1


## Process the holdout set in the same way as the test set

In [55]:
# Sum the predictions of all fold models
for i, item in enumerate(holdout_preds_list):
    
    if i == 0:
        
        preds = item
        
    else:
    
        # Sum the matrices
        preds = item + preds

        
# Average the predictions
avg_preds = preds/(len(holdout_preds_list))

In [56]:
# Take the softmax to convert the preds to probabilities between 0 and 1.

torch_outputs = torch.tensor(avg_preds)

softmax = torch.nn.Softmax(dim=1)
pred_probas = softmax(torch_outputs)

#pred_probas

tensor([[9.3554e-01, 8.6838e-03, 1.8283e-02, 3.3866e-02, 3.6247e-03],
        [9.8462e-01, 1.8696e-03, 6.2550e-03, 6.7476e-03, 5.0790e-04],
        [2.1265e-02, 9.7414e-01, 8.7212e-04, 2.3225e-03, 1.3973e-03],
        [3.1398e-01, 1.2633e-03, 6.8043e-01, 3.4285e-03, 9.0109e-04],
        [3.6266e-02, 9.5778e-01, 9.9306e-04, 3.3706e-03, 1.5953e-03],
        [9.7655e-01, 1.2154e-03, 1.7040e-02, 4.7617e-03, 4.3418e-04],
        [9.7191e-01, 1.9756e-03, 1.9751e-02, 5.8018e-03, 5.6088e-04],
        [8.2916e-01, 1.4516e-03, 1.6434e-01, 4.3449e-03, 7.0760e-04],
        [1.0545e-02, 9.8499e-01, 8.8650e-04, 1.9358e-03, 1.6411e-03],
        [1.1162e-02, 9.8432e-01, 8.5978e-04, 1.9923e-03, 1.6693e-03],
        [3.1304e-03, 4.4811e-04, 9.9540e-01, 4.8677e-04, 5.3286e-04],
        [1.1371e-02, 9.8430e-01, 7.5630e-04, 1.9910e-03, 1.5778e-03],
        [9.3483e-01, 1.4450e-03, 5.9498e-02, 3.7280e-03, 5.0160e-04],
        [9.7564e-01, 5.4731e-03, 4.2140e-03, 1.3579e-02, 1.0905e-03],
        [2.1504e-02,

In [61]:
np_preds = (pred_probas).numpy()

holdout_score = metrics.log_loss(holdout_targets_list, np_preds, labels=[0,1,2,3,4])

holdout_score

0.2689112957011187

## Process the test set oof predictions

In [None]:
# Sum the predictions of all fold models
for i, item in enumerate(test_preds_list):
    
    if i == 0:
        
        preds = item
        
    else:
    
        # Sum the matrices
        preds = item + preds

        
# Average the predictions
avg_preds = preds/(len(test_preds_list))

In [None]:
# Take the softmax to convert the preds to probabilities between 0 and 1.

torch_outputs = torch.tensor(avg_preds)

softmax = torch.nn.Softmax(dim=1)
pred_probas = softmax(torch_outputs)

pred_probas

In [None]:
# convert the preds to numpy
np_preds = (pred_probas).numpy()

# create a dataframe
cols = ['kitaifa', 'michezo', 'biashara', 'kimataifa', 'burudani']
df_preds = pd.DataFrame(np_preds, columns=cols)

# include the id column
df_preds['swahili_id'] = list(df_test['swahili_id'])

# reorder the columns
new_cols = ['swahili_id', 'kitaifa', 'michezo', 'biashara', 'kimataifa', 'burudani']
df_preds = df_preds[new_cols]


df_preds.head()

## Create a submission csv file

In [None]:
path = '../input/zindi-swahili-news/SampleSubmission.csv'

df_sample = pd.read_csv(path)

print(df_sample.shape)

df_sample.head()

In [None]:
# Check the the format matches the above sample submission format.
df_preds.head()

In [None]:
# Create a submission csv file
df_preds.to_csv('submission.csv', index=False)

In [None]:
!ls