In [54]:
import os

import random
import numpy as np

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import (
    AdamW,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_cosine_schedule_with_warmup,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import gc

In [55]:
# model_name = "roberta-base"

# tokenizer = AutoTokenizer.from_pretrained(model_name, ignore_mismatched_sizes=True)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [56]:
lr = 2e-5
epochs =  6
batch_size = 5
max_seq_len = 75

test_frac = 0.1

In [57]:
import os

def set_seed(seed=106052):
    """Set seed for reproducibility.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

In [58]:
class CEFRDataset(Dataset):
    """Classification dataset, built on top of pytorch dataset object
    """
    
    def __init__(self, texts, labels):
        
        self.encoder = LabelEncoder()
        print(self.encoder.__dict__)
        self.texts = texts
        self.labels = self.encoder.fit_transform(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoded_text = tokenizer(
            text,
            padding="max_length",
            max_length=max_seq_len,
            truncation=True,
            return_tensors="pt",
        )
        encoded_text["input_ids"] = encoded_text["input_ids"].squeeze()
        encoded_text["attention_mask"] = encoded_text["attention_mask"].squeeze()
        label = torch.tensor(label)

        return {
            "input_ids": encoded_text["input_ids"],
            "attention_mask": encoded_text["attention_mask"],
            "labels": label,
        }

    def get_labels(self):
        return self.labels

In [59]:
train_set_df = pd.read_csv("../input/swahili-news-classification/Train (10).csv")


In [60]:
def train(train_set, valid_set, epochs=5, warmup_size=0.1, lr=1e-3, batch_size=8):
    model = get_model(model_name)
    optim = AdamW(model.parameters(), lr=lr)
    scheduler = get_scheduler(
        optim, warmup_size, round(len(train_set) / batch_size * epochs)
    )
    training_args = get_training_args(epochs, batch_size)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=valid_set,
        optimizers=[optim, scheduler],
        compute_metrics=compute_accuracy
    )
    trainer.train()
    trainer.save_model()
    return trainer

In [61]:
def get_model(pretrained_checkpoint):
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_checkpoint, num_labels=2, ignore_mismatched_sizes=True
    )
    return model.to(device)

In [62]:
os.environ["WANDB_DISABLED"] = "true"


def get_scheduler(optimizer, warmup_size, total_steps):
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=round(total_steps * warmup_size),
        num_training_steps=total_steps,
    )
    return scheduler


def get_training_args(epochs, batch_size):
    return TrainingArguments(
        output_dir="./b",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_steps=50,
        fp16=False,
        evaluation_strategy="epoch",
        eval_accumulation_steps=1,
        report_to=None,
#         save_total_limit=1,
#         load_best_model_at_end=True,
        save_strategy = 'epoch'
    )


def compute_accuracy(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}



In [63]:
lr = 2e-5
epochs =  8
batch_size = 8
max_seq_len = 512

In [64]:
test = pd.read_csv("../input/swahili-news-classification/Test (11).csv")
test.head()

Unnamed: 0,swahili_id,content
0,001dd47ac202d9db6624a5ff734a5e7dddafeaf2,"MKUU wa Wilaya ya Bahi, Mkoani Dodoma, Mwanah..."
1,0043d97f7690e9bc02f0ed8bb2b260d1d44bad92,"MWISHONI mwa wiki hii, Timu ya Soka ya Taifa,..."
2,00579c2307b5c11003d21c40c3ecff5e922c3fd8,THAMANI ya mauzo ya bidhaa za Afrika Masharik...
3,00868eeee349e286303706ef0ffd851f39708d37,MENEJA Mawasiliano na Utetezi wa asasi ya AGP...
4,00a5cb12d3058dcf2e42f277eee599992db32412,"WAZIRI wa Kilimo, Japhet Hasunga amesema seri..."


In [65]:
train_set_df.drop("id", axis=1, inplace=True)

In [66]:
train_set_df.columns=["text","label"]
train_set_df = train_set_df[["text", "label"]]

In [67]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

train_set_df.text = train_set_df.text.apply(lambda x: x.replace("\r", "").replace("\n", " "))


In [68]:
train_set_df.label = le.fit_transform(train_set_df.label)

In [69]:
from tqdm import tqdm 

def predict(model, text):
    
    preds = []
    
    for i in tqdm(range(len(text))):
        tokenized = tokenizer(text[i:i+1], return_tensors="pt", truncation=True, max_length=512).to("cuda")
        pred = model(**tokenized)
        preds.append(pred.logits.argmax(-1).item())

    return preds

In [70]:
import gc
torch.cuda.empty_cache()
gc.collect()

116

In [71]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [72]:
train_set_df = reduce_mem_usage(train_set_df)


Memory usage after optimization is: 0.04 MB
Decreased by 43.7%


In [73]:
# !pip install --upgrade transformers
# !pip install simpletransformers

[0m

In [74]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax
from simpletransformers.classification import ClassificationModel

In [75]:
import sklearn
from sklearn.metrics import log_loss
from sklearn.metrics import *
from sklearn.model_selection import *

In [76]:
test = pd.read_csv("../input/swahili-news-classification/Test (11).csv")
id_=test.swahili_id
test1=test.drop(['swahili_id'],axis=1)
test1['label']=0

In [77]:
sub=pd.read_csv("../input/swahili-news-classification/SampleSubmission (6).csv")
sub.head()

Unnamed: 0,swahili_id,kitaifa,michezo,biashara,kimataifa,burudani
0,001dd47ac202d9db6624a5ff734a5e7dddafeaf2,0,0,0,0,0
1,0043d97f7690e9bc02f0ed8bb2b260d1d44bad92,0,0,0,0,0
2,00579c2307b5c11003d21c40c3ecff5e922c3fd8,0,0,0,0,0
3,00868eeee349e286303706ef0ffd851f39708d37,0,0,0,0,0
4,00a5cb12d3058dcf2e42f277eee599992db32412,0,0,0,0,0


In [78]:
train_set_df['label'].value_counts()

3    2000
4    1720
0    1360
2      54
1      17
Name: label, dtype: int64

In [80]:

err=[]
y_pred_tot=[]
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
fold=StratifiedKFold(n_splits=20, shuffle=True, random_state=2)
i=1
for train_index, test_index in fold.split(train_set_df,train_set_df['label']):
    train1_trn, train1_val = train_set_df.iloc[train_index], train_set_df.iloc[test_index]
    model = ClassificationModel('roberta', 'roberta-large', use_cuda=True,num_labels=5, args={'train_batch_size':32,
                                                                         'reprocess_input_data': True,
                                                                         'overwrite_output_dir': True,
                                                                         'fp16': False,
                                                                         'do_lower_case': False,
                                                                         'num_train_epochs': 2,
                                                                         'max_seq_length': 64,
                                                                         'regression': False,
                                                                         'manual_seed': 2,
                                                                         "learning_rate":3e-5,
                                                                         'weight_decay':0,
                                                                         "save_eval_checkpoints": False,
                                                                         "save_model_every_epoch": False,
                                                                         "silent": True})
    model.train_model(train1_trn)
    raw_outputs_val = model.eval_model(train1_val)[1]
    raw_outputs_val = softmax(raw_outputs_val,axis=1)[:,:]
#     print(f"Log_Loss: {log_loss(train1_val['label'], raw_outputs_val)}")
#     err.append(log_loss(train1_val['label'], raw_outputs_val))
    raw_outputs_test = model.eval_model(test1)[1]
    raw_outputs_test = softmax(raw_outputs_test,axis=1)[:,:]
    y_pred_tot.append(raw_outputs_test)
# print("Mean LogLoss: ",np.mean(err))
final=pd.DataFrame()
final['swahili_id']=test['swahili_id']
final['label']=y_pred_tot
print(final.shape)
final.to_csv('20fold_rbl_2_3e5_32_64_0.csv',index=False)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifie

ValueError: Length of values (20) does not match length of index (1288)

****