In [168]:
import os

import random
import numpy as np

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import (
    AdamW,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_cosine_schedule_with_warmup,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [169]:
# Checkpoint name shared by the tokenizer below and by get_model().
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

# `ignore_mismatched_sizes` is a model-loading option (it concerns checkpoint
# weight shapes), so it is not passed to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loading configuration file https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d9226eeac7b8b96d83ebc327cdd670490866d8c999505c1f83b6ef206ccb1604.a34960b447312b0727cb670d710444fcb41a6156eddcba062a19b3fc05d95251
Model config BertConfig {
  "_name_or_path": "nlptown/bert-base-multilingual-uncased-sentiment",
  "_num_labels": 5,
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "finetuning_task": "sentiment-analysis",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1 star",
    "1": "2 stars",
    "2": "3 stars",
    "3": "4 stars",
    "4": "5 stars"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1 star": 0,
    "2 stars": 1,
    "3 stars": 2,
    "4 stars": 3,
    "5 stars": 4
  },
  "layer_norm

In [170]:
# Initial hyperparameter values. NOTE: lr, epochs, batch_size and max_seq_len
# are all assigned again in a later cell, before the datasets are built and
# train() is called, so on a top-to-bottom run the later values are the ones used.
lr = 2e-5
epochs =  6
batch_size = 5
# Read inside the dataset classes' __getitem__ as the tokenizer's max_length.
max_seq_len = 75

# Not referenced anywhere else in the visible notebook; split_valid() is
# called with its own default `frac`.
test_frac = 0.1

In [171]:
import os

def set_seed(seed=106052):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and exports ``PYTHONHASHSEED`` with the same
    value as a string.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed()

In [172]:
class CEFRDataset(Dataset):
    """Labelled text-classification dataset, built on the PyTorch ``Dataset``.

    Texts are tokenized lazily in ``__getitem__`` with the notebook-level
    ``tokenizer`` and ``max_seq_len``; labels are mapped to integer ids once,
    at construction time.
    """

    def __init__(self, texts, labels, encoder=None):
        """Store the texts and integer-encode the labels.

        Args:
            texts: Sequence of strings (list, array or pandas Series).
            labels: Sequence of class labels aligned with ``texts``.
            encoder: Optional, already fitted ``LabelEncoder``. When given it
                is applied with ``transform`` so several datasets (e.g. train
                and validation) can share one label -> id mapping. When
                omitted, a new encoder is fitted on ``labels``.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        if encoder is None:
            self.encoder = LabelEncoder()
            self.labels = self.encoder.fit_transform(labels)
        else:
            self.encoder = encoder
            self.labels = self.encoder.transform(labels)

        # Materialise as a list so __getitem__ indexes by position; a pandas
        # Series would be indexed by label and fail on a non-default index.
        self.texts = list(texts)

        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels differ in length: {len(self.texts)} != {len(self.labels)}"
            )

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize sample ``index`` and return the tensors fed to the model.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels``.
        """
        encoded_text = tokenizer(
            self.texts[index],
            padding="max_length",
            max_length=max_seq_len,
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the leading batch axis. A bare squeeze() would also remove
        # the sequence axis if it ever had length 1.
        return {
            "input_ids": encoded_text["input_ids"].squeeze(0),
            "attention_mask": encoded_text["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[index]),
        }

    def get_labels(self):
        """Return the integer-encoded labels for the whole dataset."""
        return self.labels

In [173]:
def train(train_set, valid_set, epochs=10, warmup_size=0.1, lr=1e-3, batch_size=16):
    """Fine-tune the ``model_name`` checkpoint and return the fitted ``Trainer``.

    Args:
        train_set: Dataset used for optimisation.
        valid_set: Dataset passed to the trainer as ``eval_dataset``; accuracy
            is reported through ``compute_accuracy``.
        epochs: Number of training epochs.
        warmup_size: Fraction of all optimiser steps spent in warm-up.
        lr: Learning rate handed to AdamW.
        batch_size: Training batch size (per device).

    Returns:
        The ``Trainer`` after ``train()`` and ``save_model()`` have run.
    """
    import math  # local import keeps this cell self-contained

    model = get_model(model_name)
    optim = AdamW(model.parameters(), lr=lr)

    # The last batch of an epoch is usually partial but still costs one
    # optimiser step, so steps per epoch are rounded up *before* multiplying by
    # the epoch count. Rounding the product instead can leave the schedule a
    # few steps short of (or past) the real end of training.
    # NOTE(review): assumes one device and no gradient accumulation — confirm.
    steps_per_epoch = math.ceil(len(train_set) / batch_size)
    total_steps = math.ceil(steps_per_epoch * epochs)
    scheduler = get_scheduler(optim, warmup_size, total_steps)

    training_args = get_training_args(epochs, batch_size)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=valid_set,
        optimizers=(optim, scheduler),
        compute_metrics=compute_accuracy,
    )
    trainer.train()
    trainer.save_model()
    return trainer

In [174]:
def get_model(pretrained_checkpoint, num_labels=2):
    """Load a sequence-classification model and move it to ``device``.

    Args:
        pretrained_checkpoint: Name or path accepted by ``from_pretrained``.
        num_labels: Size of the classification head. Defaults to 2.

    Returns:
        The loaded model, placed on the notebook-level ``device``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_checkpoint,
        num_labels=num_labels,
        # The checkpoint's own head may have a different number of labels than
        # requested here; allow the size mismatch rather than failing to load.
        ignore_mismatched_sizes=True,
    )
    return model.to(device)

In [175]:
# Environment flag set before any Trainer is created; presumably intended to
# switch off Weights & Biases logging — verify against the installed version.
os.environ["WANDB_DISABLED"] = "true"


def get_scheduler(optimizer, warmup_size, total_steps):
    """Build the cosine-with-warmup learning-rate schedule for ``optimizer``.

    ``warmup_size`` is the fraction of ``total_steps`` spent warming up; the
    resulting step count is rounded with ``round()``.
    """
    warmup_steps = round(total_steps * warmup_size)
    return get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )


def get_training_args(epochs, batch_size):
    """Build the ``TrainingArguments`` used by ``train``.

    Args:
        epochs: Value for ``num_train_epochs``.
        batch_size: Value for ``per_device_train_batch_size``.

    Returns:
        A ``TrainingArguments`` instance that writes to ``./b``, logs every 50
        steps, and evaluates and saves a checkpoint once per epoch.
    """
    return TrainingArguments(
        output_dir="./b",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_steps=50,
        fp16=False,
        evaluation_strategy="epoch",
        eval_accumulation_steps=1,
        # The string "none" turns every reporting integration off. The Python
        # value None does not: it leaves the library default in effect.
        report_to="none",
        # Disabled options kept for reference:
        # save_total_limit=1,
        # load_best_model_at_end=True,
        save_strategy="epoch",
    )


def compute_accuracy(pred):
    """Metric callback: accuracy of the arg-max prediction.

    ``pred`` must expose ``predictions`` (scores whose last axis is reduced
    with ``argmax``) and ``label_ids`` (the reference labels).
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_ids)}



In [176]:
# Effective hyperparameters: these assignments replace the values from the
# earlier configuration cell and are the ones passed to train() further down.
lr = 2e-5
epochs =  4
batch_size = 8
# Tokenizer max_length used by the dataset classes; every sample is padded to
# this length (padding="max_length").
max_seq_len = 512

In [177]:
def split_valid(df, frac=0.1, random_state=None):
    """Split ``df`` into train and validation frames, stratified by ``label``.

    For every distinct value of ``df.label`` a fraction ``frac`` of that
    class's rows is sampled into the validation frame; all other rows form the
    training frame.

    Args:
        df: DataFrame with a ``label`` column. Rows are matched back through
            the index, so index values are expected to be unique.
        frac: Fraction of each class to hold out.
        random_state: Optional seed forwarded to ``DataFrame.sample``. With
            the default ``None`` the sample depends on the global NumPy RNG.

    Returns:
        ``(train_df, valid_df)``, both with a fresh 0..n-1 index.
    """
    # Sample per class, in order of first appearance, then concatenate once.
    held_out_parts = [
        df[df.label == label].sample(frac=frac, random_state=random_state)
        for label in df.label.unique()
    ]
    # pd.concat rejects an empty list, so an empty input yields an empty slice.
    val = pd.concat(held_out_parts) if held_out_parts else df.iloc[0:0]

    remaining = df[~df.index.isin(val.index)]
    return remaining.reset_index(drop=True), val.reset_index(drop=True)

In [178]:
# Load the training CSV, discard the ID column and start from a clean index.
train_set_df = (
    pd.read_csv("../input/covid19-tweet-classification-challenge-by-zindi/updated_train.csv")
    .drop("ID", axis=1)
    .reset_index(drop=True)
)
# The two remaining columns are renamed to the names the rest of the notebook uses.
train_set_df.columns = ["text", "label"]
# Keep only rows whose label is not the "-" placeholder, with columns in a fixed order.
train_set_df = train_set_df.loc[train_set_df.label != "-", ["text", "label"]]

In [179]:
# Show how many rows each label has after the filtering above.
train_set_df.label.value_counts()

0    2746
1    2541
Name: label, dtype: int64

In [180]:
from sklearn import preprocessing
# Encoder that the next cell fits on the training labels and reuses for the
# validation labels.
le = preprocessing.LabelEncoder()

# Remove carriage returns and turn newlines into spaces in every text.
# NOTE(review): assumes every `text` entry is a str; a missing value would
# fail on .replace — confirm against the data.
train_set_df.text = train_set_df.text.apply(lambda x: x.replace("\r", "").replace("\n", " "))

# Disabled: merging an extra CSV into the training frame.
# extra_df = pd.read_csv("../input/frenchcefr/french_mike_june.csv")
# extra_df.columns = ["text", "label", "label_"]
# extra_df = extra_df[["text", "label"]]
# extra_df.text = extra_df.text.astype(str)
#train_set_df = pd.concat([train_set_df, extra_df]).reset_index(drop=True)

# Per-label hold-out using split_valid's default fraction; train_set_df is
# rebound to the remaining rows.
train_set_df, valid_set_df = split_valid(train_set_df)

In [181]:
# Fit the label -> id mapping on the training labels only, then apply that same
# mapping to the validation labels. (The dataset class later runs its own
# LabelEncoder over these already-encoded values.)
train_set_df.label = le.fit_transform(train_set_df.label)
valid_set_df.label = le.transform(valid_set_df.label)

In [182]:
# Display the held-out validation frame.
valid_set_df

Unnamed: 0,text,label
0,All empty during a show at the Max Stadium bef...,1
1,ki saru 41 donated 2cr to mumbai police in thi...,1
2,Soft water soft skin Looking for water softene...,1
3,Hospitals get paid more if patients listed as ...,1
4,Italian politician Vittorio Sgarbi reports in ...,1
...,...,...
524,Stephon Marbury says Larry Brown tried to kick...,0
525,He left it on the table Unfinished All the spo...,0
526,In these tough times the best way to grow is t...,0
527,poole Lib Dem Leader Fasts for Holy Ramadan in...,0


In [183]:
from tqdm import tqdm 

def predict(model, text, batch_size=1):
    """Return the predicted class id for every string in ``text``.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        text: List of strings to classify.
        batch_size: Number of texts tokenized and scored per forward pass.
            The default of 1 scores one text at a time.

    Returns:
        list[int]: arg-max class id per input, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Send inputs to wherever the model's weights live instead of assuming CUDA.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    preds = []
    was_training = model.training
    # Evaluation mode turns dropout off; without it, a model fresh out of
    # training gives non-deterministic predictions.
    model.eval()
    try:
        # No gradients are needed for inference; skipping them saves memory.
        with torch.no_grad():
            for start in tqdm(range(0, len(text), batch_size)):
                tokenized = tokenizer(
                    text[start:start + batch_size],
                    return_tensors="pt",
                    truncation=True,
                    max_length=512,
                    padding=True,  # pads to the longest text in the batch; no-op for one text
                ).to(model_device)
                logits = model(**tokenized).logits
                preds.extend(logits.argmax(-1).tolist())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    return preds

In [184]:
# Wrap the train and validation frames in tokenizing datasets.
train_set = CEFRDataset(train_set_df["text"], train_set_df["label"])
valid_set = CEFRDataset(valid_set_df["text"], valid_set_df["label"])


# Fine-tune with the module-level epochs / lr / batch_size and a warm-up
# fraction of 0.2, then keep the trained model for the prediction cells below.
trainer_second = train(train_set, valid_set, epochs=epochs, warmup_size=0.2, lr=lr, batch_size=batch_size)
model = trainer_second.model

{}
{}


loading configuration file https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d9226eeac7b8b96d83ebc327cdd670490866d8c999505c1f83b6ef206ccb1604.a34960b447312b0727cb670d710444fcb41a6156eddcba062a19b3fc05d95251
Model config BertConfig {
  "_name_or_path": "nlptown/bert-base-multilingual-uncased-sentiment",
  "_num_labels": 5,
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "finetuning_task": "sentiment-analysis",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads":

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2869,0.302374,0.890359


***** Running Evaluation *****
  Num examples = 529
  Batch size = 8
Saving model checkpoint to ./b/checkpoint-595
Configuration saved in ./b/checkpoint-595/config.json
Model weights saved in ./b/checkpoint-595/pytorch_model.bin


KeyboardInterrupt: 

In [None]:
# valid_set_df["preds"] = train_set.encoder(predict(model, valid_set_df.text.tolist()))
# valid_set_df.columns = ["text", "cefr", "preds",] 

In [None]:
class CEFRDatasettest(Dataset):
    """Unlabelled text dataset for inference, built on the PyTorch ``Dataset``.

    Items carry only ``input_ids`` and ``attention_mask``; there is no
    ``labels`` entry.
    """

    def __init__(self, texts):
        """Store the texts.

        Args:
            texts: Sequence of strings (list, array or pandas Series).
        """
        # Kept so the attribute still exists; it is never fitted because this
        # dataset has no labels.
        self.encoder = LabelEncoder()
        # Materialise as a list so __getitem__ indexes by position; a pandas
        # Series would be indexed by label and fail on a non-default index.
        self.texts = list(texts)
        # Defined so get_labels() returns None instead of raising
        # AttributeError on an attribute that was never assigned.
        self.labels = None

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize sample ``index`` and return its model inputs."""
        encoded_text = tokenizer(
            self.texts[index],
            padding="max_length",
            max_length=max_seq_len,
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the leading batch axis. A bare squeeze() would also remove
        # the sequence axis if it ever had length 1.
        return {
            "input_ids": encoded_text["input_ids"].squeeze(0),
            "attention_mask": encoded_text["attention_mask"].squeeze(0),
        }

    def get_labels(self):
        """Return ``None``: this dataset holds no labels."""
        return self.labels

In [None]:
# Load the test CSV; its `text` column is what gets scored below.
test = pd.read_csv("../input/covid19-tweet-classification-challenge-by-zindi/updated_test.csv")

In [None]:
# Predicted class id for every test text, in row order.
preds=predict(model, test.text.to_list())

In [None]:
# Read the `updated_ss.csv` file (presumably the sample-submission template —
# confirm) and overwrite its `target` column with the predictions.
# NOTE(review): assumes its rows are in the same order as the test CSV — confirm.
sub=pd.read_csv("../input/covid19-tweet-classification-challenge-by-zindi/updated_ss.csv")
sub.target=preds
sub

In [None]:
from IPython.display import FileLink
def create_submission(submission_file, submission_name):
    """Write ``submission_file`` to ``<submission_name>.csv`` and link to it.

    The frame is saved without its index; the return value is a ``FileLink``
    pointing at the written CSV.
    """
    csv_path = submission_name + ".csv"
    submission_file.to_csv(csv_path, index=False)
    return FileLink(csv_path)
# Writes submission.csv in the working directory and displays a link to it.
create_submission(sub, "submission")

****