In [1]:
# Environment setup. %pip (rather than !pip) installs into the environment of
# the running kernel instead of whichever pip happens to be first on PATH.
# NOTE(review): tensorflow is removed presumably so transformers uses its
# PyTorch backend -- confirm this is still required.
%pip uninstall tensorflow -y
%pip install transformers==4.22.1 -q
# NOTE(review): evaluate is not pinned; pin it once the working version is known.
%pip install evaluate



Collecting pyarrow>=8.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-12.0.1-cp39-cp39-win_amd64.whl (21.5 MB)
     ---------------------------------------- 21.5/21.5 MB 9.2 MB/s eta 0:00:00
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 6.0.1
    Uninstalling pyarrow-6.0.1:
      Successfully uninstalled pyarrow-6.0.1
Successfully installed pyarrow-12.0.1




In [4]:
# Pinned to the version shown in the recorded install output so the search is
# reproducible; %pip targets the running kernel's environment.
%pip install optuna==3.3.0

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
     -------------------------------------- 404.2/404.2 kB 8.6 MB/s eta 0:00:00
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.3-py3-none-any.whl (225 kB)
     ------------------------------------- 225.4/225.4 kB 13.5 MB/s eta 0:00:00
Collecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.7/78.7 kB ? eta 0:00:00
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.11.3 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0




In [2]:
import transformers
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, \
                         DataCollatorWithPadding, pipeline
from transformers import DataCollatorWithPadding
import evaluate
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, \
                            roc_auc_score,ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import scikitplot as skplt

In [3]:
import load_data
from load_data import Data
import train_datasets
from train_datasets import Train_dataset

In [4]:
import optuna

In [5]:
def load_dataset():
    """Run the preparation steps of the module-level ``data`` object in order.

    ``data`` must already exist as a global (a ``Data`` instance from
    ``load_data``). Each step is invoked without arguments and its return
    value is discarded; this function itself returns None.
    Presumably the steps populate ``data.df`` -- confirm in ``load_data``.
    """
    # Order matters: the steps are executed exactly as listed.
    step_names = (
        "handle_file",
        "convert_json_to_dataframe",
        "get_next_value",
        "compare_values",
        "label_sentences",
        "initial_df",
    )
    for step_name in step_names:
        getattr(data, step_name)()

In [6]:
def check_balance(a):
    """Print the label counts and the percentage split between labels 0 and 1.

    Args:
        a: Label values, in any form accepted by ``pd.DataFrame`` (for
            example a list or a Series). Labels are expected to be the
            integers 0 and 1.

    Returns:
        None. Two things are printed: the ``value_counts()`` table, then a
        line of the form ``"<pct of 0> % : <pct of 1> %"`` rounded to two
        decimals.
    """
    frame = pd.DataFrame(a)
    # Count once and reuse; the original recomputed value_counts() three times.
    counts = frame.value_counts()
    print(counts)

    total = len(frame)
    # A label that never occurs contributes 0 instead of raising KeyError, and
    # a frame with no rows reports 0.0 instead of dividing by zero.
    shares = [
        round(counts.get(label, 0) / total * 100, 2) if total else 0.0
        for label in (0, 1)
    ]
    print(shares[0], "%", ":", shares[1], "%", end='\n')

In [7]:
def preprocess_function(dataset):
    """Tokenize the ``"text"`` field of a dataset batch.

    Relies on the module-level ``tokenizer``. Inputs are truncated to at most
    400 tokens; padding is not applied here.

    Args:
        dataset: Mapping-like batch exposing a ``"text"`` entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = {"truncation": True, "max_length": 400}
    return tokenizer(dataset["text"], **tokenizer_options)

In [8]:
# Accuracy, recall, precision and F1 bundled into a single evaluator.
metric = evaluate.combine(["accuracy","recall","precision", "f1"])
def compute_metrics(eval_pred):
    """Score one evaluation pass with the combined ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        The result of ``metric.compute`` for those predictions and labels.
    """
    scores, gold_labels = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_ids)

In [9]:
def get_predictions(pred_list):
    """Turn pairs of class scores into hard 0/1 labels.

    Args:
        pred_list: Iterable of indexable pairs ``(score_0, score_1)``.

    Returns:
        A list with 0 where ``score_0`` is strictly greater than ``score_1``
        and 1 otherwise (so ties resolve to 1).
    """
    return [0 if scores[0] > scores[1] else 1 for scores in pred_list]

In [10]:
def evaluate_model(labels, prediction, pred):
    """Print classification metrics and draw the confusion matrix and ROC curves.

    Args:
        labels: Ground-truth class labels.
        prediction: Predicted class labels, aligned with ``labels``.
        pred: Per-class scores passed to the ROC plot -- presumably an
            (n_samples, n_classes) array of probabilities; confirm against
            the caller.

    Returns:
        None. Accuracy, precision, recall and F1 are printed, and the
        confusion matrix and ROC curves are plotted.
    """
    confusion = confusion_matrix(labels, prediction)
    accuracy = accuracy_score(labels, prediction)
    precision = precision_score(labels, prediction)
    recall = recall_score(labels, prediction)
    f1 = f1_score(labels, prediction)

    # Label text is kept exactly as it was originally printed.
    print("accuracy : ",accuracy)
    print("precison : ", precision)
    print("recall : ", recall)
    print("f1 :", f1)

    ConfusionMatrixDisplay(confusion_matrix=confusion).plot(cmap=plt.cm.Blues, values_format='g')

    # plot_roc_curve is deprecated in scikit-plot; plot_roc is its replacement.
    skplt.metrics.plot_roc(labels, pred)
    plt.show()

In [11]:
def objective(trial: optuna.Trial):
    """Optuna objective: fine-tune DistilBERT once and return its evaluation F1.

    Hyper-parameters sampled per trial:
        * ``num_train_epochs``: integer in [3, 4]
        * ``learning_rate``: log-uniform in [4e-5, 0.01]
        * ``weight_decay``: uniform in [0, 0.05]

    Reads the module-level ``tokenized_dataset`` (splits ``'train'`` and
    ``'test1'``), ``data_collator`` and ``compute_metrics``.

    Side effects:
        * checkpoints go to ``./results`` and logs to ``./logs``
        * the module-level ``result`` is set to the output of ``train()``

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The ``eval_f1`` entry of ``Trainer.evaluate()`` for this trial.
    """
    uni_labels = ['Continue', "Change"]
    # Dropout must be supplied while loading: the dropout layers are built from
    # the config at construction time, so assigning config.dropout afterwards
    # (as the original did) never reached the network.
    sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-cased',
        num_labels=2,
        dropout=0.2,
    )
    sequence_clf_model.config.id2label = dict(enumerate(uni_labels))

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=trial.suggest_int("num_train_epochs", low=3, high=4),
        # suggest_loguniform is deprecated since Optuna 3.0; this is the
        # documented equivalent.
        learning_rate=trial.suggest_float("learning_rate", 4e-5, 0.01, log=True),
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        # NOTE(review): 500 warm-up steps may exceed the total number of
        # optimisation steps on a small training split -- confirm.
        warmup_steps=500,
        weight_decay=trial.suggest_float("weight_decay", 0, 0.05),
        logging_dir='./logs',
        load_best_model_at_end=True,
        logging_steps=1,
        log_level='info',
        # Evaluate and save once per epoch; the two strategies must match for
        # load_best_model_at_end. (eval_steps was dropped: it is only used
        # with the 'steps' strategy.)
        evaluation_strategy='epoch',
        save_strategy='epoch'
    )

    trainer = Trainer(
        model=sequence_clf_model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['test1'],
        compute_metrics=compute_metrics,
        data_collator=data_collator
    )

    # Kept as a module-level global so the last training output can be
    # inspected after the study finishes.
    global result
    result = trainer.train()
    eval_metrics = trainer.evaluate()
    return eval_metrics['eval_f1']

In [None]:
if __name__ == '__main__':
    #============================================
    # Build the labelled DataFrame through the Data class from load_data.
    # The frame has a "text" column (one sentence per row) and a "label"
    # column (0: continue, 1: change).
    path ='hotels.json'
    data = Data(path)
    load_dataset()

    #============================================
    # Preprocessing data for modelling: work on a copy, limited to the first
    # 2000 rows.
    t = data.df.copy()
    t = t[:2000]

    # Splitting sentences into tokens and labelling the tokens
    train_dataset = Train_dataset(t)
    train_dataset.test_text = t['text'].values
    train_dataset.test_label = t['label'].values

    train_dataset.split_token_sentences()
    train_dataset.tokenized_text_label = train_dataset.flatten_list(train_dataset.tokenized_text_label)
    train_dataset.tokenized_text = train_dataset.flatten_list(train_dataset.tokenized_text)

    # Generating the sequences used for training by combining tokens and
    # labelling the sequences
    train_dataset.generate_test_dataset()

    # Generating datasets for training, validating and testing
    train_dataset.datasets_for_training()

    # Applying DistilBertTokenizerFast for the DistilBert model
    dataset = train_dataset.dataset
    print(dataset)

    # ignore_mismatched_sizes is a model-loading option and has no meaning for
    # a tokenizer, so it is no longer passed here.
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    # Pads each batch dynamically at collation time
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Names assigned here are already module-level, so the former `global`
    # statements were no-ops and have been removed.
    study = optuna.create_study(study_name='hyper-parameter-search', direction='maximize')
    study.optimize(func=objective, n_trials=10)
    best_params = study.best_params
    best_trial = study.best_trial
    best_value = study.best_value

    print("best_value", best_value)
    print("best_params", best_params)
    print("best_trail", best_trial)