In [None]:
! pip install torch 
! pip install transformers
! pip install datasets
! pip install evaluate
! pip install pandas

In [None]:
# Load the first 20% of the app_reviews "train" split and hold out 30% of it
# as a test split. The fixed seed makes the split repeatable across runs.

from datasets import Value, load_dataset
raw_datasets = load_dataset("app_reviews", split="train[:20%]").train_test_split(
    test_size=0.3,
    seed=42,
)

In [None]:
# Show the split dataset object as the cell output, to inspect it.
raw_datasets

In [None]:
# Collect the distinct star ratings seen in the training split. Sorting gives
# every rating a stable id instead of relying on set iteration order.
label_list = sorted(set(raw_datasets["train"]["star"]))
label_to_id = {label: idx for idx, label in enumerate(label_list)}


In [None]:
from transformers import AutoConfig,AutoModelForSequenceClassification,AutoTokenizer

# Checkpoint to fine-tune; feel free to use other models.
model_name = "bert-base-uncased"

# One output class per distinct star rating.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=len(label_list),
    finetuning_task="text-classification",
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store both directions of the label mapping on the model's config. The inverse
# is derived from `label_to_id` itself rather than from `config.label2id`:
# `model.config` is not guaranteed to be the same object as `config`, in which
# case `config.label2id` would still hold the default labels.
model.config.label2id = label_to_id
model.config.id2label = {idx: label for label, idx in label_to_id.items()}

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of reviews and attach integer class labels.

    Args:
        examples: A batch (column name -> list of values) with a "review"
            column of texts and a "star" column of ratings.

    Returns:
        The tokenizer output for the batch, with an added "label" list holding
        ``label_to_id[star]`` for each example; a star of -1 is kept as -1.
    """
    # No padding is applied here; texts are only truncated to at most 512 tokens.
    result = tokenizer(examples["review"], padding=False, max_length=512, truncation=True)

    result["label"] = [(label_to_id[star] if star != -1 else -1) for star in examples["star"]]
    return result

In [None]:

# Apply preprocess_function to raw_datasets in batches, without loading a
# previously cached result.
tokenize_options = dict(
    batched=True,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
raw_datasets = raw_datasets.map(preprocess_function, **tokenize_options)


In [None]:
! pip install scikit-learn
import evaluate
from transformers import EvalPrediction
import numpy as np
metric = evaluate.load("accuracy")
def compute_metrics(p: EvalPrediction):
    """Compute accuracy for one evaluation pass.

    Args:
        p: Evaluation output carrying the model's `predictions` (per-class
            scores, one row per example) and the reference `label_ids`.

    Returns:
        The result of the accuracy metric for the predicted classes.
    """
    # Some models return a tuple of outputs; the class scores are then the first item.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Predicted class = index of the highest score in each row.
    preds = np.argmax(logits, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch with the tokenizer, using pad_to_multiple_of=8.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)


In [None]:
from transformers import Trainer,TrainingArguments

# Default training arguments, with "temp" as the output directory.
training_args = TrainingArguments(output_dir="temp")
train_dataset, eval_dataset = raw_datasets["train"], raw_datasets["test"]

# Wire the model, data, collator and metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model. To resume from a checkpoint instead, use:
#   train_result = trainer.train(resume_from_checkpoint=checkpoint)
train_result = trainer.train()

# Record the number of training examples alongside the training metrics.
metrics = train_result.metrics
max_train_samples = len(train_dataset)
metrics["train_samples"] = min(len(train_dataset), max_train_samples)

# Save the model (this saves the tokenizer too, for easy upload), then log and
# save the metrics, and finally save the trainer state.
trainer.save_model()
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
# Evaluate on eval_dataset and record the number of evaluation examples
# alongside the metrics.
metrics = trainer.evaluate(eval_dataset=eval_dataset)
max_eval_samples = len(eval_dataset)
metrics["eval_samples"] = min(len(eval_dataset), max_eval_samples)

# Log the evaluation metrics, then save them.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)



In [None]:
from transformers import Pipeline, pipeline

# Build a text-classification pipeline from the fine-tuned model. `pipeline(...)`
# is the factory that takes a task name; `Pipeline` is the base class and is not
# constructed this way. The model is loaded from the trainer's output directory,
# which is where `trainer.save_model()` writes when called without a path.
pipe = pipeline("text-classification", model=training_args.output_dir)
pipe(input("Enter the text here"))