# HuggingFace Exploration:


1.   Get to know about Models, Datasets, and Spaces
2.   Search for the bert-base-uncased model and open it
3.   See the model card, model_size, framework, task, etc. of the model
4.   Use the HuggingFace platform to test the model online



# Step 0: Importing Required Packages



In [74]:
import pandas as pd
from scipy import stats
from transformers import pipeline

# Text Classification


In [None]:
# Build a text-classification pipeline. No model is named, so the library
# falls back to whatever default checkpoint it picks for this task.
textclassifier = pipeline(task="text-classification")
# Classify one sentence and print the raw pipeline output.
print(textclassifier("The movie was awesome!"))

# Challenge: Confuse the model! Find an input that makes the model produce a confidence score below 0.6

In [None]:
# Inspect the tokenizer attached to the pipeline; as the last expression of
# the cell, its repr is what the notebook displays.
textclassifier.tokenizer

In [None]:
# Inspect the model object attached to the pipeline; as the last expression
# of the cell, its repr is what the notebook displays.
textclassifier.model

# Token Classification

In [None]:
# Build a token-classification pipeline. No model is named, so the library's
# default checkpoint for the task is used.
classifier = pipeline(task="token-classification")
# Run it on a single-word input and print the raw pipeline output.
print(classifier("Ronaldo"))

# Challenge: Searching or Trying? Find more entity types (e.g., "I-PER", "I-LOC", etc.)

# Fill Mask

In [None]:
# Build a fill-mask pipeline. No model is named, so the library's default
# checkpoint for the task is used.
# NOTE: this rebinds `classifier`, which previously held the
# token-classification pipeline.
classifier = pipeline("fill-mask")
# Ask the tokenizer for its mask token instead of hard-coding "<mask>".
# The literal differs between model families (BERT-style tokenizers use
# "[MASK]"), and the pipeline rejects inputs that do not contain the token
# its tokenizer expects, so a hard-coded value breaks if the default changes.
mask_token = classifier.tokenizer.mask_token
print(classifier(f"Paris is the {mask_token} of France."))

# Challenge: Undercover: try increasing the number of masked words to see the model's behaviour

# Table Question Answering

In [None]:
# Table question answering with the library's default model for the task.
tqa = pipeline(task="table-question-answering")

# Every cell is kept as a string, including the numeric column.
data = {
    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
    "Number of movies": ["87", "53", "69"],
}
table = pd.DataFrame(data)

question = "which actor has played in 53 movies?"
answer = tqa(table=table, query=question)
# Print only the first table cell the pipeline selected as the answer.
print(answer['cells'][0])

# Challenge: Predictable model! How many different answers might we see?

# Question Answering

In [None]:
# Question answering over a free-text context, using the library's default
# model for the task (no model is named).
qa = pipeline(task="question-answering")
# The same facts as the table-QA example, written out as a sentence.
context = "Brad Pitt has 87, Leonardo Di Caprio has 53, and George Clooney has 69 movies."
# NOTE: rebinds `question`, which previously held the table-QA query.
question = "how many movies has Leonardo Di Caprio played in?"
# Print the raw pipeline output for this question/context pair.
print(qa(question = question, context = context))

# Challenge: Let's take a deeper look! See the architecture of the model by calling .model.config

# Zero-Shot Classification

In [None]:
# Zero-shot classification with the library's default model for the task.
zsc = pipeline(task="zero-shot-classification")
# Score one sentence against three caller-supplied labels and print the raw
# pipeline output.
print(zsc("Inception is the best movie ever",
    candidate_labels=["CINEMA", "MUSIC", "ART"],
))

# Challenge: Unique English words! What is the vocab_size of the Tokenizer used by zsc?

# Translation

In [None]:
# Translation pipeline pinned to the Helsinki-NLP/opus-mt-fr-en checkpoint
# (unlike the earlier cells, the model is named explicitly).
fr_en_translator = pipeline(task="translation", model="Helsinki-NLP/opus-mt-fr-en")
# Translate one French sentence; as the last expression of the cell, the
# result is what the notebook displays.
fr_en_translator("quelle distance se trouve la ville la plus proche?")

# Challenge: Multilinguality! How many languages does this task support?

# Summarization

In [None]:
# Summarization pipeline pinned to the facebook/bart-large-cnn checkpoint.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
txt = "Paris is the capital and most populous city of France, with an estimated population of 2,175,601 residents as of 2018, in an area of more than 105 square kilometres (41 square miles). The City of Paris is the centre and seat of government of the region and province of Île-de-France, or Paris Region, which has an estimated population of 12,174,880, or about 18 percent of the population of France as of 2017."
# The "tokens" reported here are whitespace-separated words (str.split),
# not tokenizer tokens.
print(f"the original text has {len(txt.split())} tokens")
# NOTE(review): only max_length is overridden. If the checkpoint's default
# min_length is larger than 50, the library may warn about conflicting length
# limits — confirm, and pass min_length explicitly if so.
output = summarizer(txt, max_length=50)
print(f"Summary: {output}")
# Same whitespace word count, applied to the first returned summary.
print(f"the summarized text has {len(output[0]['summary_text'].split())} tokens")
# Challenge: SHORTEEERRR! Force the model to keep the summary under 10 words.

# Text Generation

In [None]:
# Text-generation pipeline with the library's default model for the task.
generator = pipeline(task="text-generation")
# Request two continuations of the prompt; as the last expression of the
# cell, the result is what the notebook displays.
generator("Hello, I'm a student at", num_return_sequences=2)

# Challenge: Lullaby! Force the model to tell you a single long story.

# Feature Extraction

In [None]:
# Feature-extraction pipeline with the framework pinned to PyTorch ("pt").
feature_extractor = pipeline("feature-extraction", framework="pt")
text = "Transformers is an awesome library!"
# Take element 0 of the returned tensor, convert it to NumPy and average
# over axis 0.
# NOTE(review): presumably [0] selects the single input sentence and axis 0
# is the token axis, so `a` is a mean-pooled sentence vector — confirm with
# a.shape.
a = feature_extractor(text,return_tensors = "pt")[0].numpy().mean(axis=0)

# Challenge: Does framework matter? pt stands for PyTorch. Will we get the same output if we use TensorFlow?

In [None]:
# Display the shape of the averaged feature array computed in the previous cell.
a.shape

# Sentence Similarity

In [None]:
# This task uses the sentence-transformers package directly instead of
# transformers.pipeline.
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-embedding checkpoint.
# NOTE: the name `model` is rebound by later cells in the fine-tuning section.
model = SentenceTransformer("all-MiniLM-L6-v2")
sentences = [
    "Competition day is next week",
    "Mastering this will greatly help",
    "Let's have a fun competition next week"]

# Embed every sentence, then compare the embeddings against themselves.
# NOTE(review): presumably this yields a 3x3 matrix of pairwise similarity
# scores — confirm the metric against the sentence-transformers docs.
embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)
print(similarities)


# Challenges:
    # What was different for this task? Try pipeline('sentence-similarity')
    # Try to add more sentences
    # what is the vector size (embedding dimension)?
    # Try words instead of sentences

# Fine Tuning

In [1]:
from transformers import AutoModel, AutoTokenizer

In [49]:
# Load the tokenizer and the bare model for the prajjwal1/bert-tiny checkpoint.
# NOTE: both `tokenizer` and `model` are rebound by later cells (the tokenizer
# is reloaded before tokenization, and `model` is replaced by an
# AutoModelForMaskedLM instance before training).
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = AutoModel.from_pretrained('prajjwal1/bert-tiny')

In [None]:
from transformers import pipeline

# Baseline before fine-tuning: a fill-mask pipeline on the pretrained
# prajjwal1/bert-tiny checkpoint.
fill_mask = pipeline("fill-mask", model="prajjwal1/bert-tiny")

# Add a mask token to complete the sentence
output = fill_mask("[MASK] is a game.")
# Print each candidate completion with its score, formatted to four decimals.
for prediction in output:
    print(f"{prediction['sequence']} (score: {prediction['score']:.4f})")

In [51]:
import pandas as pd
# Load the fine-tuning corpus from game.csv in the current working directory.
# NOTE(review): later cells read a "description" column from this data —
# confirm the CSV provides it.
ds = pd.read_csv('game.csv')

In [52]:
# NOTE: load_dataset is imported but not used anywhere in the visible notebook.
from datasets import Dataset, load_dataset
# Wrap the pandas DataFrame in a datasets.Dataset so it can be mapped and
# passed to the Trainer.
dataset = Dataset.from_pandas(ds)

In [None]:
from transformers import AutoTokenizer

# Reload the bert-tiny tokenizer (same checkpoint as the earlier cell) so this
# cell can run on its own.
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

def tokenize_function(txt):
    """Tokenize the "description" field of a batch with the module-level tokenizer.

    `txt` is the batch handed over by Dataset.map; only its "description"
    entry is read. Padding to max length and truncation are both requested.
    """
    return tokenizer(txt["description"], padding="max_length", truncation=True)

# batched=True makes map() call tokenize_function with batches of rows rather
# than one row at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
import itertools

# Number of tokens per training chunk. With block_size = 1 every chunk holds
# exactly one token.
# NOTE(review): single-token chunks give the model no surrounding context —
# confirm this value is intentional.
block_size = 1


def group_texts(examples):
    """Flatten the token columns of a batch and re-cut them into fixed-size chunks.

    Only the "input_ids", "token_type_ids" and "attention_mask" columns that
    are present in `examples` are processed; every other column is ignored.
    Each kept column is concatenated across the batch, trimmed to a multiple
    of the module-level `block_size`, and split into consecutive chunks of
    that size. A "labels" column is added as a shallow copy of the chunked
    "input_ids".

    Returns a dict mapping column name to a list of chunks.
    """
    token_columns = ("input_ids", "token_type_ids", "attention_mask")

    # Join the per-row token lists of each kept column into one flat list.
    flattened = {}
    for column in examples.keys():
        if column in token_columns:
            flattened[column] = list(itertools.chain.from_iterable(examples[column]))

    # Use the first kept column as the length reference and drop the tail
    # that does not fill a whole block.
    usable_length = len(list(flattened.values())[0])
    usable_length -= usable_length % block_size

    # Cut every column at the same offsets so the chunks stay aligned.
    chunk_starts = range(0, usable_length, block_size)
    grouped = {}
    for column, tokens in flattened.items():
        grouped[column] = [tokens[start:start + block_size] for start in chunk_starts]

    grouped["labels"] = grouped["input_ids"].copy()
    return grouped


# Re-chunk the tokenized dataset with group_texts. batch_size=1 means each
# call to group_texts receives a single row, so chunks never span two rows.
# All original columns are removed because the chunked output has a different
# number of rows than the input.
lm_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    batch_size=1,
    remove_columns=tokenized_dataset.column_names,  # CRITICAL to prevent shape mismatch
)

In [None]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

# Reload bert-tiny with a masked-language-modelling head; this rebinds `model`.
model = AutoModelForMaskedLM.from_pretrained("prajjwal1/bert-tiny")

# Checkpoints are written under ./bert-mlm2; training runs for two epochs with
# one example per device per step. report_to=[] passes an empty list of
# reporting integrations.
training_args = TrainingArguments(
    output_dir="./bert-mlm2",
    per_device_train_batch_size=1,
    num_train_epochs=2,
    prediction_loss_only=True,
    report_to=[],
)

# NOTE(review): no data_collator is passed and the dataset's "labels" column is
# a plain copy of "input_ids" (see group_texts), so nothing in the visible code
# masks tokens before the masked-LM loss is computed. Confirm whether a masking
# collator such as DataCollatorForLanguageModeling was intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    tokenizer=tokenizer,

)

# Run fine-tuning.
trainer.train()

In [None]:
import os

from transformers.trainer_utils import get_last_checkpoint

# Directory the training cell writes its checkpoints to.
checkpoint_root = "./bert-mlm2"
# Checkpoint used when this notebook was originally run.
finetuned_model_path = "./bert-mlm2/checkpoint-7000" # Updated path to the checkpoint directory

# The step number in a checkpoint name depends on dataset size, batch size and
# epoch count, so the hard-coded path may not exist on another run. Keep it
# when present; otherwise fall back to the most recent checkpoint on disk.
if not os.path.isdir(finetuned_model_path):
    finetuned_model_path = (
        get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
    )
    if finetuned_model_path is None:
        raise FileNotFoundError(
            f"No checkpoint found under {checkpoint_root!r}; run the training cell first."
        )

finetuned_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path)

# Reuse the tokenizer loaded earlier; only the model weights come from the checkpoint.
finetuned_fill_mask = pipeline("fill-mask", model=finetuned_model, tokenizer=tokenizer)

# Same prompt as the pre-fine-tuning baseline, so the two outputs can be compared.
output_finetuned = finetuned_fill_mask("[MASK] is a game.")
print("\nPredictions from fine-tuned model:")
for prediction in output_finetuned:
    print(f"{prediction['sequence']} (score: {prediction['score']:.4f})")
