## Generative

In [40]:
from transformers import AutoTokenizer, OpenAIGPTLMHeadModel

# Load the OpenAI GPT checkpoint together with its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
model = OpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")

# `encode` turns the prompt into a tensor of token ids ("pt" = PyTorch).
input_ids = tokenizer.encode(
    "A long time ago, in a galaxy far",
    return_tensors="pt",
)

# Sample a continuation of the prompt (do_sample=True makes each run differ).
outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=42,
)

# `generate` returns a batch of sequences; decode the first one back to text.
text_generated = tokenizer.decode(outputs[0])
print(text_generated)

a long time ago, in a galaxy far from this one. " 
 jack said, " how do you explain this? " 
 " you will be told soon enough. there are a myriad of levels of


In [43]:
# Calling the tokenizer directly returns a dict-like encoding (including
# `input_ids`) that can be unpacked straight into `generate` as keyword arguments.
inputs = tokenizer(
    "A long time ago, in a galaxy far",
    return_tensors="pt",
)
outputs = model.generate(**inputs, do_sample=True, max_length=42)

# Decode the first generated sequence of the batch back to text.
text_generated = tokenizer.decode(outputs[0])
print(text_generated)

a long time ago, in a galaxy far from this one, as the light of jupiter's new light reflected off the frozen, frozen ice. the star ships, which the inhabitants aboard the fleet had used before


In [25]:
from transformers import AutoTokenizer, OpenAIGPTLMHeadModel

# Load the OpenAI GPT checkpoint together with its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
model = OpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")

# Bind the encoding to `inputs`, the name the `generate` call below reads, so the
# cell is self-contained and does not rely on a variable left over from another cell.
inputs = tokenizer("A long time ago, in a galaxy far", return_tensors="pt")

# Sample with top-k filtering (top_k=50) and a temperature of 0.7.
outputs = model.generate(
    inputs['input_ids'],
    max_length=42,
    do_sample=True,
    top_k=50,
    temperature=0.7,
)

# Decode the first generated sequence of the batch back to text.
text_generated = tokenizer.decode(outputs[0])
print(text_generated)

a long time ago, in a galaxy far off, we were working on the same quantum project, the same concept of the universe where we were supposed to go. i think there was a long period of time


In [26]:
from transformers import AutoTokenizer, OpenAIGPTLMHeadModel

# Tokenizer and language-model head, both from the same OpenAI GPT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
model = OpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")

# Encode the prompt as PyTorch tensors.
inputs = tokenizer(
    "A long time ago, in a galaxy far",
    return_tensors="pt",
)

# Sampling restricted by top-k filtering and scaled by a temperature.
outputs = model.generate(
    inputs["input_ids"],
    temperature=0.7,
    top_k=50,
    do_sample=True,
    max_length=42,
)

# Turn the first generated sequence back into a string and show it.
text_generated = tokenizer.decode(outputs[0])
print(text_generated)

a long time ago, in a galaxy far away. i didn't know how the people of this world were supposed to understand. i was in the way. i tried to warn them. i tried to help


In [29]:
from transformers import AutoTokenizer, OpenAIGPTLMHeadModel

# Tokenizer and language-model head, both from the same OpenAI GPT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
model = OpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")

# Encode the prompt as PyTorch tensors.
inputs = tokenizer(
    "A long time ago, in a galaxy far",
    return_tensors="pt",
)

# Plain sampling: no top-k or temperature adjustments are passed here.
outputs = model.generate(
    inputs["input_ids"],
    do_sample=True,
    max_length=42,
)

# Turn the first generated sequence back into a string and show it.
text_generated = tokenizer.decode(outputs[0])
print(text_generated)

a long time ago, in a galaxy far away. the earth is called earth, and there was a small settlement in the area where we made our first home. i'm afraid we've never seen it,


In [30]:
from transformers import AutoTokenizer, OpenAIGPTLMHeadModel

# Tokenizer and language-model head, both from the same OpenAI GPT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
model = OpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")

# Encode the prompt as PyTorch tensors.
inputs = tokenizer(
    "A long time ago, in a galaxy far",
    return_tensors="pt",
)

# The ** unpacks the encoding dictionary into keyword arguments.
# No do_sample flag is passed, so the default decoding strategy is used.
outputs = model.generate(
    **inputs,
    max_length=42,
)

# Turn the first generated sequence back into a string and show it.
text_generated = tokenizer.decode(outputs[0])
print(text_generated)

a long time ago, in a galaxy far away. 
 " i'm sorry, " i say. " i'm sorry i didn't tell you. " 
 " it's okay, " he says.


*With sampling (`do_sample=True`):* the model doesn't always pick the most likely token. Instead, it randomly draws the next token from the predicted probability distribution over possible tokens. This introduces an element of randomness, allowing for more diverse and creative outputs. Without sampling (the default, as in the cell above), the most likely token is chosen at every step, so repeated runs produce the same text.

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint to load; 'gpt2-medium', 'gpt2-large', etc. can be used instead.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Tokenize the prompt and pull out the tensors that `generate` needs.
input_text = "A long time ago"
inputs = tokenizer(input_text, return_tensors='pt')
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']

# Reuse the end-of-sequence id as the padding id during generation.
pad_token_id = tokenizer.eos_token_id

# Complete the prompt; no sampling flag is passed, so the default decoding is used.
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    pad_token_id=pad_token_id,
    max_length=42,
)

# Turn the first generated sequence back into a string.
generated_text = tokenizer.decode(output[0])

print(generated_text)

A long time ago, I was a little bit of a fan of the original Star Wars trilogy. I was a little bit of a fan of the original Star Wars trilogy. I was a little bit of a


Previous generation of NLP: representation models

In [2]:
from gensim.models import Word2Vec

# Toy corpus: every sentence is already split into a list of tokens.
sentences = [
    ['I', 'love', 'machine', 'learning'],
    ['I', 'enjoy', 'coding'],
    ['I', 'like', 'data', 'analysis'],
]

# Fit Word2Vec on the corpus; min_count=1 so that words seen once are kept.
model = Word2Vec(sentences, min_count=1)

# Keyed vectors: the word -> embedding lookup of the trained model.
word_vectors = model.wv

# Neighbours of 'learning' with their similarity scores.
similar_words = word_vectors.most_similar('learning')

# One "word score" pair per line.
for word, similarity in similar_words:
    print(word, similarity)

data 0.1459505707025528
like 0.041577354073524475
coding 0.03476494178175926
enjoy 0.01915225386619568
I 0.01613021455705166
love 0.008826184086501598
machine 0.004842504393309355
analysis -0.11410722881555557


In [5]:
# Display the embedding vector that the trained Word2Vec model stores for 'learning'.
word_vectors['learning']

array([ 8.13227147e-03, -4.45733406e-03, -1.06835726e-03,  1.00636482e-03,
       -1.91113955e-04,  1.14817743e-03,  6.11386076e-03, -2.02715401e-05,
       -3.24596534e-03, -1.51072862e-03,  5.89729892e-03,  1.51410222e-03,
       -7.24261976e-04,  9.33324732e-03, -4.92128357e-03, -8.38409644e-04,
        9.17541143e-03,  6.74942741e-03,  1.50285603e-03, -8.88256077e-03,
        1.14874600e-03, -2.28825561e-03,  9.36823711e-03,  1.20992784e-03,
        1.49006362e-03,  2.40640994e-03, -1.83600665e-03, -4.99963388e-03,
        2.32429506e-04, -2.01418041e-03,  6.60093315e-03,  8.94012302e-03,
       -6.74754381e-04,  2.97701475e-03, -6.10765442e-03,  1.69932481e-03,
       -6.92623248e-03, -8.69402662e-03, -5.90020278e-03, -8.95647518e-03,
        7.27759488e-03, -5.77203138e-03,  8.27635173e-03, -7.24354526e-03,
        3.42167495e-03,  9.67499893e-03, -7.78544787e-03, -9.94505733e-03,
       -4.32914635e-03, -2.68313056e-03, -2.71289347e-04, -8.83155130e-03,
       -8.61755759e-03,  

BERT

In [62]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Encode a single sentence into a tensor of token ids.
input_ids = tokenizer.encode("This is a simple example.", return_tensors="pt")

# Inference only: disable gradient tracking, as the other encoder cells in this
# notebook do, so no autograd graph is built for the forward pass.
with torch.no_grad():
    outputs = model(input_ids)

# Shape of the per-token hidden states, displayed as the cell result.
outputs.last_hidden_state.shape



torch.Size([1, 8, 768])

In [47]:
from transformers import BertTokenizer, BertModel
import torch

# Pre-trained BERT encoder and its tokenizer, loaded from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Two sentences processed together as one batch.
sentences = [
    "This is a simple example.",
    "Hugging Face makes it easy to use transformers.",
]

# Padding is enabled so both sentences fit into one rectangular tensor.
inputs = tokenizer(
    sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

# Forward pass without gradient tracking (inference only).
with torch.no_grad():
    outputs = model(**inputs)

# Take the hidden state at position 0 (the [CLS] token) of every sentence;
# it is commonly used as the sentence representation.
embeddings = outputs.last_hidden_state[:, 0]

# One embedding row per sentence: [batch_size, hidden_size].
print(embeddings.shape)

# The embedding values themselves.
print(embeddings)



torch.Size([2, 768])
tensor([[-0.2490, -0.1667, -0.2463,  ..., -0.4247,  0.4081,  0.8204],
        [-0.4613,  0.0933,  0.3775,  ..., -0.5355, -0.0947,  0.5958]])


In [65]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Pre-trained RoBERTa encoder and its tokenizer, loaded from the same checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Text to encode.
text = "Hello, this is a simple representation model using RoBERTa."

# Convert the text to PyTorch tensors.
inputs = tokenizer(text, return_tensors='pt')

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Hidden states of the final layer, one vector per token.
last_hidden_state = outputs.last_hidden_state
print("Shape of last hidden state:", last_hidden_state.shape)

# Position 0 holds the sequence-start token (RoBERTa's `<s>`, the counterpart of
# BERT's [CLS]); its vector is taken here as the sequence-level embedding.
cls_embedding = last_hidden_state[:, 0]
print("Shape of [CLS] token embedding:", cls_embedding.shape)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Shape of last hidden state: torch.Size([1, 15, 768])
Shape of [CLS] token embedding: torch.Size([1, 768])
