--- Set Proxy ---

In [1]:
import os

# 设置 HTTP 代理
os.environ['http_proxy'] = "http://127.0.0.1:7890"
os.environ['https_proxy'] = "http://127.0.0.1:7890"
os.environ['all_proxy'] = "socks5://127.0.0.1:7890"

# 确认代理已设置
print("HTTP Proxy:", os.environ.get('http_proxy'))
print("HTTPS Proxy:", os.environ.get('https_proxy'))
print("All Proxy:", os.environ.get('all_proxy'))

HTTP Proxy: http://127.0.0.1:7890
HTTPS Proxy: http://127.0.0.1:7890
All Proxy: socks5://127.0.0.1:7890


The Architecture In Action

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Configure quantization with BitsAndBytesConfig for 8-bit loading  gpu 
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the pre-trained model and tokenizer from the Hugging Face model hub
OPT = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", quantization_config=quantization_config)  # Load model in 8-bit mode to reduce memory usage
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")  # Load tokenizer for the same model

# Define a sample input sentence
inp = "The quick brown fox jumps over the lazy dog"

# Tokenize the input sentence and return tensors in PyTorch format
inp_tokenized = tokenizer(inp, return_tensors="pt") 

# Print the size of the tokenized input tensor (shows the dimensions)
print("inp_tokenized['input_ids'].size():", inp_tokenized['input_ids'].size())  

# Print the tokenized input (shows the token IDs corresponding to each word in the sentence)
print("inp_tokenized:", inp_tokenized)

  from .autonotebook import tqdm as notebook_tqdm


inp_tokenized['input_ids'].size(): torch.Size([1, 10])
inp_tokenized: {'input_ids': tensor([[    2,   133,  2119,  6219, 23602, 13855,    81,     5, 22414,  2335]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [3]:
# examine the model’s architecture
print(OPT.model)

OPTModel(
  (decoder): OPTDecoder(
    (embed_tokens): Embedding(50272, 2048, padding_idx=1)
    (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)
    (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-23): 24 x OPTDecoderLayer(
        (self_attn): OPTSdpaAttention(
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
        )
        (activation_fn): ReLU()
        (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=2048, out_features=8192, bias=True)
        (fc2): Linear(in_features=8192, out_features=2048, bias=True)
        (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      )


In [4]:
# examine the Input Embedding
embedded_input = OPT.model.decoder.embed_tokens(inp_tokenized['input_ids'])

print("Layer:\t", OPT.model.decoder.embed_tokens)
print("Size:\t", embedded_input.size())
print("Output:\t", embedded_input)


Layer:	 Embedding(50272, 2048, padding_idx=1)
Size:	 torch.Size([1, 10, 2048])
Output:	 tensor([[[-0.0407,  0.0519,  0.0574,  ..., -0.0263, -0.0355, -0.0260],
         [-0.0371,  0.0220, -0.0096,  ...,  0.0265, -0.0166, -0.0030],
         [-0.0455, -0.0236, -0.0121,  ...,  0.0043, -0.0166,  0.0193],
         ...,
         [ 0.0007,  0.0267,  0.0257,  ...,  0.0622,  0.0421,  0.0279],
         [-0.0126,  0.0347, -0.0352,  ..., -0.0393, -0.0396, -0.0102],
         [-0.0115,  0.0319,  0.0274,  ..., -0.0472, -0.0059,  0.0341]]],
       grad_fn=<EmbeddingBackward0>)


In [5]:
# examine the Positional Encoding
embed_pos_input = OPT.model.decoder.embed_positions(inp_tokenized['attention_mask'])

print("Layer:\t", OPT.model.decoder.embed_positions)
print("Size:\t", embed_pos_input.size())
print("Output:\t", embed_pos_input)


Layer:	 OPTLearnedPositionalEmbedding(2050, 2048)
Size:	 torch.Size([1, 10, 2048])
Output:	 tensor([[[-8.1406e-03, -2.6221e-01,  6.0768e-03,  ...,  1.7273e-02,
          -5.0621e-03, -1.6220e-02],
         [-8.0585e-05,  2.5000e-01, -1.6632e-02,  ..., -1.5419e-02,
          -1.7838e-02,  2.4948e-02],
         [-9.9411e-03, -1.4978e-01,  1.7557e-03,  ...,  3.7117e-03,
          -1.6434e-02, -9.9087e-04],
         ...,
         [ 3.6979e-04, -7.7454e-02,  1.2955e-02,  ...,  3.9330e-03,
          -1.1642e-02,  7.8506e-03],
         [-2.6779e-03, -2.2446e-02, -1.6754e-02,  ..., -1.3142e-03,
          -7.8583e-03,  2.0096e-02],
         [-8.6288e-03,  1.4233e-01, -1.9012e-02,  ..., -1.8463e-02,
          -9.8572e-03,  8.7662e-03]]], grad_fn=<EmbeddingBackward0>)


In [6]:
# examine the first layer’s selfattention component 
# Add token embeddings and positional embeddings
embed_position_input = embedded_input + embed_pos_input
# Pass the input through the self-attention mechanism of layer 0
hidden_states, _, _ = OPT.model.decoder.layers[0].self_attn(embed_position_input)
print("Layer:\t", OPT.model.decoder.layers[0].self_attn)
print("Size:\t", hidden_states.size())
print("Output:\t", hidden_states)

Layer:	 OPTSdpaAttention(
  (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
  (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
  (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
  (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
)
Size:	 torch.Size([1, 10, 2048])
Output:	 tensor([[[-0.0135, -0.0095,  0.0013,  ...,  0.0065, -0.0017,  0.0134],
         [-0.0130, -0.0102,  0.0022,  ...,  0.0087,  0.0004,  0.0124],
         [-0.0131, -0.0059,  0.0038,  ...,  0.0098,  0.0020,  0.0141],
         ...,
         [-0.0121, -0.0098,  0.0050,  ...,  0.0095,  0.0015,  0.0098],
         [-0.0119, -0.0101,  0.0051,  ...,  0.0094,  0.0011,  0.0091],
         [-0.0118, -0.0109,  0.0055,  ...,  0.0095,  0.0013,  0.0091]]],
       grad_fn=<ViewBackward0>)


The Encoder-Decoder Architecture

In [7]:
from transformers import AutoModel

# load the BART model
BART = AutoModel.from_pretrained("facebook/bart-large")

# examine the model’s architecture
print(BART)

BartModel(
  (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
  (encoder): BartEncoder(
    (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
    (layers): ModuleList(
      (0-11): 12 x BartEncoderLayer(
        (self_attn): BartSdpaAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=

In [8]:
from transformers import pipeline
import torch

# Check if CUDA (GPU) is available, and set device accordingly
device = 0 if torch.cuda.is_available() else -1

summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
sum = summarizer("""Gaga was best known in the 2010s for pop hits like “Poker Face” and avant-garde experimentation on albums like “Artpop,” and Bennett, a singer who mostly stuck to standards, was in his 80s when the pair met. And yet Bennett and Gaga became fast friends and close collaborators, which they remained until Bennett’s death at 96 on Friday. They recorded two albums together, 2014’s “Cheek to Cheek” and 2021’s “Love for Sale,” which both won Grammys for best traditional pop vocal album.""", min_length=20, max_length=50)

print(sum[0]['summary_text'])

Bennett and Gaga became fast friends and close collaborators. They recorded two albums together, 2014's "Cheek to Cheek" and 2021's "Love for Sale"


The Encoder-Only Architecture

In [9]:
from transformers import AutoModel

# load the BERT model  
BERT = AutoModel.from_pretrained("bert-base-uncased")

# examine the model’s architecture
print(BERT)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

The Decoder-Only Architecture

In [10]:
from transformers import AutoModel

# load the gpt2 model
GPT2 = AutoModel.from_pretrained("gpt2")

# examine the model’s architecture
print(GPT2)

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2SdpaAttention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)


In [4]:
from transformers import pipeline
import torch

# Check if CUDA (GPU) is available, and set device accordingly
device = 0 if torch.cuda.is_available() else -1

# Initialize a text generation pipeline using the GPT-2 model
generator = pipeline(model="gpt2",device=device)

# Generate text based on the input prompt
output = generator(
    "This movie was a very",
    do_sample=True,
    top_p=0.95,
    num_return_sequences=4,
    max_new_tokens=50,
    return_full_text=False
)

# Print each generated text sequence
for item in output:
    print(">", item['generated_text'])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


>  personal project for me, but then it's really a film I love.

How did The Girl with the Dragon Tattoo come to be?

I knew my name was coming in, and just then I started to develop that identity through
>  short one, but it's got a lot to offer and I think it's a great idea to do it in a more theatrical manner. The world looks more realistic than if you've seen some of the more recent films."

The film is
>  high profile and very high profile hit and I'm really looking forward to making it again, I am a huge fan.

Logan: So to find out who the cast of The Muppets are, you get that kind of an amazing
>  successful and well received reboot of the series. The original movie was a successful and well received adaptation. A sequel was written. There was a change in direction of the series and was released in 1977. Then the original was made into a movie and was


In [6]:
import numpy as np

def self_attention(query, key, value, mask=None):
    scores = np.dot(query, key.T)

    if mask is not None:
        # Set masked positions to a very large negative value to exclude them
        scores = scores + (mask * -1e9)

    # Apply softmax to obtain attention weights
    attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=-1, keepdims=True)

    # Compute weighted sum of value vectors
    output = np.dot(attention_weights, value)

    return output

# Example data for query, key, and value
np.random.seed(42)  # For reproducibility
query = np.random.rand(1, 4)  # One query vector with depth 4
key = np.random.rand(3, 4)    # Three key vectors with depth 4
value = np.random.rand(3, 4)  # Three value vectors with depth 4

# Binary mask (1 for valid positions, 0 for masked positions)
mask = np.array([[1, 1, 0]])  # Mask the last key by setting it to 0

# Perform self-attention
output = self_attention(query, key, value, mask=mask)

# Display results
print("Query:\n", query)
print("Key:\n", key)
print("Value:\n", value)
print("Mask:\n", mask)
print("Attention Output:\n", output)


Query:
 [[0.37454012 0.95071431 0.73199394 0.59865848]]
Key:
 [[0.15601864 0.15599452 0.05808361 0.86617615]
 [0.60111501 0.70807258 0.02058449 0.96990985]
 [0.83244264 0.21233911 0.18182497 0.18340451]]
Value:
 [[0.30424224 0.52475643 0.43194502 0.29122914]
 [0.61185289 0.13949386 0.29214465 0.36636184]
 [0.45606998 0.78517596 0.19967378 0.51423444]]
Mask:
 [[1 1 0]]
Attention Output:
 [[0.45606998 0.78517596 0.19967378 0.51423444]]


Cohere LLMs

In [2]:
import cohere

# Initialize the Cohere client (API key is omitted and will be read from the environment variable)
co = cohere.Client()

# Set up chat history and the question
response = co.chat(
    chat_history=[
        {"role": "USER", "message": "Who discovered gravity?"},
        {"role": "CHATBOT", "message": "The man who is widely credited with discovering gravity is Sir Isaac Newton"}
    ],
    message="What year was he born?",  
    connectors=[{"id": "web-search"}]  # # Perform web search before answering the question  Use the Cohere web-search connector
)

# Print the response
print(response.text)

Isaac Newton was born on 25 December 1642, according to the Julian calendar in use in England at the time.


Meta’s LLaMA 2

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Use torch.device for non-pipeline code
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model Use GPU
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device)  # Move model to device

# Prepare inputs
prompt = "Translate English to French: Configuration files are easy to use!"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate model output
outputs = model.generate(**inputs, max_new_tokens=100)

# Decode and print the output
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])