<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# NLP Basics

**Transformers**

&copy; Dr. Yves J. Hilpisch

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

_Code primarily from ChatGPT_.

## `transformers` Package

_From ChatGPT_.

## Use Cases

In [None]:
!git clone https://github.com/tpq-classes/natural_language_processing.git
import sys
sys.path.append('natural_language_processing')


In [None]:
# !pip install tensorflow
# !pip install tf-keras
!pip install transformers

In [None]:
import os

# TensorFlow's native logging picks up TF_CPP_MIN_LOG_LEVEL when the library
# is loaded, so the variable has to be in the environment before the import
# below; '3' filters INFO, WARNING and ERROR messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
import transformers

# Display the installed versions of both libraries
tf.__version__, transformers.__version__

In [None]:
import warnings

# Ignore all Python warnings from here on
warnings.simplefilter('ignore')

from transformers import logging as transformers_logging

# Raise the transformers log threshold to ERROR so that INFO and WARNING
# messages are no longer shown
transformers_logging.set_verbosity(transformers_logging.ERROR)

In [None]:
import os
# Set TF_CPP_MIN_LOG_LEVEL to '3' to reduce TensorFlow's native log output.
# NOTE(review): tensorflow is already imported in an earlier cell; this
# variable is presumably read when TensorFlow is loaded, so setting it here
# may have no effect in the same session - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

### Machine Translation

In [None]:
from transformers import pipeline

# Build an English-to-German translation pipeline; no model is named here,
# so the choice of checkpoint is left to the library
translator = pipeline(task='translation_en_to_de')

In [None]:
# Sample sentence for the translator
text = "Transformers are revolutionizing natural language processing."

# Run the translation; max_length=40 caps the length of the generated output
translation = translator(text, max_length=40)

# The pipeline returns a list of dicts; show the translated string of the
# first (and only) entry
german_text = translation[0]['translation_text']
print(german_text)

In [None]:
# Two-paragraph sample news text, reused below as a longer input for the
# translation and feature-extraction pipelines
news = """Group Chief Executive Masayoshi Son reiterated his bullish forecasts for artificial intelligence
in a speech Thursday that stressed advances made by OpenAI.

OpenAI raised $6.6 billion in a recent round of new funding that included a $500 million
investment by SoftBank, according to a person familiar with the matter."""

In [None]:
# Translate the longer news text; max_length=70 caps the length of the
# generated output
translation = translator(news, max_length=70)

# Show the translated string of the first (and only) result entry
german_news = translation[0]['translation_text']
print(german_news)

### Language Detection

In [None]:
# Initialize the zero-shot classification pipeline
classifier = pipeline('zero-shot-classification')

# Text in an unknown language
# (the accented character was previously mis-encoded as "tr√®s")
text = "Je suis très heureux aujourd'hui."
# text = "I am quite happy today."
# text = "Heute geht es mir wirklich gut."

# Candidate labels (languages)
labels = ['English', 'French', 'German', 'Spanish']

# Classify language: the language names are passed as the candidate labels
result = classifier(text, candidate_labels=labels)

# The first entry of result['labels'] is reported as the detected language
print("Detected Language:", result['labels'][0])

### Feature Extraction

In [None]:
# Build the feature extraction pipeline
feature_extractor = pipeline(task='feature-extraction')

# Input sentence
text = "Transformers provide embeddings for NLP tasks."

# Run the extractor on the sentence
features = feature_extractor(text)

# features[0] holds one embedding vector per token of the input
token_vectors = features[0]
print(f"Feature vector length: {len(token_vectors)} tokens")
print(f"Embedding size per token: {len(token_vectors[0])}")

In [None]:
# Run the extractor on the longer news text
features = feature_extractor(news)

# features[0] holds one embedding vector per token of the input
news_token_vectors = features[0]
print(f"Feature vector length: {len(news_token_vectors)} tokens")
print(f"Embedding size per token: {len(news_token_vectors[0])}")

### Text Classification

In [None]:
# Build the text classification pipeline
classifier = pipeline(task='text-classification')

# Sentence to classify (swap in the commented alternative to compare)
text = "The stock market crashed due to unforeseen circumstances."
# text = "Today was a good day for stock market investors."

# The pipeline returns a list; keep the entry for the single input
result = classifier(text)[0]

# Report the predicted label with its score rounded to four decimals
predicted_label = result['label']
rounded_score = round(result['score'], 4)
print(f"Label: {predicted_label}, Score: {rounded_score}")

### Zero-Shot Classification

In [None]:
# Build the zero-shot classification pipeline
classifier = pipeline(task='zero-shot-classification')

# Sentence to classify (swap in the commented alternative to compare)
text = "I love to program in Python and build machine learning models."
# text = "Bayern Munich is the record holder for German Bundesliga championships."

# Topics the sentence is scored against
labels = ['education', 'politics', 'technology', 'sports', 'programming']

# Score the sentence against every candidate label
result = classifier(text, candidate_labels=labels)

print("Sequence:", result['sequence'])
print("Labels and scores:")
# Pair each returned label with its score and print one line per pair
label_score_pairs = zip(result['labels'], result['scores'])
for label, score in label_score_pairs:
    print(f"{label}: {round(score, 4)}")

In [None]:
!pip install torch

### Sentence Similarity

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

# Checkpoint used for both the tokenizer and the encoder
model_name = 'sentence-transformers/paraphrase-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# from_pt=True asks for the PyTorch weights to be loaded into the TensorFlow model class
model = TFAutoModel.from_pretrained(model_name, from_pt=True)

# The two sentences whose similarity is measured
sentence1 = "A man is playing a guitar."
# sentence2 = "Several men are playing guitars."
sentence2 = "Someone is strumming a musical instrument."
# sentence2 = "A man is playing a piano."

# Turn each sentence into TensorFlow input tensors
encoded1 = tokenizer(sentence1, return_tensors='tf')
encoded2 = tokenizer(sentence2, return_tensors='tf')

# Token-level embeddings, shape (1, sequence_length, hidden_size), averaged
# over the sequence axis to give one (1, hidden_size) vector per sentence
embedding1 = tf.reduce_mean(model(**encoded1).last_hidden_state, axis=1)
embedding2 = tf.reduce_mean(model(**encoded2).last_hidden_state, axis=1)

# The Keras function is a loss (negative cosine similarity), so the sign is
# flipped to obtain the similarity score
cosine_similarity = -tf.keras.losses.cosine_similarity(embedding1, embedding2, axis=1)

print(f"Cosine Similarity: {cosine_similarity.numpy()[0]:.4f}")

### Emotion Detection

In [None]:
# Initialize the zero-shot classification pipeline
classifier = pipeline('zero-shot-classification')

# Text expressing emotion; only one sample is active at a time, the others
# are kept as commented-out alternatives instead of being assigned and
# immediately overwritten
# text = "I can't believe I won the lottery!"
# text = "I am angry at our judicial system."
text = "I am afraid of my next mathematics exam."

# Candidate labels
labels = ['joy', 'sadness', 'anger', 'fear', 'surprise']

# Classify emotion: the emotion names are passed as the candidate labels
result = classifier(text, candidate_labels=labels)

# The first entry of result['labels'] is reported as the detected emotion
print("Detected Emotion:", result['labels'][0])

In [None]:
# Display the complete classifier output for the emotion example
result

### Extractive Q&A From Documents

In [None]:
# Build the extractive question-answering pipeline
qa_pipeline = pipeline(task='question-answering')

# Passage the answers are extracted from (could equally be read from a file)
context = """
The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.
It is named after the engineer Gustave Eiffel, whose company designed and built the tower.
Constructed from 1887 to 1889 as the entrance to the 1889 World's Fair,
it was initially criticized by some of France's leading artists and intellectuals
for its design, but it has become a global cultural icon of France and one of the most
recognizable structures in the world.
"""

# Question to ask about the passage
question = "Who designed the Eiffel Tower?"

# Query the pipeline and show the extracted answer text
answer = qa_pipeline(context=context, question=question)
print(f"Answer: {answer['answer']}")

In [None]:
# Second question about the same passage
question = "When was the Eiffel Tower built?"

# Query the pipeline and show the extracted answer text
answer = qa_pipeline(context=context, question=question)
print(f"Answer: {answer['answer']}")

In [None]:
# Define a question
question = "What type of structure is the Eiffel Tower?"

# Get the answer (the question was previously defined but never passed to
# the pipeline before being overwritten further down)
answer = qa_pipeline(question=question, context=context)

print(f"Answer: {answer['answer']}")



In [None]:
# Read the article file from the cloned course repository. The path is
# relative to the working directory the repository was cloned into, so the
# cell is not tied to Colab's /content directory. The encoding is given
# explicitly instead of relying on the platform default
# (assumes the file is UTF-8 - confirm).
with open('natural_language_processing/article.txt', 'r', encoding='utf-8') as f:
    article = f.read()

In [None]:
# Preview the first 500 characters of the article
article[:500]

In [None]:
# Question to ask about the article text read from file
question = "How much capital did OpenAI raise?"

# Query the pipeline with the article as context and show the answer text
answer = qa_pipeline(context=article, question=question)
print(f"Answer: {answer['answer']}")

<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>