(**Click the icon below to open this notebook in Colab**)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/xiangshiyin/machine-learning-for-actuarial-science/blob/main/2025-spring/week15/notebook/demo.ipynb)

# Introduction to NLP

## Preprocessing

In [None]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords


data = "This is a simple example to demonstrate removing stopwords using NLTK."
stopWords = set(stopwords.words('english'))

In [None]:
len(stopWords)

In [None]:
stopwords.words('english')[:10]

In [None]:
tokenized_data = word_tokenize(data)

In [None]:
# Show the raw sentence next to its tokens; tokens are joined with "|" so the
# token boundaries are visible.
print(f"Original text: {data}")
# Single quotes inside the replacement field: reusing the outer double quote
# there is a SyntaxError on Python < 3.12 (only PEP 701 made it legal).
print(f"Tokenized text: {'|'.join(tokenized_data)}")

In [None]:
# Drop stopwords. The NLTK English stopword entries are lowercase, so the token
# is lowercased for the membership test only; otherwise capitalised stopwords
# such as the leading "This" survive the filter. The kept tokens retain their
# original casing.
filtered_tokenized_data = [
    w
    for w in tokenized_data
    if w.lower() not in stopWords
]
print(f"After removing stopwords: {filtered_tokenized_data}")

In [None]:
# Side-by-side summary of the three preprocessing stages.
# Inner string literals use single quotes: nesting the same quote character as
# the enclosing f-string is a SyntaxError on Python < 3.12.
print(f"Original text: {data}")
print(f"Tokenized text: {'|'.join(tokenized_data)}")
print(f"After removing stopwords: {'|'.join(filtered_tokenized_data)}")

## Feature Extraction

### Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Sample dataset
texts = [
    "I love this product",         # positive
    "This is amazing",             # positive
    "Very happy with the result",  # positive
    "I hate this",                 # negative
    "Worst experience ever",       # negative
    "Not satisfied at all"         # negative
]

labels = [1, 1, 1, 0, 0, 0]  # 1 = positive, 0 = negative

# 2. Convert text to bag-of-words vectors
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

# 3. Show feature names
print("Feature Names (Vocabulary):")
print(vectorizer.get_feature_names_out())


In [None]:
X.toarray()

In [None]:
X.shape

### TF-IDF

In [None]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import nltk

# Download NLTK movie_reviews data
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

# Prepare dataset
docs = []
labels = []

for fileid in movie_reviews.fileids():
    docs.append(movie_reviews.raw(fileid))
    labels.append(movie_reviews.categories(fileid)[0])  # 'pos' or 'neg'

# Convert labels to binary format
y = [1 if label == 'pos' else 0 for label in labels]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(docs, y, test_size=0.2, random_state=42)

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
feature_names = vectorizer.get_feature_names_out()
feature_names[:20]

In [None]:
import pandas as pd

# Choose a sample document from the test set
sample_idx = 0
sample_vector = X_test_tfidf[sample_idx]

# Convert sparse vector to dense and create DataFrame
df_features = pd.DataFrame(
    data=sample_vector.toarray()[0],
    index=feature_names,
    columns=["tfidf"]
)

# Filter non-zero features and sort
df_nonzero = df_features[df_features.tfidf > 0].sort_values(by="tfidf", ascending=False)

# Show top 15 features by TF-IDF weight
print("\nTop TF-IDF features in sample test document:")
print(df_nonzero.head(15))

### Word2Vec

#### Hand-crafted implementation

In [None]:
import numpy as np
import re
import random

# Sample corpus
corpus = "The quick brown fox jumps over the lazy dog"

# Preprocessing: lowercase the corpus and extract the word tokens
tokens = re.findall(r'\b\w+\b', corpus.lower())
vocab = set(tokens)
# Number the words in sorted order. Enumerating the set directly would follow
# its iteration order, which for strings can change from one kernel start to
# the next, so the word <-> index mapping (and every printed result that
# depends on it) would not be reproducible.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab))}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
vocab_size = len(vocab)

In [None]:
word_to_idx

In [None]:
idx_to_word

In [None]:
tokens

In [None]:
# Generate training data
def generate_training_data(tokens, window_size):
    """Build skip-gram (target index, context index) pairs from a token list.

    For every position, each token at most `window_size` places to the left
    or right (clipped at the ends of `tokens`) is paired with the token at
    that position. Words are converted to indices with the notebook-level
    `word_to_idx` mapping. Pairs are emitted position by position, left
    neighbours before right neighbours.
    """
    n_tokens = len(tokens)
    pairs = []
    for position, center_word in enumerate(tokens):
        center_id = word_to_idx[center_word]
        left = range(max(0, position - window_size), position)
        right = range(position + 1, min(n_tokens, position + window_size + 1))
        pairs.extend(
            (center_id, word_to_idx[tokens[neighbor]])
            for neighbor in (*left, *right)
        )
    return pairs

window_size = 2
training_data = generate_training_data(tokens, window_size)


In [None]:
# Inspect the training data
print(f"Corpus: {corpus}")
print([
    (idx_to_word[t[0]], idx_to_word[t[1]])
    for t in training_data
])

In [None]:
# Initialize parameters
embedding_dim = 10
W1 = np.random.randn(vocab_size, embedding_dim)
W2 = np.random.randn(embedding_dim, vocab_size)

# Sigmoid function
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)) of a scalar or array.

    Computed as exp(-log(1 + exp(-x))) with np.logaddexp, which never forms
    exp(-x) directly. The naive formula overflows in exp(-x) for large
    negative x (NumPy emits a RuntimeWarning); this form returns the same
    limiting values (0 and 1) without overflowing.
    """
    return np.exp(-np.logaddexp(0, -x))

# Training parameters
epochs = 1000
learning_rate = 0.01
num_negative_samples = 2

# Training loop: skip-gram with negative sampling. Every (target, context) pair
# triggers one SGD step for the positive pair and one step per negative sample.
for epoch in range(epochs):
    # Loss is accumulated over all pairs of the epoch and reset each epoch.
    loss = 0
    for target_idx, context_idx in training_data:
        # Positive sample
        # NOTE: integer indexing makes h a view of row target_idx of W1, so the
        # in-place updates to W1[target_idx] below also change h. The negative
        # sampling steps therefore see the already-updated target embedding.
        h = W1[target_idx]
        u = np.dot(h, W2[:, context_idx])
        pred = sigmoid(u)
        # Derivative of -log(sigmoid(u)) with respect to u (label is 1).
        error = pred - 1
        # 1e-7 keeps log() away from log(0).
        loss += -np.log(pred + 1e-7)
        # Gradients (both computed before either weight matrix is modified)
        grad_W2 = error * h
        grad_W1 = error * W2[:, context_idx]
        # Update weights
        W2[:, context_idx] -= learning_rate * grad_W2
        W1[target_idx] -= learning_rate * grad_W1

        # Negative sampling
        # Drawn uniformly without replacement from every index except
        # context_idx; the target word itself and other true context words are
        # not excluded.
        negative_samples = random.sample([i for i in range(vocab_size) if i != context_idx], num_negative_samples)
        for neg_idx in negative_samples:
            u_neg = np.dot(h, W2[:, neg_idx])
            pred_neg = sigmoid(u_neg)
            # Derivative of -log(1 - sigmoid(u)) with respect to u (label is 0).
            error_neg = pred_neg
            loss += -np.log(1 - pred_neg + 1e-7)
            # Gradients
            grad_W2_neg = error_neg * h
            grad_W1_neg = error_neg * W2[:, neg_idx]
            # Update weights
            W2[:, neg_idx] -= learning_rate * grad_W2_neg
            W1[target_idx] -= learning_rate * grad_W1_neg
    # Report the accumulated epoch loss every 100 epochs.
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")

In [None]:
# Retrieve word embeddings
word_embeddings = W1

# Example: Find similar words
def find_similar(word, top_n=3):
    """Print the `top_n` vocabulary words closest to `word`.

    Closeness is the cosine similarity between rows of `word_embeddings`;
    the query word itself is left out. If `word` is not in `word_to_idx`,
    a notice is printed and nothing else happens. Returns None.
    """
    if word not in word_to_idx:
        print(f"'{word}' not in vocabulary.")
        return
    query_idx = word_to_idx[word]
    query_vec = word_embeddings[query_idx]

    def cosine(other_vec):
        # Cosine of the angle between the query embedding and other_vec.
        return np.dot(query_vec, other_vec) / (np.linalg.norm(query_vec) * np.linalg.norm(other_vec))

    scored = [
        (idx_to_word[other_idx], cosine(word_embeddings[other_idx]))
        for other_idx in range(vocab_size)
        if other_idx != query_idx
    ]
    # Highest similarity first.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    for neighbor, score in ranked[:top_n]:
        print(f"{neighbor}: {score:.4f}")

# Test the model
print("\nWords similar to 'fox':")
find_similar('fox')

#### With `Gensim`

In [None]:
import gensim
from gensim.models import Word2Vec

# Sample corpus
sentences = [
    ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"],
    ["i", "love", "natural", "language", "processing"],
    ["word2vec", "is", "a", "technique", "for", "natural", "language", "processing"],
    ["the", "dog", "is", "lazy", "but", "the", "brown", "fox", "is", "quick"]
]


In [None]:
# Initialize and train the model
model = Word2Vec(
    sentences,
    vector_size=100,  # Dimensionality of the word vectors
    window=5,         # Maximum distance between the current and predicted word
    min_count=1,      # Ignores all words with total frequency lower than this
    workers=4,        # Use these many worker threads to train the model
    sg=1              # 1 for Skip-gram; 0 for CBOW
)

In [None]:
# Find most similar words
similar_words = model.wv.most_similar("fox", topn=3)
print(similar_words)

# Compute similarity between two words
similarity = model.wv.similarity("dog", "fox")
print(f"Similarity between 'dog' and 'fox': {similarity:.4f}")

In [None]:
sample = """
Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. Mrs. Dursley was thin and blonde and had
nearly twice the usual amount of neck, which came in very useful as she
spent so much of her time craning over garden fences, spying on the
neighbors. The Dursleys had a small son called Dudley and in their
opinion there was no finer boy anywhere.


The Dursleys had everything they wanted, but they also had a secret, and
their greatest fear was that somebody would discover it. They didn't
think they could bear it if anyone found out about the Potters. Mrs.
Potter was Mrs. Dursley's sister, but they hadn't met for several years;
in fact, Mrs. Dursley pretended she didn't have a sister, because her
sister and her good-for-nothing husband were as unDursleyish as it was
possible to be. The Dursleys shuddered to think what the neighbors would
say if the Potters arrived in the street. The Dursleys knew that the
Potters had a small son, too, but they had never even seen him. This boy
was another good reason for keeping the Potters away; they didn't want
Dudley mixing with a child like that.


When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story
starts, there was nothing about the cloudy sky outside to suggest that
strange and mysterious things would soon be happening all over the
country. Mr. Dursley hummed as he picked out his most boring tie for
work, and Mrs. Dursley gossiped away happily as she wrestled a screaming
Dudley into his high chair.
"""

# Split the excerpt on blank lines and run gensim's simple_preprocess on each
# chunk; every chunk becomes one token list, i.e. one training "sentence"
# for Word2Vec.
sentences = list(map(gensim.utils.simple_preprocess, sample.split("\n\n")))

In [None]:
model = Word2Vec(
    sentences,
    vector_size=100,  # Dimensionality of the word vectors
    window=5,         # Maximum distance between the current and predicted word
    min_count=1,      # Ignores all words with total frequency lower than this
    workers=4,        # Use these many worker threads to train the model
    sg=1              # 1 for Skip-gram; 0 for CBOW
)

In [None]:
# Find most similar words
similar_words = model.wv.most_similar("potter", topn=3)
print(similar_words)

### `GloVe`

- https://nlp.stanford.edu/projects/glove/
- https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

In [None]:
import gensim.downloader

In [None]:
# All available models in gensim-data
# The loop variable is deliberately not called `model`: that name holds the
# Word2Vec model trained in the earlier cells, and rebinding it to a string
# here would break those cells when they are re-run.
for model_name in gensim.downloader.info()['models'].keys():
    print(model_name)

In [None]:
glove_vectors = gensim.downloader.load('glove-twitter-25')

In [None]:
glove_vectors.most_similar('twitter', topn=20)

In [None]:
glove_vectors.most_similar('president', topn=20)

In [None]:
glove_vectors.most_similar('usa', topn=20)

In [None]:
glove_vectors.get_vector('king')

In [None]:
king = glove_vectors.get_vector('king')
queen = glove_vectors.get_vector('queen')
man = glove_vectors.get_vector('man')
woman = glove_vectors.get_vector('woman')

res = king - man + woman

In [None]:
# calculate the cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(res.reshape(1, -1), queen.reshape(1, -1))
print(f"Similarity between queen and res: {similarity[0][0]}")

In [None]:
# calculate the cosine similarity of two vectors following the linear algebra formula
import numpy as np

def cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors: dot(v1, v2) / (|v1| * |v2|).

    NOTE(review): this name shadows the `cosine_similarity` imported from
    sklearn.metrics.pairwise in the previous cell. After this cell has run,
    re-running that cell's call with its (1, n)-shaped inputs will hit this
    function instead of sklearn's.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product


# Prompt Engineering

## Quick Example

In [None]:
import openai
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(), override=True) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text used as the content of the one user message.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the API response.

    The request uses temperature=0. Works with both generations of the
    `openai` package: releases >= 1.0 removed `openai.ChatCompletion`, so the
    module-level client (`openai.chat.completions`) is used there, and the
    legacy call is kept for older installs. Both paths use the API key set on
    the `openai` module.
    """
    messages = [{"role": "user", "content": prompt}]
    request = dict(model=model, messages=messages, temperature=0)

    # The `OpenAI` client class only exists in openai>=1.0.
    if hasattr(openai, "OpenAI"):
        response = openai.chat.completions.create(**request)
        return response.choices[0].message.content

    # Legacy interface (openai<1.0).
    response = openai.ChatCompletion.create(**request)
    return response.choices[0].message["content"]

In [None]:
prompt = "what is the capital of France?"
response = get_completion(prompt)
print(response)

In [None]:
prompt = "If there are 3 apples and you take away 2, how many in total?"
response = get_completion(prompt)
print(response)

In [None]:
text = f"""
Cooking ma po tofu is easy. First, you need to buy some tofu. Then you need to heat some oil in a pan.
After that, you need to add the tofu to the pan. Then you need to cook the tofu. After that, you need 
to add some seasoning to the tofu. Some people might first cook some ground beef and then add the tofu.
And that's it! You have cooked some delicious tofu. Enjoy!
"""

prompt = f"""
You will be provided with text delimited by triple quotes. If the content contains a sequence of instructions,
re-write those instructions in the following format:

Step 1 - ...
Step 2 - ...
...
Step N - ...
If the content does not contain a sequence of instructions, then simply write \"No steps provided.\"
\"\"\"{text}\"\"\"
"""

response = get_completion(prompt)
print("Completion for Text-to-Step transformation:")
print(response)


## Tokens

In [None]:
import tiktoken

tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [None]:
tokenizer.encode('tiktoken is great!')

In [None]:
def num_tokens_from_string(string: str, model_name: str = "gpt-3.5-turbo") -> int:
    """Count the tokens `string` encodes to with the tiktoken encoding for `model_name`."""
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))

In [None]:
num_tokens_from_string(prompt)

In [None]:
num_tokens_from_string(response)

In [None]:
# turn tokens into text
tokenizer.decode([83, 1609, 5963, 374, 2294, 0])

## More Examples

### Avoid prompt injection

### Format

In [None]:
text = f"""
Cooking ma po tofu is easy. First, you need to buy some tofu. Then you need to heat some oil in a pan.
After that, you need to add the tofu to the pan. Then you need to cook the tofu. After that, you need 
to add some seasoning to the tofu. Some people might first cook some ground beef and then add the tofu.
And that's it! You have cooked some delicious tofu. Enjoy!
"""

prompt = f"""
You will be provided with text delimited by triple quotes. If the content contains a sequence of instructions,
re-write those instructions in the following format:

Step 1 - ...
Step 2 - ...
...
Step N - ...
If the content does not contain a sequence of instructions, then simply write \"No steps provided.\"
Please provide the response in JSON format with the following keys:
step_numbers, steps
\"\"\"{text}\"\"\"
"""

response = get_completion(prompt)
print("Completion for Text-to-Step transformation:")
print(response)

### Check if condition is met

### Control the length

### Few-shot prompting

In [None]:
prompt = """
Please answer questions in a consistent style.

Q: How can I become a kungfu master?
A: Empty your mind, be formless. Shapeless, like water. If you put water into a cup, it becomes the cup. You put water into a bottle and it becomes the bottle. You put it in a teapot, it becomes the teapot. Now, water can flow or it can crash. Be water, my friend.
Q: How can I become a good leader?
"""

response = get_completion(prompt)
print(response)


### A math problem (Coursera example)

In [None]:
prompt = """
Determine if the student's solution is correct or not.

Question:
I'm building a solar power installation and I need
 help working out the financials. 
- Land costs $100 / square foot
- I can buy solar panels for $250 / square foot
- I negotiated a contract for maintenance that will cost 
me a flat $100k per year, and an additional $10 / square
foot
What is the total cost for the first year of operations 
as a function of the number of square feet.

Student's Solution:
Let x be the size of the installation in square feet.
Costs:
1. Land cost: 100x
2. Solar panel cost: 250x
3. Maintenance cost: 100,000 + 100x
Total cost: 100x + 250x + 100,000 + 100x = 450x + 100,000
"""

response = get_completion(prompt)
print(response)

In [None]:
len(response)

### Hallucinations

In [None]:
prompt = "Tell me about the architecture Xiangshi Yin"

response = get_completion(prompt)
print(response)

## Iterative Solution

In [None]:
prompt = "Tell me about the self-attention mechanism in transformers."

## Summarize

In [None]:
review = """
First year changing from Milorganite.....Have already spread over 3 acres once, waiting on my next shipment. This stuff spreads super easy and I am glad they increased the size of bags. We only had one issue and it is when my daughter had her basketball team over, it is good for PGF and bad.....maybe. I decided to use the spreading of fertilizer as conditioning drill, which seemed like a win/win. All the girls did well until it got to KOBI. Kobi started sprinting with the spreader and I was attempting to get her to slow down. By the time I got to her to explain why I needed an even spread on my beautiful lawn, she slipped in a giant St. Bernard turd. Unfortunately she was wearing her basketball shoes for some reason so they got ruined....but that is not the worst part. When she fell she went forward and landed face first in the spreader with all the fertilizer. Which would not have been a problem but Kobi is the most out of shape person on the team and when she began sprinting with the spreader she immediately broke out in a sweat. The PGF fertilizer stuck to her face and when she looked up it looked like a young version of the bearded lady from the carnival. After I quit laughing, I attempted to help her wash off her face. Well actually I asked her to pick up all the fertilizer she spilled first because this stuff is not cheap. Then we washed off her face but she screamed and screamed. I thought it was because of her prepubescent acne face. However, I now believe it was because PGF started to work instantly, with the moisture on her face. That was late February and her mother just called me in April and stated her daughter is growing a full beard and mustache. It was unfortunate because we had a basketball tournament and they would not let her play because they did not believe she was a girl. The hair on her face is thick and rich, which makes me think this is a great product. But you might want to keep it away from the Kobi's of the world.
"""


## Inferring
- Sentiment (positive/negative)
- Identify types of emotions
- Identify the subject of the text
- Identify the entities (product and company)
- Multiple tasks at once

In [None]:
prompt = f"""
Help me identify the entities and relations present in the following product review:
Review:
```
{review}
```
"""

response = get_completion(prompt)
print(response)

In [None]:
prompt = f"""
Help me identify the entities (product, company, etc.) and relations present in the following product review:
Review:
```
{review}
```
"""

response = get_completion(prompt)
print(response)

## Transform

### Translation

In [None]:
prompt = """
Translate the following English text to Chinese:

Hi, I would like to order a ma po tofu.
"""


### Tone transformation

### Spell checks (example from Coursera)

In [None]:
text = [ 
  "The girl with the black and white puppies have a ball.",  # The girl has a ball.
  "Yolanda has her notebook.", # ok
  "Its going to be a long day. Does the car need it’s oil changed?",  # Homonyms
  "Their goes my freedom. There going to bring they’re suitcases.",  # Homonyms
  "Your going to need you’re notebook.",  # Homonyms
  "That medicine effects my ability to sleep. Have you heard of the butterfly affect?", # Homonyms
  "This phrase is to cherck chatGPT for speling abilitty"  # spelling
]

### Reply to Customer Emails

In [None]:
# The f-string below interpolates `sentiment`, which no earlier cell defines,
# so this cell raised NameError. It is hard-coded here for the demo (the review
# ends by calling the fertilizer "a great product").
# TODO: replace with the sentiment inferred from `review` by the model.
sentiment = "positive"

# Prompt asking the model to write a reply email whose tone depends on the
# review sentiment. A trailing backslash joins the line with the next one.
prompt = f"""
You are a customer service AI assistant.
Your task is to send an email reply to a valued customer.
Given the customer email delimited by ```, \
Generate a reply to thank the customer for their review.
If the sentiment is positive or neutral, thank them for \
their review.
If the sentiment is negative, apologize and suggest that \
they can reach out to customer service. 
Make sure to use specific details from the review.
Write in a concise and professional tone.
Sign the email as `AI customer agent`.
Customer review: ```{review}```
Review sentiment: {sentiment}
"""

## Chatbot

In [None]:
messages =  [  
{'role':'system', 'content':'You are an assistant that speaks like Shakespeare.'},    
{'role':'user', 'content':'tell me a joke'},   
{'role':'assistant', 'content':'Why did the chicken cross the road'},   
{'role':'user', 'content':'I don\'t know'}  ]

In [None]:
x = input("Tell me a joke: ")

In [None]:
print(x)