<a href="https://colab.research.google.com/github/tranghorn88/CS5720_Home_Assignment_3/blob/main/Home_Assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
### Implementing an RNN for Text Generation
# Import libraries
import tensorflow as tf
import numpy as np

# Load the dataset
path_to_file = tf.keras.utils.get_file("shakespeare.txt",
   "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt")
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
print(f"Length of text: {len(text)} characters")

# Preprocess the text
vocabulary = sorted(set(text))
char_to_idx = {u: i for i, u in enumerate(vocabulary)}
idx_to_char = np.array(vocabulary)
text_as_int = np.array([char_to_idx[c] for c in text])

# Create training examples
seq_length = 100
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

# Prepare batches
BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Build the model
vocabulary_size = len(vocabulary)
embedding_dim = 256
rnn_units = 1024

def build_model(vocabulary_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(batch_shape=(batch_size, None)),
        tf.keras.layers.Embedding(vocabulary_size, embedding_dim),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocabulary_size)
    ])
    return model

model = build_model(vocabulary_size, embedding_dim, rnn_units, batch_size=BATCH_SIZE)

# Compile and train the model
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

EPOCHS = 5
model.fit(dataset, epochs=EPOCHS)

# Generate text with temperature scaling
def generate_text(model, start_string, temperature=1.0, num_generate=500):
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []

    # Find the LSTM layer and reset its states
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.LSTM):
            layer.reset_states()
            break

    for _ in range(num_generate):
        predictions = model(input_eval)
        predictions = predictions[:, -1, :] / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx_to_char[predicted_id])

    return start_string + ''.join(text_generated)

# Rebuild model for generation (batch size = 1)
gen_model = build_model(vocabulary_size, embedding_dim, rnn_units, batch_size=1)
gen_model.set_weights(model.get_weights())
gen_model.build(tf.TensorShape([1, None]))

# Generate and print outputs for different temperatures to observe randomness
temperatures = [0.2, 0.8, 1.2]
for temp in temperatures:
    print(f"\n--- Generated Text at Temperature = {temp} ---")
    generated_text = generate_text(gen_model, start_string="To be, or not to be", temperature=temp)
    print(generated_text)
    print("-" * 80)




Length of text: 1115394 characters
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

--- Generated Text at Temperature = 0.2 ---
To be, or not to be consul,
And then the world the world of his head.

KING EDWARD IV:
What say you shall be to the hand of his hands.

CAPULET:
As it is the world he is a word of his son.

CAPULET:
A blood of this fair soul think you will have so many of the state,
And then the last of the world of his father,
And then the world of this son of strike,
And when you have done the sun that have been so fair and man the house:
And then the marriage of the world of his father.

CORIOLANUS:
I will to the part of the co
--------------------------------------------------------------------------------

--- Generated Text at Temperature = 0.8 ---
To be, or not to be:

First Senator:
Which, he would have we did be real of
'Twas straighted to the bastardness,
Even this nothing and for this resolving eld,
Which in commission like, how like a measure of my shame,
Both in

In [9]:
### NLP Preprocessing Pipeline
# Install and import NLTK
!pip install -q nltk

import nltk
nltk.download('stopwords')

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Define preprocessing function
def preprocess_func(sentence):
    # Tokenize using RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    print("1. Original Tokens:", tokens)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens_without_stopwords = [word for word in tokens if word.lower() not in stop_words]
    print("2. Tokens Without Stopwords:", tokens_without_stopwords)

    # Apply stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in tokens_without_stopwords]
    print("3. Stemmed Words:", stemmed_words)

# Test the function with the sentence
sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
preprocess_func(sentence)


[nltk_data] Downloading package stopwords to /root/nltk_data...


1. Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri']
2. Tokens Without Stopwords: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri']
3. Stemmed Words: ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri']


[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
### Named Entity Recognition with SpaCy
# Import library
import spacy

# Download the English language model
!python -m spacy download en_core_web_sm

# Load the model
nlp = spacy.load("en_core_web_sm")

# Input sentence
sentence = "Barack Obama served as the 44th President of the United States and won the Nobel Peace Prize in 2009."

# Process the sentence with spaCy
doc = nlp(sentence)

# Print each detected entity's text, label, and start and end character positions
print("Named Entities Detected:\n")
for ent in doc.ents:
    print(f"Text: {ent.text:30} - Label: {ent.label_:15} (Start Position: {ent.start_char} - End Position: {ent.end_char})")



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Named Entities Detected:

Text: Barack Obama                   - Label: PERSON          (Start Position: 0 - End Position: 12)
Text: 44th                           - Label: ORDINAL         (Start Position: 27 - End Position: 31)
Text: the United States              - Label: GPE             (Start Position: 45 - End Position: 62)
Text

In [11]:
### Scaled Dot-Product Attention
# Import libraries
import numpy as np

# Define the softmax function to normalize attention scores
def softmax_func(x):
    # Subtract max for numerical stability before applying exponentiation
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    # Devide by the sum to get softmax probabilities (row-wise)
    return e_x / np.sum(e_x, axis=-1, keepdims=True)

# Define the scaled dot-product attention function
def scaled_dot_product_attention_func(Q, K, V):
    # Get the dimension of the key vectors
    d_k = Q.shape[-1]
    # Compute raw attention scores by dot product of Q and K transpose, then scale
    scores = np.matmul(Q, K.T) / np.sqrt(d_k)
    # Apply softmax to get attention weights
    att_weights = softmax_func(scores)
    output = np.matmul(att_weights, V)
    return att_weights, output

# Test input matrices (Q, K, V)
Q = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
K = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
V = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

# Run scaled dot-product attention mechanism
weights, output = scaled_dot_product_attention_func(Q, K, V)

# Display attention weights matrix (after softmax)
print("Attention Weights:\n", weights)
# Display the final weighted sum of values (context vector)
print("Final Output:\n", output)

Attention Weights:
 [[0.73105858 0.26894142]
 [0.26894142 0.73105858]]
Final Output:
 [[2.07576569 3.07576569 4.07576569 5.07576569]
 [3.92423431 4.92423431 5.92423431 6.92423431]]


In [12]:
### Sentiment Analysis using HuggingFace Transformers
# Import the HuggingFace pipeline
from transformers import pipeline

# Load the pre-trained sentiment-analysis model
classifier = pipeline("sentiment-analysis")

# Input sentence to analyze
sentence = "Despite the high price, the performance of the new MacBook is outstanding."

# Run sentiment analysis
result = classifier(sentence)[0]

# Print sentiment label and confidence score
print("Sentiment:", result['label'])
print("Confidence Score:", round(result['score'], 4))



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Sentiment: POSITIVE
Confidence Score: 0.9998
