In [3]:
# NLPK procedure

!pip install nltk

import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Sample text
sample_text = """Hello there! Welcome to Finance 7047. This course, which is offered in Spring 2025, 
is designed to provide a good understanding of Financial Technologies and Cryptocurrency."""

print(f"Original Text:\n{sample_text}\n")

# Step 1: Convert to Lowercase
lowercase_text = sample_text.lower()
print(f"After Lowercasing:\n{lowercase_text}\n")

# Step 2: Remove Punctuation
# Using regular expressions to remove punctuation marks
cleaned_text = re.sub(f"[{re.escape(string.punctuation)}]", "", lowercase_text)
print(f"After Removing Punctuation:\n{cleaned_text}\n")

# Step 3: Tokenization
# Split text into words (tokens)
tokens = word_tokenize(cleaned_text)
print(f"After Tokenization:\n{tokens}\n")

# Step 4: Remove Stopwords
# Load NLTK's list of English stopwords
stop_words = set(stopwords.words("english"))
filtered_tokens = [word for word in tokens if word not in stop_words]
print(f"After Removing Stopwords:\n{filtered_tokens}\n")

# Step 5: Stemming (Optional)
# Use Porter Stemmer to reduce words to their root form
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print(f"After Stemming:\n{stemmed_tokens}\n")

# Step 6: Lemmatization (Alternative to Stemming)
# Use WordNet Lemmatizer to get the base form of words
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print(f"After Lemmatization:\n{lemmatized_tokens}\n")

# Final Preprocessed Text
final_text = " ".join(lemmatized_tokens)
print(f"Final Preprocessed Text:\n{final_text}")


Original Text:
Hello there! Welcome to Finance 7047. This course, which is offered in Spring 2025, 
is designed to provide a good understanding of Financial Technologies and Cryptocurrency.

After Lowercasing:
hello there! welcome to finance 7047. this course, which is offered in spring 2025, 
is designed to provide a good understanding of financial technologies and cryptocurrency.

After Removing Punctuation:
hello there welcome to finance 7047 this course which is offered in spring 2025 
is designed to provide a good understanding of financial technologies and cryptocurrency

After Tokenization:
['hello', 'there', 'welcome', 'to', 'finance', '7047', 'this', 'course', 'which', 'is', 'offered', 'in', 'spring', '2025', 'is', 'designed', 'to', 'provide', 'a', 'good', 'understanding', 'of', 'financial', 'technologies', 'and', 'cryptocurrency']

After Removing Stopwords:
['hello', 'welcome', 'finance', '7047', 'course', 'offered', 'spring', '2025', 'designed', 'provide', 'good', 'understan

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yut3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yut3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yut3\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yut3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# TextBlob -- sentiment analysis

#!pip install textblob
from textblob import TextBlob

# Sample sentences
sentences = [
    "I love this product! It’s absolutely fantastic.",
    "This is the worst experience I have ever had.",
    "I'm not very happy with the service.",
    "The movie was great and very enjoyable!",
    "I am extremely disappointed with the quality of this item."
]

# Perform sentiment analysis on each sentence
for sentence in sentences:
    blob = TextBlob(sentence)  # Create a TextBlob object
    sentiment_score = blob.sentiment.polarity  # Get the polarity score (-1 to 1)
    
    # Determine sentiment based on polarity score
    if sentiment_score > 0:
        sentiment = "Positive"
    elif sentiment_score < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    
    # Print the sentence and its corresponding sentiment
    print(f"Sentence: {sentence}\nSentiment: {sentiment}\n")


Sentence: I love this product! It’s absolutely fantastic.
Sentiment: Positive

Sentence: This is the worst experience I have ever had.
Sentiment: Negative

Sentence: I'm not very happy with the service.
Sentiment: Negative

Sentence: The movie was great and very enjoyable!
Sentiment: Positive

Sentence: I am extremely disappointed with the quality of this item.
Sentiment: Negative



In [5]:
# Topic modeling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

# Sample documents for topic modeling
documents = [
    "I love to eat pizza and pasta.",
    "The economy is facing a downturn with rising unemployment.",
    "Pizza and burgers are my favorite fast foods.",
    "The government announced new policies to tackle inflation.",
    "AI and machine learning are transforming the technology industry.",
    "The stock market is experiencing a significant drop.",
    "Artificial intelligence is changing how we interact with technology.",
    "The economic impact of the pandemic has been severe."
]
# Step 1: Vectorize the text (Bag-of-Words)
# Use CountVectorizer to create the document-term matrix
vectorizer = CountVectorizer(stop_words='english')  # Remove common English stopwords
X = vectorizer.fit_transform(documents)
# Step 2: Apply LDA to extract topics
lda = LatentDirichletAllocation(n_components=2, random_state=42)  # Number of topics set to 2
lda.fit(X)

# Step 3: Display the top words in each topic
num_top_words = 5  # Number of top words to display for each topic
feature_names = vectorizer.get_feature_names_out()  # Get the feature names (words)

# Create a dictionary to store top words for each topic
topics = {}
for topic_idx, topic in enumerate(lda.components_):
    topics[f'Topic #{topic_idx + 1}'] = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]

# Convert to a DataFrame for easy display
topics_df = pd.DataFrame(topics)

# Display the DataFrame with the top words for each topic
print("Top Words in Each Topic:")
print(topics_df)


Top Words in Each Topic:
    Topic #1      Topic #2
0      pizza    technology
1   policies      industry
2  inflation  transforming
3        new      learning
4  announced            ai


In [6]:
# BoW
from sklearn.feature_extraction.text import CountVectorizer

# Define a sample set of documents
documents = [
    "The cat sat on the mat.",
    "The dog barked loudly.",
    "The cat chased the mouse.",
]

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the model and transform the documents into a BoW matrix
bow_matrix = vectorizer.fit_transform(documents)

# Convert the result into a dense matrix format and print it
print("BoW Matrix (Document-Term Matrix):\n", bow_matrix.toarray())

# Get the feature names (vocabulary)
print("Feature Names (Vocabulary):\n", vectorizer.get_feature_names_out())

BoW Matrix (Document-Term Matrix):
 [[0 1 0 0 0 1 0 1 1 2]
 [1 0 0 1 1 0 0 0 0 1]
 [0 1 1 0 0 0 1 0 0 2]]
Feature Names (Vocabulary):
 ['barked' 'cat' 'chased' 'dog' 'loudly' 'mat' 'mouse' 'on' 'sat' 'the']


In [10]:
# Read in a large file

import PyPDF2

# Define the PDF file path
file_path = "nakamoto_2008_bitcoin.pdf"

# Open the PDF file and extract text
with open(file_path, "rb") as file:
    reader = PyPDF2.PdfReader(file)
    
    # Extract text from all pages
    large_document = "\n".join([page.extract_text() or "" for page in reader.pages])

# Print the first 500 characters as a preview
print(large_document[:500])  





Bitcoin: A Peer-to-Peer Electronic Cash System
Satoshi Nakamoto
satoshin@gmx.com
www.bitcoin.org
Abstract.  A purely peer-to-peer version of electronic cash would allow online  
payments to be sent directly from one party to another without going through a  
financial institution.  Digital signatures provide part of the solution, but the main  
benefits are lost if a trusted third party is still required to prevent double-spending.  
We propose a solution to the double-spending problem using a p


In [13]:
# Embeding

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Sample documents
documents = ["The cat ran.", "The dog barked."]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Tokenize sentences for Word2Vec
tokenized_sentences = [word_tokenize(doc.lower()) for doc in documents]

# Train Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=50, window=3, min_count=1)

# Display vocabulary
print("Vocabulary:", list(model.wv.index_to_key))

# Find words similar to "dog"
print("Most similar to 'dog':", model.wv.most_similar("dog"))


Vocabulary: ['.', 'the', 'barked', 'dog', 'ran', 'cat']
Most similar to 'dog': [('.', -0.014475265517830849), ('barked', -0.15515565872192383), ('the', -0.17424817383289337), ('ran', -0.20600518584251404), ('cat', -0.2091003954410553)]


In [25]:
# Textual analysis using GPT

import openai

# Set up your OpenAI API key
openai.api_key = ""  # Replace with your actual API key


# Define messages for chat-based models
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": """
    Given the sentences below, analyze the relationships between words and describe the similarities:
    1. "The quick brown fox jumps over the lazy dog."
    2. "The dog chased the cat."
    3. "The cat climbed the tree."
    4. "The dog barked loudly at the intruder."
    How are the words 'dog' and 'cat' related in these contexts?
    """}
]

# Make the API call to OpenAI
client = openai.OpenAI()  # New API client format
response = client.chat.completions.create(
    model="gpt-4-turbo",  # Use the latest available model
    messages=messages
)

# Print the response
print(response.choices[0].message.content)


In the sentences provided, the words 'dog' and 'cat' play central roles, and their relationships with other words in the context indicate a number of similarities and differences concerning their functions and attributes:

1. **Sentence Analysis**:
   - **Sentence 1**: "The quick brown fox jumps over the lazy dog."
     - 'Dog' is described as 'lazy' and is part of a passive scene where the action (jumping) is performed by another animal (fox).
   - **Sentence 2**: "The dog chased the cat."
     - Here, 'dog' is active, performing the action of chasing. 'Cat' is the object of the chase, implying a dynamic interaction where the dog is the aggressor and the cat potentially the victim or a participant in a playful or antagonistic scenario.
   - **Sentence 3**: "The cat climbed the tree."
     - 'Cat' is the subject performing the action (climbing), indicating agility and possibly a reaction (maybe even to being chased as could be inferred from the previous sentence).
   - **Sentence 4**: 