# Code examples for basic NLP/LLM terminology

## The full article, with an explanation of every term mentioned here, can be found at:

## Tokenization

In [12]:
import nltk
from nltk.tokenize import word_tokenize

# Input sentence for the demo
text = "Hello, this is an example of tokenization."

# Fetch the Punkt data before tokenizing
nltk.download('punkt')

# Split the sentence into word and punctuation tokens, then show the list
tokens = word_tokenize(text)
print(tokens)

['Hello', ',', 'this', 'is', 'an', 'example', 'of', 'tokenization', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Stop words removal

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the stop-word corpus and the tokenizer data, in that order
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Sample text
text = "This is a sample sentence demonstrating the removal of stopwords."

# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Tokenize, then keep only the tokens whose lowercase form is not a stop word
tokens = word_tokenize(text)
filtered_sentence = [
    word
    for word in tokens
    if word.lower() not in stop_words
]

print("Original Sentence:", text)
print("Filtered Sentence:", ' '.join(filtered_sentence))

Original Sentence: This is a sample sentence demonstrating the removal of stopwords.
Filtered Sentence: sample sentence demonstrating removal stopwords .


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## POS Tagging

In [14]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Fetch the tokenizer data and the tagger model, in that order
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Sample text
text = "This is a simple example of POS tagging using NLTK in Python."

# Tokenize first; pos_tag then pairs every token with its part-of-speech tag
tokens = word_tokenize(text)
tags = pos_tag(tokens)

print(tags)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('simple', 'JJ'), ('example', 'NN'), ('of', 'IN'), ('POS', 'NNP'), ('tagging', 'VBG'), ('using', 'VBG'), ('NLTK', 'NNP'), ('in', 'IN'), ('Python', 'NNP'), ('.', '.')]


## N-Grams

In [15]:
def generate_ngrams(text, n):
    """Return the word n-grams of ``text`` as space-joined strings.

    Words come from splitting ``text`` on whitespace, so punctuation stays
    attached to the word it touches. When ``text`` holds fewer than ``n``
    words, or ``n`` is zero or negative, the result is an empty list.
    """
    words = text.split()

    # n copies of the word list, the k-th one shifted left by k positions;
    # zipping them lines up each word with the n - 1 words that follow it.
    shifted_views = [words[offset:] for offset in range(n)]

    joined = []
    for group in zip(*shifted_views):
        joined.append(" ".join(group))
    return joined

# Demo: build the n-grams of a sample sentence
n = 2  # 2 gives bigrams; use 3 for trigrams, and so on
text = "This is a simple example to demonstrate n-grams."

# `bigrams` holds n-grams of whatever size `n` is set to above
bigrams = generate_ngrams(text, n)
print(bigrams)

['This is', 'is a', 'a simple', 'simple example', 'example to', 'to demonstrate', 'demonstrate n-grams.']


## Lemmatization

In [16]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the WordNet data and the tokenizer data, in that order
for resource in ('wordnet', 'punkt'):
    nltk.download(resource)

# Sample text
text = "The cats are running faster than the dogs."

# Tokenize, then lemmatize token by token. No part of speech is passed to
# lemmatize(), and verb forms such as "running" come out unchanged here.
lemmatizer = WordNetLemmatizer()
tokens = word_tokenize(text)
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))

print("Original Sentence:", text)
print("Lemmatized Words:", ' '.join(lemmatized_words))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Original Sentence: The cats are running faster than the dogs.
Lemmatized Words: The cat are running faster than the dog .


## Stemming

In [17]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Tokenizer models (fetched only if not already installed)
nltk.download('punkt')

# Sample text
text = "The cats are running faster than the dogs."

# Tokenize the sentence and reduce every token with the Porter stemmer
stemmer = PorterStemmer()
tokens = word_tokenize(text)
stemmed_words = list(map(stemmer.stem, tokens))

print("Original Sentence:", text)
print("Stemmed Words:", ' '.join(stemmed_words))


Original Sentence: The cats are running faster than the dogs.
Stemmed Words: the cat are run faster than the dog .


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## One-hot Encoding

In [18]:
import pandas as pd

# Sample data: a list of categories
data = ['dog', 'cat', 'fish']

# Wrap the categories in a single-column DataFrame
df = pd.DataFrame(data, columns=['Animal'])

# One-hot encode the column: one indicator column per distinct category.
# dtype=int pins the indicators to 0/1; pandas >= 2.0 otherwise returns
# boolean columns, which print as True/False instead of the 0/1 table.
one_hot_encoded_data = pd.get_dummies(df, columns=['Animal'], dtype=int)

print(one_hot_encoded_data)

   Animal_cat  Animal_dog  Animal_fish
0           0           1            0
1           1           0            0
2           0           0            1


## TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "the sky is blue",
    "the sun is bright",
    "the sun in the sky is bright",
    "we can see the shining sun, the bright sun"
]

# Fit the vectorizer on the documents and vectorize them in a single step
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert to a plain array and round to two decimals for display
# (one row per document)
dense_tfidf = tfidf_matrix.toarray()
print(dense_tfidf.round(2))

[[0.66 0.   0.   0.   0.42 0.   0.   0.52 0.   0.34 0.  ]
 [0.   0.52 0.   0.   0.52 0.   0.   0.   0.52 0.43 0.  ]
 [0.   0.32 0.   0.5  0.32 0.   0.   0.4  0.32 0.53 0.  ]
 [0.   0.24 0.37 0.   0.   0.37 0.37 0.   0.48 0.39 0.37]]


## Embeddings (gensim)

In [22]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Sample sentences
sentences = [
    "Python is a popular programming language",
    "Python scripts are easy to write",
    "Machine learning involves algorithms and data"
]

# Lowercase and tokenize every sentence
nltk.download('punkt')
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Training the Word2Vec model.
# workers=1 together with a fixed seed keeps the run repeatable: with several
# worker threads the training order depends on OS thread scheduling.
# seed=1 is gensim's default, spelled out here so the choice is visible.
# NOTE(review): gensim also asks for a fixed PYTHONHASHSEED to get identical
# vectors across interpreter launches; confirm if exact values matter.
model = Word2Vec(
    tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every word, even those seen only once
    workers=1,
    seed=1,
)

# Getting the vector for a word (lowercase, to match the tokenized input)
word_vector = model.wv['python']

print(word_vector)

[-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-03
 -1.5080082e-03  2.46979

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Using an LLM (OpenAI API)

In [None]:
import os

import openai

# Read the API key from the environment rather than hardcoding it in the
# notebook; if OPENAI_API_KEY is not set this line fails with a KeyError.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Text to translate, and the prompt template it is substituted into.
# The '{}' placeholder must be filled in before the prompt is sent.
text_to_translate = "Hello, how are you?"
prompt_template = "Translate the following English text to French: '{}'"

# Send a prompt to the API
# NOTE(review): openai.Completion and text-davinci-003 belong to the legacy
# (pre-1.0) SDK and completions endpoint; confirm the installed SDK version
# and that the model is still served before running this cell.
response = openai.Completion.create(
    engine="text-davinci-003",  # Specify the model
    prompt=prompt_template.format(text_to_translate),
    max_tokens=60
)

# Print the text of the first returned choice, stripped of outer whitespace
print(response.choices[0].text.strip())