<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# NLP Basics

**Word Embeddings**

&copy; Dr. Yves J. Hilpisch

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## Imports

In [None]:
!git clone https://github.com/tpq-classes/natural_language_processing.git
import sys
sys.path.append('natural_language_processing')


In [None]:
import random
import numpy as np
import pandas as pd

## Document Vector

In [None]:
from gensim.models import Word2Vec

In [None]:
def get_doc_vector(doc, wv=None):
    """Return the mean word vector of a document.

    The document is split on whitespace; every word is lower-cased and
    has leading/trailing punctuation removed before its vector is looked
    up. The document vector is the element-wise mean of these vectors.

    Parameters
    ----------
    doc : str
        Raw document text.
    wv : mapping from token to vector, optional
        Source of the word vectors. When omitted, ``model.wv`` of the
        notebook-level ``model`` is used (resolved at call time).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens in ``doc``.

    Notes
    -----
    Tokens missing from ``wv`` are not handled here; the look-up error
    raised by ``wv[token]`` propagates to the caller.
    """
    if wv is None:
        # Look the global model up lazily so a re-trained ``model`` is used.
        wv = model.wv
    # Strip the same punctuation set that is used when the training
    # sentences are tokenized, so look-up tokens match the vocabulary keys.
    doc_tokens = [w.lower().strip('(),;.') for w in doc.split()]
    return np.mean([wv[t] for t in doc_tokens], axis=0)

## Text Classification (1)

In [None]:
python_snippets = [
    "Python is a versatile language for web development, data analysis, and automation.",
    "Use Python's libraries like NumPy and Pandas for efficient data manipulation.",
    "Python supports multiple programming paradigms, including procedural, object-oriented, and functional programming.",
    "The Python community offers extensive documentation and a wealth of online resources.",
    "Python's syntax is designed to be readable and straightforward, making it beginner-friendly.",
    "Django and Flask are popular frameworks for developing web applications in Python.",
    "Automate repetitive tasks with Python scripts and save time in your workflow."
]

In [None]:
nlp_snippets = [
    "Natural Language Processing (NLP) enables computers to understand and process human language.",
    "NLP is used in applications like sentiment analysis, chatbots, and machine translation.",
    "Tokenization is a fundamental step in NLP, breaking text into meaningful units.",
    "Named Entity Recognition (NER) identifies proper nouns in text, such as names and locations.",
    "Vectorization converts text data into numerical form for machine learning models.",
    "Popular NLP libraries include NLTK, SpaCy, and Hugging Face Transformers.",
    "NLP combines computational linguistics and machine learning for language understanding."
]

In [None]:
llm_snippets = [
    "Large Language Models (LLMs) are advanced neural networks trained on vast text corpora.",
    "LLMs like GPT-3 generate human-like text based on input prompts.",
    "Applications of LLMs include content creation, code generation, and conversational agents.",
    "LLMs utilize transformers, a deep learning architecture, for efficient processing.",
    "Training LLMs requires substantial computational resources and large datasets.",
    "Fine-tuning LLMs on specific tasks enhances their performance and accuracy.",
    "Ethical considerations in LLMs include bias, misinformation, and data privacy."
]

In [None]:
# Concatenate the three snippet groups in a fixed order
# (Python, NLP, LLM) to form the training documents.
X = [*python_snippets, *nlp_snippets, *llm_snippets]

In [None]:
len(X)

In [None]:
# Class labels aligned with X: seven 0s, then seven 1s, then seven 2s.
y = np.repeat([0, 1, 2], 7)
y

In [None]:
# Tokenize every snippet: split on whitespace, lower-case each word and
# trim surrounding punctuation. One token list per snippet.
sentences = [
    [word.lower().strip('(),;.') for word in snippet.split()]
    for snippet in X
]
sentences[:1]

In [None]:
# Train a small Word2Vec model on the tokenized snippets.
# min_count=1 keeps every token, so no word of this tiny corpus is dropped.
model = Word2Vec(sentences, min_count=1,
                 vector_size=7,   # dimensionality of each word vector
                 sg=0, window=3)  # sg=0 selects CBOW; window is the context size

In [None]:
# model.wv.key_to_index.keys()

In [None]:
model.wv['proper'].round(3)

In [None]:
X_ = [get_doc_vector(d) for d in X]

In [None]:
X_[:2]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [None]:
classifier = LogisticRegression(C=1)
# classifier = MLPClassifier(
#    hidden_layer_sizes=[100],
#    max_iter=3000)

In [None]:
classifier.fit(X_, y)

In [None]:
p_train = classifier.predict(X_)
p_train

In [None]:
accuracy_score(y, p_train)

In [None]:
# test snippets
test_snippets = [
    "Python's extensive standard library supports many common programming tasks.",
    "Jupyter notebooks are widely used for interactive Python development and data visualization.",
    "Python's dynamic typing and garbage collection simplify memory management.",
    "Sentiment analysis in NLP determines the emotional tone of text.",
    "Text classification categorizes text into predefined labels using NLP techniques.",
    "Word embeddings represent words as dense vectors for better machine learning performance.",
    "Transfer learning is often used in LLMs to adapt pre-trained models to new tasks.",
    "LLMs can summarize long documents, extracting key information efficiently.",
    "Prompt engineering tailors inputs to guide LLM outputs more effectively."
]

# Labels for the test snippets
new_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [None]:
def get_doc_vector_imp(doc, vs, low=0, high=20, wv=None):
    """Return the mean word vector of a document, tolerating unknown words.

    The document is split on whitespace. Only words whose raw length lies
    strictly between ``low`` and ``high`` are kept; these are lower-cased
    and have leading/trailing punctuation removed. Each resulting token
    contributes its word vector, or a zero vector of length ``vs`` when
    the token is not found. The document vector is the element-wise mean.

    Parameters
    ----------
    doc : str
        Raw document text.
    vs : int
        Length of the zero vector used for unknown tokens and for
        documents without any usable token.
    low, high : int, optional
        Exclusive bounds on the raw (unstripped) word length.
    wv : mapping from token to vector, optional
        Source of the word vectors. When omitted, ``model.wv`` of the
        notebook-level ``model`` is used (resolved at call time).

    Returns
    -------
    numpy.ndarray
        Mean vector of the document; all zeros of length ``vs`` if no
        word passes the length filter.
    """
    if wv is None:
        # Look the global model up lazily so a re-trained ``model`` is used.
        wv = model.wv
    # The length filter deliberately looks at the raw word, i.e. before
    # lower-casing and punctuation stripping.
    doc_tokens = [w.lower().strip('(),;.:!?|') for w in doc.split()
                  if low < len(w) < high]
    if not doc_tokens:
        # np.mean over an empty list would yield a scalar NaN, which
        # cannot be stacked with the other document vectors.
        return np.zeros(vs)
    dv = list()
    for t in doc_tokens:
        try:
            dv.append(wv[t])
        except KeyError:
            # Out-of-vocabulary token: contribute a neutral zero vector.
            dv.append(np.zeros(vs))
    return np.mean(dv, axis=0)

In [None]:
# Take the vector length from the trained model instead of repeating the
# literal used at training time; the zero vectors substituted for unknown
# tokens must have exactly the same length as the learned word vectors.
X_test_ = [get_doc_vector_imp(d, vs=model.wv.vector_size)
           for d in test_snippets]

In [None]:
X_test_[:2]

In [None]:
p_test = classifier.predict(X_test_)
p_test

In [None]:
accuracy_score(new_labels, p_test)

## Text Classification (2)

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [None]:
data = fetch_20newsgroups(
    categories=['sci.med', 'sci.crypt', 'sci.space'])

In [None]:
# data['data'][0]

In [None]:
X, y = data.data, data.target

In [None]:
len(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=500)

In [None]:
# Tokenize the training documents for Word2Vec: keep only words whose raw
# length is between 5 and 9 characters, then lower-case them and trim
# surrounding punctuation. One token list per document.
docs = [
    [word.lower().strip('(),;.:!?|') for word in text.split()
     if 4 < len(word) < 10]
    for text in X_train
]
# docs[:1]

In [None]:
vs = 10

In [None]:
model = Word2Vec(docs, min_count=3,
                 vector_size=vs,
                 sg=1, window=5)

In [None]:
X_train_ = [get_doc_vector_imp(d, vs=vs, low=4, high=10) for d in X_train]

In [None]:
X_train_[:2]

In [None]:
# classifier = LogisticRegression()
classifier = MLPClassifier(
    hidden_layer_sizes=[100, 100],
    max_iter=3000)

In [None]:
classifier.fit(X_train_, y_train)

In [None]:
p_train = classifier.predict(X_train_)
p_train

In [None]:
accuracy_score(y_train, p_train)

In [None]:
X_test_ = [get_doc_vector_imp(d, vs=vs, low=4, high=10) for d in X_test]

In [None]:
p_test = classifier.predict(X_test_)
p_test[:10]

In [None]:
accuracy_score(y_test, p_test)

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>