# Examples for Paper

- Update so I can enter a single sentence and I'll get POS or NER. For now, can use https://parts-of-speech.info/

In [1]:
import os
import sys

import pandas as pd

from spacy import displacy
from IPython.display import Image

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

from data_processing import DataProcessing
from feature_extraction import TfidfFeatureExtraction, SpacyFeatureExtraction



In [2]:
pd.set_option('max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Example in Introduction

In [3]:
colors = {
    "Person": "#fc9ce7",
    "Organization": "#fad0c4",
    "Word": "#08e76d",
    "Exact Time": "#a6c1ee",
    "Forecast Time": "#fcb69f",
    "Attribute": "#c2e9fb",
    "Verb": "#f9e24f",
    "Metric": "#ffb214",
    "Slope": "#21f7f6",
    "Location": "#61040c"
}

In [4]:
def create_visualization(text, entities, title, colors):
    doc_data = {
        "text": text,
        "ents": entities,
        # "title": title
    }

    options = {"colors": colors}

    # Render the visualization
    html = displacy.render(doc_data, style="ent", manual=True, jupyter=True, options=options)

In [5]:
text = "obesity"


In [6]:
# # Your text and custom entities
# text = "According to David Harper from Weather Underground, on Fri, August 9, 2024, the air quality index in Los Angeles is likely to improve by 20% in the time frame of 21 Aug 2024."
# entities = [
#     {"start": 12, "end": 26, "label": "Person"},
#     {"start": 30, "end": 50, "label": "Organization"},
#     {"start": 55, "end": 74, "label": "Exact Time"},
#     {"start": 79, "end": 97, "label": "Attribute"},
#     {"start": 100, "end": 112, "label": "Location"},
#     {"start": 113, "end": 123, "label": "Verb"},
#     {"start": 126, "end": 134, "label": "Slope"},
#     {"start": 136, "end": 140, "label": "Metric"},
#     {"start": 161, "end": 173, "label": "Forecast Time"}
# ]
# create_visualization(text, entities, colors)

In [7]:
title = "Health Prediction Example"
text = "A trusted expert speculates on 23 October 2024 that the global vaccination rate for measles in the US should stay stable at 100K people in 2027 Quarter 4."
entities = [
    {"start": 0, "end": 16, "label": "Person"},
    {"start": 17, "end": 27, "label": "Word"},
    {"start": 31, "end": 46, "label": "Exact Time"},
    {"start": 55, "end": 92, "label": "Attribute"},
    {"start": 99, "end": 101, "label": "Organization"},
    {"start": 102, "end": 109, "label": "Verb"},
    {"start": 109, "end": 121, "label": "Slope"},
    {"start": 124, "end": 136, "label": "Metric"},
    {"start": 139, "end": 153, "label": "Forecast Time"}
]

create_visualization(text, entities, title, colors)

# [A trusted expert] [speculates] on [23 October 2024] that the [global vaccination rate for measles] in the [US] [should] [stay stable] at [100K people] in [2027 Quarter 4].

## Example in Comprehensive Text Corpus

In [8]:
colors = {
    "Prediction Source": "#fc9ce7",
    "Prediction Target": "#fad0c4",
    "Prediction Date": "#08e76d",
    "Prediction Outcome": "#a6c1ee",
}


In [18]:
def create_visualization(text, entities, title, colors):
    doc_data = {
        "text": text,
        "ents": entities,
        # "title": title
    }

    options = {"ents": list(colors.keys()), "colors": colors}

    # Render the visualization
    html = displacy.render(doc_data, style="ent", manual=True, jupyter=True, options=options)

In [19]:
# Do the below for POS

In [21]:
# title = "Financial Prediction Template 1"
text = "On [ $p_t$ ], [ $p_p$ ] [ $p_w$ ] that the [ $p_a$ ] at [ $p_o$ ] [ $p_v$ ] [ $p_s$ ] by [ $p_m$ ] in [ $p_f$ ]."
entities = [
    {"start": 3, "end": 12, "label": "p_t"},
    {"start": 14, "end": 23, "label": "p_p"},
    {"start": 24, "end": 33, "label": "p_w"},
    {"start": 43, "end": 53, "label": "p_a"},
    {"start": 55, "end": 65, "label": "p_o"},
    {"start": 66, "end": 76, "label": "p_v"},
    {"start": 76, "end": 86, "label": "p_s"},
    {"start": 89, "end": 99, "label": "p_m"},
    {"start": 102, "end": 111, "label": "p_f"}
]
create_visualization(text, entities, title, colors)
print()
print()
print()
# title = "Financial Prediction Example"

text = "On Sunday, 20 April 2025, Detravious, an investor forecasts that the stock price at Apple will likely increase by 5 percent in Q4 of 2025."
entities = [
    {"start": 3, "end": 24, "label": "Prediction Date"},
    {"start": 26, "end": 59, "label": "Prediction Source"},
    {"start": 69, "end": 80, "label": "Prediction Outcome"},
    {"start": 84, "end": 89, "label": "Prediction Target"},
    {"start": 102, "end": 124, "label": "Prediction Outcome"},
    {"start": 127, "end": 137, "label": "Prediction Date"}
]

create_visualization(text, entities, title, colors)
print()
print()
print()











In [12]:
ex_text = "On Monday, December 16, 2024, Detravious, an investor forecasts that the revenue at Apple will likely increase by 5 percent to $20 billion in 2025 Q1."
label = 1
ex_df = pd.DataFrame([ex_text], columns=["Base Sentence"])
ex_df["Sentence Label"] = label
ex_df

Unnamed: 0,Base Sentence,Sentence Label
0,"On Monday, December 16, 2024, Detravious, an investor forecasts that the revenue at Apple will likely increase by 5 percent to $20 billion in 2025 Q1.",1


In [13]:
max_features = None

tf_idf_feature_extractor = TfidfFeatureExtraction(ex_df, 'Base Sentence')
tfidf_vectorized_features = tf_idf_feature_extractor.word_feature_extraction(max_features)
tfidf_vectorized_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 25 stored elements and shape (1, 25)>

In [14]:
# tfidf_vectorized_features_df = tf_idf_feature_extractor.feature_scores(max_features)
# tfidf_vectorized_features_df

In [15]:
# initialize the spacy model
spacy_feature_extractor = SpacyFeatureExtraction(ex_df, 'Base Sentence')
spacy_feature_extractor

<feature_extraction.SpacyFeatureExtraction at 0x14b2ae6221d0>

In [16]:
only_predictions = DataProcessing.df_to_list(ex_df, 'Base Sentence')
only_predictions

['On Monday, December 16, 2024, Detravious, an investor forecasts that the\xa0revenue at Apple will likely increase by 5 percent to $20 billion\xa0in\xa02025 Q1.']

In [17]:
word_leveL_disable_components = ["lemmatizer"]
word_level_pos_tags, word_level_pos_mappings, word_level_ner_entities, word_level_ner_mappings = spacy_feature_extractor.extract_entities(only_predictions, word_leveL_disable_components, visualize=True)

AttributeError: 'SpacyFeatureExtraction' object has no attribute 'extract_entities'

In [None]:
all_word_level_pos_df = DataProcessing.convert_to_df(word_level_pos_tags, mapping=word_level_pos_mappings)
all_word_level_pos_df

In [None]:
all_word_level_ner_df = DataProcessing.convert_to_df(word_level_ner_entities, word_level_ner_mappings)
all_word_level_ner_df

In [None]:
# word_level_tags_entities = [all_word_level_pos_df, all_word_level_ner_df]
# word_level_tags_entities_df = DataProcessing.concat_dfs(word_level_tags_entities, axis=1, ignore_index=False)
# word_level_tags_entities_df

In [None]:
# encoded_word_level_tags_entities_df = DataProcessing.encode_tags_entities_df(word_level_tags_entities_df, sentence_and_label_df=ex_df)
# encoded_word_level_tags_entities_df

In [None]:
# import spacy
# from spacy import displacy


# nlp = spacy.load("en_core_web_sm")
# doc = nlp(ex_text)
# displacy.serve(doc, style="ent", auto_select_port=True)

## Non-paper example

In [None]:
import spacy
from spacy import displacy

# Load a blank model
nlp = spacy.blank("en")

# Your text and custom entities
text = "My name is John Smith and I live in Paris"
entities = [
    {"start": 11, "end": 21, "label": "Person"},
    {"start": 36, "end": 41, "label": "Location"}
]

# Create a dictionary for displaCy
doc_data = {
    "text": text,
    "ents": entities,
    "title": None
}

# Render the visualization
html = displacy.render(doc_data, style="ent", manual=True, jupyter=True)


In [None]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
ex_text = "On Monday, December 16, 2024, Detravious, an investor forecasts that the revenue at Apple should likely increase by 5 percent to $20 billion in 2025 Q1."

doc = nlp(ex_text)
sentence_spans = list(doc.sents)
# displacy.serve(sentence_spans, style="dep")
html = displacy.render(sentence_spans, style="span", jupyter=True)