<a href="https://colab.research.google.com/github/michaelwnau/ai_academy_notebooks/blob/main/ner_zetas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import pandas as pd
import string
import spacy
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from collections import defaultdict
# Fetch every NLTK resource this notebook downloads up front, one call per
# resource, so later cells can assume they are present.
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'stopwords',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# --- Token filters used during preprocessing ---
# Single ASCII punctuation characters.
PUNCTUATION = set(string.punctuation)
# English stop words from the NLTK stopwords corpus.
STOP_WORDS = set(stopwords.words('english'))

# Regular expressions whose matches are deleted from the raw text.
# The one pattern here matches a quote character followed by word characters
# at the very end of the string.
UNDESIRED_PATTERNS = [r"['\"]\w+$"]

# Tally of entity text -> number of occurrences (missing keys start at 0).
entity_counts = defaultdict(int)

In [None]:
# Vocabulary used by _number_to_words to spell out integers.
_ONES = (
    "zero one two three four five six seven eight nine ten eleven twelve "
    "thirteen fourteen fifteen sixteen seventeen eighteen nineteen"
).split()
# Indexed by the tens digit; positions 0 and 1 are never looked up.
_TENS = "- - twenty thirty forty fifty sixty seventy eighty ninety".split()
# Name of each successive group of three digits, least significant first.
_SCALES = ("", "thousand", "million", "billion", "trillion")


def _number_to_words(number):
    """Spell out a non-negative integer in lowercase English words.

    Words are separated by single spaces and contain no hyphens or "and"
    (e.g. 42 -> "forty two", 1200 -> "one thousand two hundred"). Values too
    large for the named scales are read out digit by digit instead.
    """
    if number < 20:
        return _ONES[number]
    if number < 100:
        tens, ones = divmod(number, 10)
        return _TENS[tens] if ones == 0 else f"{_TENS[tens]} {_ONES[ones]}"
    if number < 1000:
        hundreds, rest = divmod(number, 100)
        head = f"{_ONES[hundreds]} hundred"
        return head if rest == 0 else f"{head} {_number_to_words(rest)}"
    if number >= 1000 ** len(_SCALES):
        # No scale name available: fall back to reading the digits.
        return " ".join(_ONES[int(digit)] for digit in str(number))

    parts = []
    scale = 0
    while number:
        number, group = divmod(number, 1000)
        if group:
            parts.append(f"{_number_to_words(group)} {_SCALES[scale]}".strip())
        scale += 1
    return " ".join(reversed(parts))


def preprocess_text(text):
    """Normalize raw text into a lowercase, space-separated string of tokens.

    Steps, in order:
      1. Delete every match of each regex in UNDESIRED_PATTERNS.
      2. Replace newlines and carriage returns with spaces.
      3. Lowercase the text.
      4. Tokenize with wordpunct_tokenize and drop punctuation tokens.
      5. Drop tokens found in STOP_WORDS.
      6. Spell out purely numeric tokens in English words.

    Args:
        text: The raw input string.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Remove undesired patterns
    for pattern in UNDESIRED_PATTERNS:
        text = re.sub(pattern, "", text)

    # Replace newline characters and carriage returns with a space
    text = text.replace("\n", " ").replace("\r", " ")

    # Lowercase so tokens compare equal to the lowercase stop-word list
    text = text.lower()

    # Tokenize and remove punctuation. wordpunct_tokenize keeps a run of
    # adjacent punctuation (e.g. "..." or '",') as ONE token, so a plain
    # membership test against single characters would let those through;
    # check every character of the token instead.
    words = wordpunct_tokenize(text)
    words = [
        word for word in words
        if not all(char in PUNCTUATION for char in word)
    ]

    # Remove stop words
    words = [word for word in words if word not in STOP_WORDS]

    # Convert numbers to words. isdecimal() (not isdigit()) is used because it
    # only accepts characters that int() can parse; isdigit() also accepts
    # characters such as superscript digits, which int() rejects.
    words = [
        _number_to_words(int(word)) if word.isdecimal() else word
        for word in words
    ]

    return " ".join(words)

In [None]:
# Input text for the pipeline. Replace the placeholder below with your own
# text, or load it here from your document store (e.g. Google Drive or a
# database).
text = """<YOUR TEXT HERE>"""

# Normalize the text with preprocess_text and show the result.
preprocessed_text = preprocess_text(text)
print(preprocessed_text)


today rapidly evolving technological landscape artificial intelligence ai tools leaving indelible mark various sectors including government reactions government agencies leaders towards emerging ai tools mixed ranging outright prohibition cautious exploration zcore group practical experience working federal customers extensive prototyping depth research leads us believe ai tools present valuable opportunity government agencies high time agencies recognize potential several compelling reasons consumer technology world already integrating ai products chris sales recently said enhancing existing software creating new experiences ground result public expectations high quality experiences services increasingly influenced advantages ai brings government agencies must intentional leveraging ai improve customer experience avoid widening gap public expectations deliver ai sets apart emerging technologies demonstrating potential bring value government technologies like blockchain received signif

In [None]:
from collections import defaultdict

# Start from an empty tally so re-running this cell does not double count.
entity_counts = defaultdict(int)

# Work on the original `text` (not the preprocessed string), one sentence at
# a time: tokenize -> POS-tag -> NE-chunk.
for sentence in nltk.sent_tokenize(text):
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    for node in nltk.ne_chunk(tagged_tokens):
        # Nodes without a `label` attribute are not counted as entities.
        if not hasattr(node, 'label'):
            continue
        # The entity's surface form is the first item of each child, joined
        # with spaces.
        entity_counts[' '.join(leaf[0] for leaf in node)] += 1

# Report each distinct entity with its occurrence count.
for name, occurrences in entity_counts.items():
    print(f"{name}({occurrences})")


zCore Group(1)
AI(13)
Chris(1)
Large Language Models(1)
LLMs(9)
LLM(3)
Semantic(1)
Plain(1)


In [None]:
# Turn the entity tally into a two-column table: the dict keys start out as
# the index, are moved into a regular column, and that column is named
# "Entity" next to the "Count" column.
df = (
    pd.DataFrame.from_dict(entity_counts, orient='index', columns=['Count'])
    .reset_index()
    .rename(columns={"index": "Entity"})
)

print(df)

                  Entity  Count
0            zCore Group      1
1                     AI     13
2                  Chris      1
3  Large Language Models      1
4                   LLMs      9
5                    LLM      3
6               Semantic      1
7                  Plain      1


In [None]:
# Prompt for the destination path (directory and file name) of the CSV.
file_location = input("Enter file location and name: ")

# Write the entity table to that path, leaving out the row index.
df.to_csv(path_or_buf=file_location, index=False)