In [68]:
!nvidia-smi

Mon Oct  7 10:44:41 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   77C    P0             34W /   70W |    3693MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [69]:
!python --version

Python 3.10.14


In [70]:
import cupy

print(cupy.__version__)

cuda_version = cupy.cuda.runtime.runtimeGetVersion()
print("CUDA Runtime Version:", cuda_version)

13.3.0
CUDA Runtime Version: 12060


In [71]:
!pip install spacy[lookups]



In [72]:
!python -m spacy download pt_core_news_lg

Collecting pt-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_lg-3.7.0/pt_core_news_lg-3.7.0-py3-none-any.whl (568.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.2/568.2 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_lg')


In [73]:
import spacy
from spacy import displacy

In [74]:
# Check if spaCy can run on GPU
spacy.require_gpu()

True

In [75]:
nlp = spacy.load("pt_core_news_lg")

In [76]:
text = "Steve Wozniak, CEO da Apple, lançou no novo iPhone em Cupertino, Califórnia."

In [77]:
doc = nlp(text)

In [78]:
print("POS Tagging:")
for token in doc:
    print(f"{token.text} - {token.pos_} ({token.tag_})")

POS Tagging:
Steve - PROPN (PROPN)
Wozniak - PROPN (PROPN)
, - PUNCT (PUNCT)
CEO - PROPN (PROPN)
da - ADP (ADP)
Apple - PROPN (PROPN)
, - PUNCT (PUNCT)
lançou - VERB (VERB)
no - ADP (ADP)
novo - ADJ (ADJ)
iPhone - NOUN (NOUN)
em - ADP (ADP)
Cupertino - PROPN (PROPN)
, - PUNCT (PUNCT)
Califórnia - PROPN (PROPN)
. - PUNCT (PUNCT)


In [79]:
# NER
print("\nNamed Entities (NER):")
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_}")


Named Entities (NER):
Steve Wozniak - PER
Apple - ORG
iPhone - MISC
Cupertino - LOC
Califórnia - LOC


In [80]:
# Visualizar a árvore de dependência
displacy.render(doc, style="dep", jupyter=True)

# Combinando Named Entity Recognition com Dependências sintáticas do texto

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from collections import Counter
import numpy as np

In [82]:
texts = [
    "O presidente discursou na ONU.",
    "A Apple lançou o novo iPhone no Brasil.",
    "O Brasil venceu a Copa América em 2021.",
    "A Google está investindo em inteligência artificial.",
    "O governo do Brasil anunciou novas medidas econômicas."
]

In [83]:
# Extração e contagem de dependências em uma frase
def extract_dependency_counts(doc):
    dep_counts = Counter([token.dep_ for token in doc])
    return dep_counts

In [84]:
# Extração e contagem de NERs em uma frase
def extract_ner_counts(doc):
    ner_counts = Counter([ent.label_ for ent in doc.ents])
    return ner_counts

In [85]:
# Analise dos textos, extração e contagem de NERs e dependências
dep_features = []
ner_features = []
for text in texts:
    doc = nlp(text)
    dep_counts = extract_dependency_counts(doc)
    ner_counts = extract_ner_counts(doc)
    dep_features.append(dep_counts)
    ner_features.append(ner_counts)

In [86]:
# Criação de um conjunto único de relações de dependência e labels NER para vetorização
all_dependencies = set(dep for dep_count in dep_features for dep in dep_count.keys())
all_ner_labels = set(ner for ner_count in ner_features for ner in ner_count.keys())

In [87]:
# converte a contagem de dependências em uma matriz
dep_features_matrix = []
for dep_count in dep_features:
    # Create a vector based on the presence/frequency of each dependency relation
    vector = [dep_count.get(dep, 0) for dep in all_dependencies]
    dep_features_matrix.append(vector)

In [88]:
# Converte a contagem de NERs em uma matriz
ner_features_matrix = []
for ner_count in ner_features:
    # Create a vector based on the presence/frequency of each NER type
    vector = [ner_count.get(ner, 0) for ner in all_ner_labels]
    ner_features_matrix.append(vector)

In [89]:
# Converte ambas as martizes em array do numpy
dep_features_matrix = np.array(dep_features_matrix)
ner_features_matrix = np.array(ner_features_matrix)

In [90]:
# Padronização dos atributos de dependências e NER
scaler = StandardScaler()
dep_features_scaled = scaler.fit_transform(dep_features_matrix)
ner_features_scaled = scaler.fit_transform(ner_features_matrix)

In [91]:
# Alvo de 3 clusters
num_clusters = 3  # Set the number of clusters

# Cluster usando só estruturas de dependências sintáticas

In [92]:
kmeans_dependencies = KMeans(n_clusters=num_clusters, random_state=42)
kmeans_dependencies.fit(dep_features_scaled)

# Cria labels para os clusters
clusters_dependencies = kmeans_dependencies.labels_



In [93]:
# Resultados
for i, text in enumerate(texts):
    print(f"Text {i+1}: {text}")
    print(f"Assigned Cluster: {clusters_dependencies[i]}\n")

Text 1: O presidente discursou na ONU.
Assigned Cluster: 1

Text 2: A Apple lançou o novo iPhone no Brasil.
Assigned Cluster: 2

Text 3: O Brasil venceu a Copa América em 2021.
Assigned Cluster: 2

Text 4: A Google está investindo em inteligência artificial.
Assigned Cluster: 1

Text 5: O governo do Brasil anunciou novas medidas econômicas.
Assigned Cluster: 0



# Clusters usando só NER

In [94]:
kmeans_ner = KMeans(n_clusters=num_clusters, random_state=42)
kmeans_ner.fit(dep_features_scaled)

# Cria labels para os clusters
clusters_ner = kmeans_ner.labels_

# Resultados
for i, text in enumerate(texts):
    print(f"Text {i+1}: {text}")
    print(f"Assigned Cluster: {clusters_ner[i]}\n")

Text 1: O presidente discursou na ONU.
Assigned Cluster: 1

Text 2: A Apple lançou o novo iPhone no Brasil.
Assigned Cluster: 2

Text 3: O Brasil venceu a Copa América em 2021.
Assigned Cluster: 2

Text 4: A Google está investindo em inteligência artificial.
Assigned Cluster: 1

Text 5: O governo do Brasil anunciou novas medidas econômicas.
Assigned Cluster: 0





# Cluster usando NER e estruturas sintáticas

In [95]:
# Combina os atributos advindos das dependências e das NERs
combined_features = np.hstack((dep_features_scaled, ner_features_scaled))

In [96]:
# Aplica K-means
num_clusters = 3  # Set the number of clusters
kmeans_combined = KMeans(n_clusters=num_clusters, random_state=42)
kmeans_combined.fit(combined_features)

# Cria labels para os clusters
clusters_combined = kmeans_combined.labels_



In [97]:
# Resultados
for i, text in enumerate(texts):
    print(f"Text {i+1}: {text}")
    print(f"Assigned Cluster: {clusters_combined[i]}\n")

Text 1: O presidente discursou na ONU.
Assigned Cluster: 1

Text 2: A Apple lançou o novo iPhone no Brasil.
Assigned Cluster: 2

Text 3: O Brasil venceu a Copa América em 2021.
Assigned Cluster: 2

Text 4: A Google está investindo em inteligência artificial.
Assigned Cluster: 1

Text 5: O governo do Brasil anunciou novas medidas econômicas.
Assigned Cluster: 0



# Enriquecendo a base de NERs

In [98]:
from spacy.training import offsets_to_biluo_tags

In [99]:
# Adiciona o NER ao pipeline do spaCy (caso ainda não esteja)
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

In [100]:
TRAINING_DATA = [
    ("A Apple lançou o novo iPhone.", {"entities": [(22, 28, "PRODUCT")]}),
    ("O novo iPhone 12 é o mais recente modelo.", {"entities": [(8, 17, "PRODUCT")]}),
    ("A Samsung apresentou o Galaxy S21.", {"entities": [(23, 33, "PRODUCT")]}),
    ("A Microsoft anunciou o novo Surface Laptop.", {"entities": [(28, 42, "PRODUCT")]}),
    ("A Google lançou o Pixel 5.", {"entities": [(19, 26, "PRODUCT")]}),
    ("O iPad Pro foi apresentado em 2021.", {"entities": [(2, 10, "PRODUCT")]}),
    ("O novo PlayStation 5 será lançado em breve.", {"entities": [(7, 19, "PRODUCT")]})
]

In [101]:
import random
from spacy.util import minibatch, compounding
from spacy.training import Example

In [102]:
# Add new labels to the NER
for _, annotations in TRAINING_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [103]:
# Disable other pipes during training (we only want to train NER)
with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != "ner"]):
    # Initialize the optimizer
    optimizer = nlp.begin_training()
    
    # Training the model for more epochs
    for iteration in range(100):  # Train for more iterations
        random.shuffle(TRAINING_DATA)
        losses = {}

        # Create batches of training data using minibatch
        batches = minibatch(TRAINING_DATA, size=compounding(4.0, 32.0, 1.001))

        for batch in batches:
            examples = []
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                try:
                    example = Example.from_dict(doc, annotations)
                except ValueError:
                    print(f"Skipping misaligned entities in: '{text}'")
                    continue
                examples.append(example)

            # Update the model
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        print(f"Iteration {iteration} Losses: {losses}")

        # Early stopping condition (stop if losses are minimal or don't improve)
        if losses.get("ner", 0) < 0.01:
            print(f"Early stopping at iteration {iteration}")
            break

# Save the trained model
nlp.to_disk("custom_ner_model")

[2024-10-07 10:48:42,328] [INFO] Added vocab lookups: lexeme_norm
[2024-10-07 10:48:42,329] [INFO] Created vocabulary
[2024-10-07 10:48:42,331] [INFO] Finished initializing nlp object


Iteration 0 Losses: {'ner': 40.958024740219116}
Iteration 1 Losses: {'ner': 46.75210374593735}
Iteration 2 Losses: {'ner': 44.205777406692505}
Iteration 3 Losses: {'ner': 37.57341408729553}
Iteration 4 Losses: {'ner': 29.168720841407776}
Iteration 5 Losses: {'ner': 16.378386456519365}
Iteration 6 Losses: {'ner': 10.445551069453359}
Iteration 7 Losses: {'ner': 8.600672765722265}
Iteration 8 Losses: {'ner': 7.743284066498745}
Iteration 9 Losses: {'ner': 7.902292549722915}
Iteration 10 Losses: {'ner': 7.5678448945400305}
Iteration 11 Losses: {'ner': 6.927634832274634}
Iteration 12 Losses: {'ner': 6.0375048582063755}
Iteration 13 Losses: {'ner': 5.1340563867706805}
Iteration 14 Losses: {'ner': 5.062635310692713}
Iteration 15 Losses: {'ner': 7.6007994796091225}
Iteration 16 Losses: {'ner': 6.720187225160771}
Iteration 17 Losses: {'ner': 3.9950682268317905}
Iteration 18 Losses: {'ner': 6.506265244819133}
Iteration 19 Losses: {'ner': 7.968474141445768}
Iteration 20 Losses: {'ner': 6.351039614

In [104]:
# Load the trained model
nlp_custom = spacy.load("custom_ner_model")

# Test the model
test_text = "A Apple lançou o novo iPhone."
doc = nlp_custom(test_text)

for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")

Entity: iPhone, Label: PRODUCT


# K-Means usando GPU

In [66]:
!conda install -c rapidsai -c nvidia cuml=23.04 python=3.10 cudatoolkit=12.3

Retrieving notices: ...working... done
Channels:
 - rapidsai
 - nvidia
 - nodefaults
 - conda-forge
 - defaults
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: failed

LibMambaUnsatisfiableError: Encountered problems while solving:
  - nothing provides requested cudatoolkit 12.3**
  - package cuml-23.04.00-cuda11_py310_230412_g4a62e3cf6_0 requires cudatoolkit >=11,<12.0a0, but none of the providers can be installed

Could not solve for environment specs
The following packages are incompatible
├─ [32mcuda-version 12.3** [0m is installable and it requires
│  └─ [32mcudatoolkit 12.3|12.3.* [0m, which can be installed;
├─ [31mcudatoolkit 12.3** [0m does not exist (perhaps a typo or a missing channel);
└─ [31mcuml 23.04** [0m is not installable because it requires
   └─ [31mcudatoolkit >=11,<12.0a0 [0m, which conflicts with any installable versions previously reported.



In [67]:
import numpy as np
from cuml.cluster import KMeans as cuKMeans
from sklearn.preprocessing import StandardScaler

num_clusters = 3  # Set the number of clusters
kmeans_GPU = cuKMeans(n_clusters=num_clusters, random_state=42)
kmeans_GPU.fit(combined_features)

# Get cluster labels
clusters_GPU = kmeans_GPU.labels_

# Display results
for i, text in enumerate(texts):
    print(f"Text {i+1}: {text}")
    print(f"Assigned Cluster: {clusters_GPU[i]}\n")


Text 1: O presidente discursou na ONU.
Assigned Cluster: 0

Text 2: A Apple lançou o novo iPhone no Brasil.
Assigned Cluster: 1

Text 3: O Brasil venceu a Copa América em 2021.
Assigned Cluster: 1

Text 4: A Google está investindo em inteligência artificial.
Assigned Cluster: 0

Text 5: O governo do Brasil anunciou novas medidas econômicas.
Assigned Cluster: 2

