In [4]:
import spacy
import pandas as pd
from spacy.lang.pt.stop_words import STOP_WORDS
import string

# Load the Portuguese spaCy pipeline, downloading it only when it is missing.
# A `!python -m spacy download ...` shell escape only works inside IPython,
# may target a different interpreter than the one running this code, and
# re-downloads the model on every run. spacy.load raises OSError when the
# model package is not installed, so that is the trigger for the download.
model_name = 'pt_core_news_sm'
try:
    nlp = spacy.load(model_name)
except OSError:
    import spacy.cli

    spacy.cli.download(model_name)
    # NOTE(review): a freshly installed package may only become importable
    # after a kernel/process restart in some environments — confirm.
    nlp = spacy.load(model_name)

# Read the remote CSV (semicolon-separated, decoded as Latin-1)
file_path = 'https://dados-ml-pln.s3.sa-east-1.amazonaws.com/tickets_reclamacoes_classificados.csv'
data = pd.read_csv(file_path, sep=';', encoding='latin1')

# Work on an independent copy that holds only the two columns used below
columns_to_keep = ['descricao_reclamacao', 'categoria']
data_filtered = data.loc[:, columns_to_keep].copy()

# Function to clean and lemmatize text
def clean_and_lemmatize(text):
    """Return the lower-cased, stop-word-free lemmas of *text* joined by spaces.

    Args:
        text: Raw text to process. Non-string values (for example the float
            NaN that pandas uses for empty CSV cells) and blank strings
            yield an empty string instead of being sent to the pipeline.

    Returns:
        A string with one lemma per kept token, separated by single spaces.
        A token is kept when its lower-cased lemma is purely alphabetic and
        is not in ``STOP_WORDS``.
    """
    # Guard first: the spaCy pipeline only accepts text input.
    if not isinstance(text, str) or not text.strip():
        return ""

    lemmas = (token.lemma_.lower() for token in nlp(text))
    # Filter on the lower-cased lemma; checking the raw lemma would let
    # capitalised forms (e.g. at sentence start) slip past the stop-word
    # set, whose entries are expected to be lower case.
    return " ".join(
        lemma for lemma in lemmas
        if lemma.isalpha() and lemma not in STOP_WORDS
    )

# Clean and lemmatize every description into a new column. data_filtered was
# created with .copy(), so plain column assignment is safe here.
raw_descriptions = data_filtered['descricao_reclamacao']
data_filtered['descricao_reclamacao_processed'] = raw_descriptions.apply(clean_and_lemmatize)

# Final dataset: the processed text together with its category
final_dataset = data_filtered.loc[:, ['descricao_reclamacao_processed', 'categoria']].copy()

# Swap any commas left in the processed text for semicolons before export
processed_text = final_dataset['descricao_reclamacao_processed']
final_dataset['descricao_reclamacao_processed'] = processed_text.str.replace(',', ';')

# Write the result as a comma-separated, UTF-8 CSV without the index column
output_file_path = 'processed_file.csv'  # Update this path as needed
final_dataset.to_csv(
    output_file_path,
    sep=',',
    encoding='utf-8',
    index=False,
)

print("Dataset processed and saved successfully.")


Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
     ---------------------------------------- 13.0/13.0 MB 3.8 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('pt_core_news_sm')
Dataset processed and saved successfully.
