In [18]:
import spacy
import pandas as pd

In [19]:
# Load the spaCy pipeline once at notebook level; remove_stopwords_and_lemmatize
# reads this global `nlp` object on every call.
nlp = spacy.load("en_core_web_lg")


In [20]:
# Category label -> integer class ID. The ID is the label's position in the
# tuple below (0-based), so reordering or inserting a label in the middle
# changes the IDs of every label after it.
class_mapping = {
    label: class_id
    for class_id, label in enumerate((
        'ACCOUNTANT',
        'ADVOCATE',
        'AGRICULTURE',
        'APPAREL',
        'ARTS',
        'AUTOMOBILE',
        'AVIATION',
        'BANKING',
        'BPO',
        'BUSINESS-DEVELOPMENT',
        'CHEF',
        'CONSTRUCTION',
        'CONSULTANT',
        'DESIGNER',
        'DIGITAL-MEDIA',
        'ENGINEERING',
        'FINANCE',
        'FITNESS',
        'HEALTHCARE',
        'HR',
        'INFORMATION-TECHNOLOGY',
        'PUBLIC-RELATIONS',
        'SALES',
        'TEACHER',
    ))
}

In [21]:
def remove_stopwords_and_lemmatize(text):
    """Lower-case ``text`` and return the lemmas of its content tokens.

    The text is lower-cased, run through the module-level spaCy pipeline
    ``nlp``, and every token that is a stop word, punctuation, or
    whitespace-only is discarded.

    Args:
        text: The string to clean.

    Returns:
        The lemmas of the remaining tokens joined by single spaces; an empty
        string when no token survives the filters.
    """
    doc = nlp(text.lower())
    lemmas = [
        token.lemma_
        for token in doc
        # spaCy emits runs of extra spaces / newlines as tokens of their own.
        # They are neither stop words nor punctuation, so without the is_space
        # check raw whitespace would leak into the joined output.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(lemmas)

In [22]:
def map_classes(input_csv, class_mapping):
    """Replace the labels in the CSV's ``class`` column with integer IDs.

    The file is read, its ``class`` column is translated through
    ``class_mapping``, the result is written back to the same path, and a
    per-category row count is printed.

    The function is safe to run more than once on the same file: values that
    are already one of the mapping's IDs are kept as they are instead of being
    turned into NaN. Missing values in ``class`` stay missing.

    Args:
        input_csv: Path of the CSV file to read and overwrite.
        class_mapping: Dict mapping category name -> integer class ID.

    Raises:
        ValueError: If ``class`` contains a non-missing value that is neither
            a known category name nor a known class ID. The file is left
            untouched in that case.
    """
    df = pd.read_csv(input_csv)
    labels = df['class']

    # Inverse lookup (ID -> name). setdefault keeps the first name for an ID,
    # matching a first-match scan over the mapping.
    id_to_name = {}
    for name, class_id in class_mapping.items():
        id_to_name.setdefault(class_id, name)

    mapped = labels.map(class_mapping)

    # On a re-run the column already holds IDs, which are not keys of the
    # name -> ID mapping; keep those values rather than overwriting with NaN.
    already_ids = mapped.isna() & labels.isin(list(id_to_name))
    if already_ids.any():
        mapped = mapped.where(~already_ids, labels)

    # Refuse to overwrite the file when a label cannot be translated: the
    # original label would otherwise be lost.
    unknown = mapped.isna() & labels.notna()
    if unknown.any():
        bad_labels = sorted(labels[unknown].astype(str).unique())
        raise ValueError(
            f"Unknown values in 'class' column of {input_csv}: {bad_labels}"
        )

    # Only cast when every row has an ID; a column with missing values has to
    # stay floating point to hold NaN.
    if mapped.notna().all():
        mapped = mapped.astype(int)
    df['class'] = mapped

    df.to_csv(input_csv, index=False)
    print(f'Arquivo CSV com classes mapeadas salvo em {input_csv}')

    category_counts = df['class'].value_counts().sort_index()
    for category_id, count in category_counts.items():
        print(f"{id_to_name[category_id]} = {count}")

In [23]:
def process_csv(input_csv):
    """Clean the ``text`` column of a CSV file and overwrite the file.

    Rows whose ``text`` is missing or contains only whitespace are dropped;
    the remaining texts are passed through ``remove_stopwords_and_lemmatize``
    and the frame is written back to the same path as UTF-8.

    Args:
        input_csv: Path of the CSV file to read and overwrite.
    """
    df = pd.read_csv(input_csv, encoding='utf-8')

    text = df['text']
    # read_csv loads empty cells as NaN (a float), so calling .strip() on each
    # raw value raises AttributeError on exactly the rows that should be
    # dropped. Check for missing values first, then for blank strings.
    has_text = text.notna() & text.astype(str).str.strip().ne('')

    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the original (SettingWithCopyWarning).
    df = df[has_text].copy()
    df['text'] = df['text'].astype(str).apply(remove_stopwords_and_lemmatize)

    df.to_csv(input_csv, index=False, encoding='utf-8')
    print(f'Arquivo CSV processado salvo em {input_csv}')

In [24]:
# Path of the CSV that the following steps read and overwrite.
input_csv = 'csv/output.csv'
# map_classes writes its result back to the same path, so after this call the
# file's 'class' column holds the mapped values instead of the original labels.
map_classes(input_csv, class_mapping)

Arquivo CSV com classes mapeadas salvo em csv/output.csv


In [26]:
# Overwrites the same CSV again, this time with the cleaned 'text' column.
# NOTE(review): the execution counter jumps from 24 to 26 here, so a cell was
# run or re-run in between — confirm the notebook passes Restart & Run All.
process_csv(input_csv)


Arquivo CSV processado salvo em csv/output.csv
