In [4]:
import spacy
import pandas as pd
import logging

In [5]:
# Root logger setup: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [6]:
# Load the spaCy "en_core_web_lg" pipeline once at module level;
# remove_stopwords_and_lemmatize() reads this shared `nlp` object.
nlp = spacy.load("en_core_web_lg")

In [7]:
# Category label -> integer id. Labels are listed in alphabetical order and
# each id is simply the label's position in that list (0..23).
class_mapping = {
    label: index
    for index, label in enumerate((
        'ACCOUNTANT',
        'ADVOCATE',
        'AGRICULTURE',
        'APPAREL',
        'ARTS',
        'AUTOMOBILE',
        'AVIATION',
        'BANKING',
        'BPO',
        'BUSINESS-DEVELOPMENT',
        'CHEF',
        'CONSTRUCTION',
        'CONSULTANT',
        'DESIGNER',
        'DIGITAL-MEDIA',
        'ENGINEERING',
        'FINANCE',
        'FITNESS',
        'HEALTHCARE',
        'HR',
        'INFORMATION-TECHNOLOGY',
        'PUBLIC-RELATIONS',
        'SALES',
        'TEACHER',
    ))
}

In [8]:
def remove_stopwords_and_lemmatize(text):
    """Return ``text`` reduced to the lemmas of its content tokens.

    The input has its whitespace collapsed to single spaces and is
    lower-cased before being run through the module-level ``nlp`` pipeline.
    Tokens flagged as stop words (``is_stop``) or punctuation (``is_punct``)
    are dropped; the remaining tokens' ``lemma_`` values are joined with
    single spaces.

    Args:
        text: String to clean.

    Returns:
        A space-separated string of lemmas (empty if nothing is kept).
    """
    # str.split() with no argument already ignores leading/trailing blanks.
    normalized = ' '.join(text.split()).lower()

    lemmas = []
    for tok in nlp(normalized):
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)


In [9]:
def map_classes(input_csv, class_mapping, output_csv=None):
    """Encode the ``class`` column of a CSV file using ``class_mapping``.

    The function is safe to run more than once on the same file: values that
    are already one of the mapping's ids are kept as they are. (A plain
    ``Series.map(dict)`` would turn them into NaN on a second run, wiping the
    labels of a file that is overwritten in place.)

    Values that are neither a known label nor a known id are written as
    missing values, and a warning listing them is logged.

    Args:
        input_csv: Path of the CSV to read; it must contain a ``class`` column.
        class_mapping: Dict mapping each class label to its id.
        output_csv: Path to write the result to. Defaults to ``input_csv``,
            i.e. the file is overwritten in place.
    """
    if output_csv is None:
        output_csv = input_csv

    df = pd.read_csv(input_csv)
    known_ids = set(class_mapping.values())

    def encode(value):
        if value in class_mapping:
            return class_mapping[value]
        if value in known_ids:
            # Already encoded by a previous run; leave untouched.
            return value
        return float('nan')

    encoded = df['class'].map(encode)

    # Report labels that could not be encoded instead of dropping them silently.
    unknown = df.loc[encoded.isna() & df['class'].notna(), 'class'].unique()
    if len(unknown) > 0:
        logging.warning(f'Classes sem mapeamento (gravadas como vazio): {sorted(map(str, unknown))}')

    df['class'] = encoded
    df.to_csv(output_csv, index=False)
    logging.info(f'Arquivo CSV com classes mapeadas salvo em {output_csv}')

In [10]:
def process_csv(input_csv, output_csv=None):
    """Drop rows with no text and lemmatize the ``text`` column of a CSV.

    Each remaining text is passed through ``remove_stopwords_and_lemmatize``
    (defined elsewhere in this notebook) and the result is written back as
    UTF-8 without the index.

    Args:
        input_csv: Path of the UTF-8 CSV to read; it must contain a ``text``
            column.
        output_csv: Path to write the result to. Defaults to ``input_csv``,
            i.e. the file is overwritten in place.
    """
    if output_csv is None:
        output_csv = input_csv

    df = pd.read_csv(input_csv, encoding='utf-8')

    # read_csv parses empty fields as NaN (a float), so calling str.strip() on
    # every cell would raise on exactly the rows this filter is meant to drop.
    text = df['text']
    has_text = text.notna() & (text.astype(str).str.strip() != '')

    # Explicit copy: assigning into a filtered view can raise
    # SettingWithCopyWarning.
    df = df[has_text].copy()
    df['text'] = df['text'].astype(str).apply(remove_stopwords_and_lemmatize)

    df.to_csv(output_csv, index=False, encoding='utf-8')
    logging.info(f'Arquivo CSV processado salvo em {output_csv}')

In [11]:
# Path (relative to the notebook's working directory) of the CSV that
# map_classes() and process_csv() read and then write back to.
input_csv = '../csv/output.csv'

In [12]:
# Run the two preprocessing steps in order on the same file: first encode the
# `class` column, then clean the `text` column. Each step reads input_csv and
# saves its result back to that same path.
map_classes(input_csv, class_mapping)
process_csv(input_csv)

2024-06-19 15:17:54,878 - INFO - Arquivo CSV com classes mapeadas salvo em ../csv/output.csv
2024-06-19 15:22:58,566 - INFO - Arquivo CSV processado salvo em ../csv/output.csv
