In [31]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd



In [32]:
# Fetch the NLTK data packages this notebook relies on.
nltk.download('punkt')
# Recent NLTK releases (3.9 and later) load the tokenizer used by
# word_tokenize from 'punkt_tab' rather than 'punkt'; without it a fresh
# environment fails with LookupError on the first word_tokenize call.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Shared by remove_stopwords_and_lemmatize: one lemmatizer instance and the
# English stop words held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/vini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/vini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vini/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
# Category label -> integer class id. The labels are listed in alphabetical
# order and each id is simply the label's position in the list
# ('ACCOUNTANT' -> 0, ..., 'TEACHER' -> 23).
class_mapping = dict(
    (label, class_id)
    for class_id, label in enumerate([
        'ACCOUNTANT',
        'ADVOCATE',
        'AGRICULTURE',
        'APPAREL',
        'ARTS',
        'AUTOMOBILE',
        'AVIATION',
        'BANKING',
        'BPO',
        'BUSINESS-DEVELOPMENT',
        'CHEF',
        'CONSTRUCTION',
        'CONSULTANT',
        'DESIGNER',
        'DIGITAL-MEDIA',
        'ENGINEERING',
        'FINANCE',
        'FITNESS',
        'HEALTHCARE',
        'HR',
        'INFORMATION-TECHNOLOGY',
        'PUBLIC-RELATIONS',
        'SALES',
        'TEACHER',
    ])
)

In [34]:
def remove_stopwords_and_lemmatize(text):
    """Normalize a piece of text for downstream processing.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    appear in the module-level ``stop_words`` set are discarded and the rest
    are passed through the module-level ``lemmatizer``.

    Args:
        text: String to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept = []
    for word in word_tokenize(text.lower()):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

In [35]:
def map_classes(input_csv, class_mapping):
    """Replace the labels in the CSV's 'class' column with their integer ids.

    The file at ``input_csv`` is read, its 'class' column is translated through
    ``class_mapping`` and the result is written back to the same path. A count
    per class is then printed.

    The function is safe to run more than once on the same file: values that
    are already one of the ids in ``class_mapping`` are left untouched instead
    of being turned into NaN by a second lookup.

    Args:
        input_csv: Path of the CSV file to rewrite in place.
        class_mapping: Dict of class label -> class id.

    Raises:
        ValueError: If the 'class' column holds a non-missing value that is
            neither a known label nor a known id. The file is not modified in
            that case.
    """
    df = pd.read_csv(input_csv)

    # Known labels become ids; anything else (already-mapped ids, missing
    # values, unexpected labels) passes through unchanged for validation below.
    mapped = df['class'].map(lambda label: class_mapping.get(label, label))

    valid_ids = set(class_mapping.values())
    unknown = mapped[mapped.notna() & ~mapped.isin(valid_ids)]
    if not unknown.empty:
        # Fail before writing so the file on disk is never silently corrupted.
        raise ValueError(
            f"Unknown class labels in {input_csv}: {sorted(set(map(str, unknown)))}"
        )

    df['class'] = mapped
    df.to_csv(input_csv, index=False)
    print(f'Arquivo CSV com classes mapeadas salvo em {input_csv}')

    # Reverse lookup built once; setdefault keeps the first label when several
    # labels share an id, matching a first-match scan of the mapping.
    id_to_name = {}
    for name, class_id in class_mapping.items():
        id_to_name.setdefault(class_id, name)

    category_counts = df['class'].value_counts().sort_index()
    for category_id, count in category_counts.items():
        print(f"{id_to_name[category_id]} = {count}")

In [36]:
def process_csv(input_csv):
    """Clean the 'text' column of a CSV file and save it back in place.

    Rows whose 'text' is missing or contains only whitespace are dropped; the
    remaining texts are passed through ``remove_stopwords_and_lemmatize``. The
    result overwrites the file at ``input_csv``.

    Args:
        input_csv: Path of the UTF-8 encoded CSV file to rewrite.
    """
    df = pd.read_csv(input_csv, encoding='utf-8')

    # read_csv parses empty cells as NaN (a float), so calling ``.strip()`` on
    # each value raises AttributeError for exactly the rows we want to drop.
    # Treat missing and whitespace-only text the same way instead.
    text = df['text']
    has_text = text.notna() & text.astype(str).str.strip().ne('')

    # Explicit copy so the column assignment below writes to an independent
    # frame rather than a slice of the one that was just read.
    df = df.loc[has_text].copy()

    df['text'] = df['text'].apply(remove_stopwords_and_lemmatize)
    df.to_csv(input_csv, index=False, encoding='utf-8')
    print(f'Arquivo CSV processado salvo em {input_csv}')

In [37]:
# Path of the CSV to transform; map_classes reads it and writes the result
# back to this same path, then prints the number of rows per class.
input_csv = 'csv/output.csv'
map_classes(input_csv, class_mapping)

Arquivo CSV com classes mapeadas salvo em csv/output.csv
ACCOUNTANT = 118
ADVOCATE = 118
AGRICULTURE = 63
APPAREL = 97
ARTS = 103
AUTOMOBILE = 36
AVIATION = 117
BANKING = 115
BPO = 22
BUSINESS-DEVELOPMENT = 120
CHEF = 118
CONSTRUCTION = 112
CONSULTANT = 115
DESIGNER = 107
DIGITAL-MEDIA = 96
ENGINEERING = 118
FINANCE = 118
FITNESS = 117
HEALTHCARE = 115
HR = 110
INFORMATION-TECHNOLOGY = 120
PUBLIC-RELATIONS = 111
SALES = 116
TEACHER = 102


In [38]:
# Clean the 'text' column of the same file; process_csv saves the result back
# to input_csv.
process_csv(input_csv)


Arquivo CSV processado salvo em csv/output.csv
