In [None]:
import spacy
import pandas as pd
import logging
import numpy as np
import re

In [None]:
# Configure the root logger: INFO threshold, records formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
# Shared spaCy pipeline, loaded once for the whole notebook;
# remove_stopwords_and_lemmatize() reads this module-level `nlp` object.
# NOTE(review): requires the "en_core_web_lg" model package to be installed
# in the kernel's environment — confirm in setup instructions.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Class names in label order: the position of a name in this tuple is the
# integer it maps to, so new names must be appended at the end to keep the
# existing numbers stable.
# NOTE(review): 'MANAGMENT' is kept exactly as spelled in the original mapping;
# presumably it matches the labels in the data — confirm before correcting it.
_CLASS_NAMES_IN_LABEL_ORDER = (
    'ACCOUNTANT',
    'ADVOCATE',
    'AGRICULTURE',
    'APPAREL',
    'ARTS',
    'AUTOMOBILE',
    'AVIATION',
    'BANKING',
    'BPO',
    'BUSINESS-DEVELOPMENT',
    'CHEF',
    'CONSTRUCTION',
    'CONSULTANT',
    'DESIGNER',
    'DIGITAL-MEDIA',
    'ENGINEERING',
    'FINANCE',
    'FITNESS',
    'HEALTHCARE',
    'HR',
    'INFORMATION-TECHNOLOGY',
    'PUBLIC-RELATIONS',
    'SALES',
    'TEACHER',
    'ARCHITECT',
    'MANAGMENT',
)

# Class name -> integer label (0..25).
class_mapping = {
    class_name: label
    for label, class_name in enumerate(_CLASS_NAMES_IN_LABEL_ORDER)
}

In [None]:
# Seniority label -> integer code.
senior_mapping = dict([
    ('Junior', 0),
    ('Mid-level', 1),
    ('Senior', 2),
])

In [None]:
def remove_stopwords_and_lemmatize(text):
    """Return `text` reduced to space-joined lemmas of its content tokens.

    Steps: delete every run matching ``http\\S+``, collapse all whitespace to
    single spaces, lowercase, run the notebook-level spaCy pipeline ``nlp``,
    then keep the lemma of each token that is neither a stop word nor
    punctuation.

    Args:
        text: The string to clean.

    Returns:
        The kept lemmas joined by single spaces ('' when nothing is kept).
    """
    without_urls = re.sub(r'http\S+', '', text)
    # str.split() with no arguments ignores leading/trailing whitespace and
    # collapses internal runs, so no separate strip() is needed.
    normalized = ' '.join(without_urls.split()).lower()

    lemmas = []
    for tok in nlp(normalized):
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

In [None]:
def map_classes(input_csv, class_mapping, delimiter=";"):
    """Add an integer ``class_number`` column to a CSV file, rewriting it in place.

    Each row's ``class`` value is looked up in `class_mapping`; rows whose
    class is not in the mapping are dropped. The file is then overwritten.

    Note that the file is written back with pandas' default comma separator,
    regardless of the delimiter it was read with. To run this function on a
    file it has already rewritten, pass ``delimiter=","``.

    Args:
        input_csv: Path of the CSV file to read and overwrite.
        class_mapping: Dict mapping class name -> integer label.
        delimiter: Field separator of the input file (default ``";"``).

    Raises:
        KeyError: If the file has no ``class`` column when parsed with
            `delimiter` (typically a delimiter mismatch).
    """
    df = pd.read_csv(input_csv, encoding='utf-8', delimiter=delimiter)
    if 'class' not in df.columns:
        raise KeyError(
            f"'class' column not found in {input_csv} "
            f"(columns: {list(df.columns)}); "
            f"check that delimiter={delimiter!r} matches the file"
        )

    class_numbers = df['class'].map(class_mapping)
    is_known = class_numbers.notna()
    # Build the filtered frame in one step instead of assigning into a
    # filtered frame, which can trigger SettingWithCopyWarning.
    df = df.loc[is_known].assign(class_number=class_numbers[is_known].astype(int))

    df.to_csv(input_csv, index=False)
    logging.info(f'CSV file with mapped classes saved in {input_csv}')

In [None]:
def senior_classes(input_csv, senior_mapping, delimiter=";"):
    """Replace the ``senior`` column of a CSV file with integer codes, in place.

    Each ``senior`` value is looked up in `senior_mapping`; values that are
    missing from the mapping (or map to something non-numeric) become 3.
    The file is then overwritten using pandas' default comma separator.

    Args:
        input_csv: Path of the CSV file to read and overwrite.
        senior_mapping: Dict mapping seniority label -> integer code.
        delimiter: Field separator of the input file (default ``";"``).

    Raises:
        KeyError: If the file has no ``senior`` column when parsed with
            `delimiter` (typically a delimiter mismatch).
    """
    df = pd.read_csv(input_csv, encoding='utf-8', delimiter=delimiter)
    if 'senior' not in df.columns:
        raise KeyError(
            f"'senior' column not found in {input_csv} "
            f"(columns: {list(df.columns)}); "
            f"check that delimiter={delimiter!r} matches the file"
        )

    mapped = pd.to_numeric(df['senior'].map(senior_mapping), errors='coerce')
    # Assign the filled result back to the column. An in-place fillna on
    # df['senior'] operates on a temporary under pandas Copy-on-Write and
    # would leave the NaNs in the frame.
    df['senior'] = mapped.fillna(3).astype(int)

    df.to_csv(input_csv, index=False)
    logging.info(f'CSV file with mapped classes saved in {input_csv}')

In [None]:
def process_csv(input_csv, delimiter=";"):
    """Clean and lemmatize the ``text`` column of a CSV file, in place.

    Rows whose ``text`` is missing or blank are dropped; the remaining texts
    are passed through remove_stopwords_and_lemmatize(). The file is then
    overwritten using pandas' default comma separator.

    NOTE: a later cell of this notebook defines process_csv again (reading
    comma-delimited input); when the notebook runs top to bottom, that later
    definition is the one in effect.

    Args:
        input_csv: Path of the CSV file to read and overwrite.
        delimiter: Field separator of the input file (default ``";"``).
    """
    df = pd.read_csv(input_csv, encoding='utf-8', delimiter=delimiter)

    # Empty cells are read as NaN (a float), so test for missing values
    # before calling string methods on the column.
    text = df['text']
    has_text = text.notna() & text.astype(str).str.strip().ne('')
    # .copy() so the column assignment below targets an independent frame.
    df = df.loc[has_text].copy()

    df['text'] = df['text'].astype(str).apply(remove_stopwords_and_lemmatize)
    df.to_csv(input_csv, index=False, encoding='utf-8')
    logging.info(f'Processed CSV file saved in {input_csv}')

In [None]:
def modify_senior_column(csv_path):
    """Shift the ``senior`` codes of a CSV file down by one, in place.

    Codes 1, 2 and 3 become 0, 1 and 2. Rows with any other ``senior`` value
    are removed. The file at `csv_path` is overwritten.

    Args:
        csv_path: Path of the comma-delimited CSV file to read and overwrite.
    """
    shifted_codes = {1: 0, 2: 1, 3: 2}

    frame = pd.read_csv(csv_path)
    remapped = frame['senior'].map(shifted_codes)
    # Values outside the mapping come back as NaN; those rows are discarded.
    keep = remapped.notna()
    result = frame[keep].assign(senior=remapped[keep].astype(int))
    result.to_csv(csv_path, index=False)

In [None]:
# Path of the CSV file for the processing functions to operate on.
# NOTE(review): in the notebook as shown, this value is reassigned to
# '../csv/output.csv' further down before any call reads it.
input_csv = '../csv/dataset.csv'

In [None]:
def process_csv(input_csv, delimiter=","):
    """Clean and lemmatize the ``text`` column of a CSV file, in place.

    Rows whose ``text`` is missing or blank are dropped; the remaining texts
    are passed through remove_stopwords_and_lemmatize(). The file is then
    overwritten as comma-delimited UTF-8.

    NOTE: this redefinition replaces the earlier process_csv in this notebook,
    which read semicolon-delimited input.

    Args:
        input_csv: Path of the CSV file to read and overwrite.
        delimiter: Field separator of the input file (default ``","``).
    """
    df = pd.read_csv(input_csv, encoding='utf-8', delimiter=delimiter)

    # Empty cells are read as NaN (a float), so test for missing values
    # before calling string methods on the column.
    text = df['text']
    has_text = text.notna() & text.astype(str).str.strip().ne('')
    # .copy() so the column assignment below targets an independent frame.
    df = df.loc[has_text].copy()

    df['text'] = df['text'].astype(str).apply(remove_stopwords_and_lemmatize)
    df.to_csv(input_csv, index=False, encoding='utf-8')
    logging.info(f'Processed CSV file saved in {input_csv}')

In [None]:
# File processed by the cells below; replaces the '../csv/dataset.csv' value
# assigned earlier. The cells below read it as comma-delimited.
input_csv = '../csv/output.csv'

In [None]:
# Clean and lemmatize the 'text' column of input_csv; the file is overwritten.
process_csv(input_csv)

In [None]:
# No earlier cell defines `df`, so load the processed file here; otherwise
# this cell raises NameError when the notebook is run on a fresh kernel.
df = pd.read_csv(input_csv, encoding='utf-8', delimiter=",")

# Number of rows per 'senior' value.
senioridade_counts = df['senior'].value_counts()
print(senioridade_counts)

In [None]:

# Load the processed file (comma-delimited) into `df`, which the sampling cell
# below reads, and print the number of rows per 'senior' value.
df = pd.read_csv(input_csv, encoding='utf-8',delimiter=",")
senioridade_counts = df['senior'].value_counts()
print(senioridade_counts)

In [None]:
# Build a dataset with the same number of rows for each 'senior' value and
# save it to 'sampled_data.csv'.
senior_values = [0, 1, 2]
# NOTE(review): 426 is presumably the row count of one of the seniority
# groups — confirm where this number comes from.
sample_size = 426
# Fixed seed so Restart & Run All reproduces the same sample.
sample_seed = 42

# Sampling is done with replacement, so a group with fewer than `sample_size`
# rows is oversampled (rows repeat). Samples are collected in a list and
# concatenated once instead of growing a DataFrame inside the loop.
level_samples = []
for value in senior_values:
    samples = df[df['senior'] == value].sample(
        n=sample_size, replace=True, random_state=sample_seed
    )
    level_samples.append(samples)

sampled_df = pd.concat(level_samples, ignore_index=True)
sampled_df.to_csv('sampled_data.csv', index=False)