In [314]:
# Utilizando sistema de gerenciador de pacotes.
# Todos os pacotes utilizados na aplicação, é listado no arquivo 'requirements.txt'.
# É obrigatório a execução desta bloco de código antes de executar os demais blocos de códigos.
%pip install -r requirements.txt





[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [315]:
# Importando as bibliotecas necessárias.
import pandas as pd
import matplotlib as plot
import seaborn as sbn
import numpy as np

In [316]:
# Atribuindo a variável 'principal' o arquivo .csv para tratabilidade
df = pd.read_csv(r'00_dataset\cleaned_data.csv', sep=',', encoding='utf-8')
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Company,Location,Description,preprocessed_salary
0,0,data scientist,"tantus technologies, inc.","woodlawn, md","tantus technologies, inc, tantus, - recognized...",137500.0
1,1,data scientist,rogue credit union,"medford, or",department: business intelligence and enterpri...,
2,2,data scientist,knowbe4,"clearwater, fl",we are ranked 1 best place to work in technolo...,85000.0
3,3,data scientist,affinity solutions,"new york, ny","data scientistaffinity solutions, marketing cl...",114500.0
4,4,data scientist,pnnl,"richland, wa",. *organization and job id. *job description. ...,76500.0


In [317]:
# Renomeando as colunas para uma melhor visualização
df.columns = ['ID', 'Name', 'Company', 'Location', 'Description', 'Salary']
df.head()

Unnamed: 0,ID,Name,Company,Location,Description,Salary
0,0,data scientist,"tantus technologies, inc.","woodlawn, md","tantus technologies, inc, tantus, - recognized...",137500.0
1,1,data scientist,rogue credit union,"medford, or",department: business intelligence and enterpri...,
2,2,data scientist,knowbe4,"clearwater, fl",we are ranked 1 best place to work in technolo...,85000.0
3,3,data scientist,affinity solutions,"new york, ny","data scientistaffinity solutions, marketing cl...",114500.0
4,4,data scientist,pnnl,"richland, wa",. *organization and job id. *job description. ...,76500.0


In [318]:
# Verificado que a coluna em questão está para contagem de linhas, a mesma foi retirada do dataframe
df.drop(['ID'], axis=1, inplace=True)
df.head()

Unnamed: 0,Name,Company,Location,Description,Salary
0,data scientist,"tantus technologies, inc.","woodlawn, md","tantus technologies, inc, tantus, - recognized...",137500.0
1,data scientist,rogue credit union,"medford, or",department: business intelligence and enterpri...,
2,data scientist,knowbe4,"clearwater, fl",we are ranked 1 best place to work in technolo...,85000.0
3,data scientist,affinity solutions,"new york, ny","data scientistaffinity solutions, marketing cl...",114500.0
4,data scientist,pnnl,"richland, wa",. *organization and job id. *job description. ...,76500.0


In [319]:
# Chamando a coluna Salary para visualização dos dados em questão
df['Salary'].count()

2277

In [320]:
# Fazendo a contagem de quantos NaN conta dentro da coluna 'Salary'
df['Salary'].isna().sum()

12542

In [321]:
# Atribuindo 0 aos valores ausentes na coluna 'Salary'
df['Salary'] = df['Salary'].fillna(0, inplace=False)
df.head()

Unnamed: 0,Name,Company,Location,Description,Salary
0,data scientist,"tantus technologies, inc.","woodlawn, md","tantus technologies, inc, tantus, - recognized...",137500.0
1,data scientist,rogue credit union,"medford, or",department: business intelligence and enterpri...,0.0
2,data scientist,knowbe4,"clearwater, fl",we are ranked 1 best place to work in technolo...,85000.0
3,data scientist,affinity solutions,"new york, ny","data scientistaffinity solutions, marketing cl...",114500.0
4,data scientist,pnnl,"richland, wa",. *organization and job id. *job description. ...,76500.0


In [322]:
# Configuração para mostrar o conteúdo completo da coluna
pd.set_option('display.max_colwidth', None)

In [323]:
# Resetar para o tamanho padrão (50)
pd.reset_option('display.max_colwidth')

In [324]:
# Função para tratar uma string dividida por vírgulas
def tratar_string(arg):
    partes = arg.split(',')
    
    if len(partes) == 1:   # Se houver apenas uma parte, converte para título (primeira letra de cada palavra maiúscula)
        return partes[0].title()
    elif len(partes) == 2: # Se houver duas partes, converte a primeira para título e a segunda para maiúsculas (removendo espaços em branco desnecessários)
        return f"{partes[0].title()}, {partes[1].strip().upper()}"
    elif len(partes) == 3: # Se houver três partes, converte a primeira para título, a segunda e a terceira para maiúsculas (removendo espaços em branco desnecessários)
        return f"{partes[0].title()}, {partes[1].strip().upper()}, {partes[2].strip().upper()}"
    else:                  # Se houver mais de três partes, retorna a string original
        return arg

# Converte as colunas 'Name' e 'Company' para o formato de título
df['Name'] = df['Name'].str.title()
df['Company'] = df['Company'].str.title()
df['Location'] = df['Location'].apply(tratar_string)  # Aplica a função tratar_string à coluna 'Location' do DataFrame

df.head()

Unnamed: 0,Name,Company,Location,Description,Salary
0,Data Scientist,"Tantus Technologies, Inc.","Woodlawn, MD","tantus technologies, inc, tantus, - recognized...",137500.0
1,Data Scientist,Rogue Credit Union,"Medford, OR",department: business intelligence and enterpri...,0.0
2,Data Scientist,Knowbe4,"Clearwater, FL",we are ranked 1 best place to work in technolo...,85000.0
3,Data Scientist,Affinity Solutions,"New York, NY","data scientistaffinity solutions, marketing cl...",114500.0
4,Data Scientist,Pnnl,"Richland, WA",. *organization and job id. *job description. ...,76500.0


In [325]:
# Definindo uma string padrão 'Scientist' para a variável defaultDataScientist
defaultDataScientist = 'Scientist'
# Definindo uma função chamada 'Job' que categoriza trabalhos com base em palavras-chave
def Job(arg):
    # Dicionário de categorias e suas palavras-chave associadas
    keywords = {
        'Data Scientist': ['scientist', 'science', 'sceintist'],
        'Data Analyst': ['analyst', 'analytics', 'analysis', 'anaylst', 'intelligence', 'model',
                         'big data', 'visual', 'verif', 'report', 'special', 'platf'],
        'Data Engineer': ['engineer'],
        'Machine Learning': ['machine learning'],
        'Data Architect': ['architect'],
        'Statistician': ['math', 'statis'],
        'App Developer': ['devel'],
        'Data Researcher': ['research'],
        'Biologist': ['biolo'],
        'Data Mining': ['mining'],
        'Consultant': ['consult'],
        'Data Manager': ['manage'],
        'Data Strategy': ['strat']
    }
    # Convertendo a entrada para minúsculas para tornar a comparação de palavras-chave case-insensitive
    arg_lower = arg.lower()
    # Iterando sobre as categorias e suas palavras-chave associadas
    for category, keywords_list in keywords.items():   
        if any(keyword in arg_lower for keyword in keywords_list): # Verificando se pelo menos uma palavra-chave está presente na entrada
            return category
    # Se nenhuma correspondência é encontrada, retorna 'Others'
    return 'Others'

dfName = df['Name'].unique()    # Obtendo valores únicos da coluna 'Name' do DataFrame df
resultados = [Job(i) for i in dfName]   # Aplicando a função Job para categorizar os nomes dos trabalhos e armazenando os resultados
resultados_series = pd.Series(resultados).unique()  # Obtendo valores únicos da série de resultados
df_resultados = pd.DataFrame(resultados_series, columns=['Names'])  # Criando um DataFrame com os resultados

df_resultados

Unnamed: 0,Names
0,Data Scientist
1,Data Engineer
2,Data Analyst
3,Others
4,Data Manager
5,Data Architect
6,App Developer
7,Biologist
8,Data Researcher
9,Consultant


In [348]:
def get_state(location):
    state_mapping = {
        'Alabama, AL': ['alabama', 'huntsville', 'alabaster', 'birmingham', 'mobile', 'jefferson county', 'madison', 'montgomery'], 
        'Alaska, AK': ['alaska', 'anchorage'], 'Arizona, AZ': ['arizona', 'phoenix', 'tempe', 'scottsdale', 'chandler', 'gilbert',
        'mesa', 'dc ranch', 'luke afb'], 
        'Arkansas, AR': ['arkansas', 'rogers', 'little rock', 'bentonville', 'conway', 'mountain home'],
        'American Samoa, AS': ['american samoa'], 
        'California, CA': ['california', 'san jose', 'mountain view', 
        'san francisco', 'redlands', 'fremont', 'san diego', 'milpitas', 'palo alto', 'santa barbara', 'novato', 'santa fe springs',
        'los angeles', 'valencia', 'agoura hills', 'san rafael', 'concord', 'highland', 'burbank', 'carlsbad', 'scotts valley',
        'camarillo', 'irvine', 'woodland hills', 'redwood city', 'alameda', 'san ramon', 'monterey', 'fountain valley',
        'santa clara', 'torrance', 'brisbane', 'oakland', 'sunnyvale', 'san carlos', 'san mateo', 'clovis', 'menlo park', 'duarte',
        'encinitas', 'long beach', 'thousand oaks', 'universal city', 'anaheim', 'culver city', 'cerritos', 'santa monica', 'el segundo',
        'glendale', 'brea', 'westmont', 'pasadena', 'irwindale', 'calabasas', 'hermosa beach', 'whittier', 'northridge', 'marina del rey',
        'monaco', 'venice', 'sherman oaks', 'wilmington', 'monrovia', 'carson', 'seal beach', 'poway', 'la jolla', 'chula vista', 
        'national city', 'cupertino', 'union city', 'livermore', 'pleasanton', 'stanford', 'campbell', 'los gatos', 'los altos', 
        'parlier', 'pomona', 'orange', 'emeryville', 'foster city', 'aliso viejo', 'dublin', 'rancho cucamonga', 'costa mesa', 'lake forest',
        'west hollywood', 'walnut creek', 'vallejo', 'petaluma', 'sacramento', 'indio', 'jamul', 'columbia', 'gold river', 'century city',
        'sonoma', 'playa vista', 'santa cruz', 'seaside', 'redwood', 'temecula', 'rancho cordova', 'hollywood', 'ventura', 'roseville', 
        'newport beach', 'oxnard', 'fresno', 'malibu', 'davis', 'half moon bay', 'san bruno', 'san dimas', 'manhattan beach', 'ontario', 
        'diamond bar', 'silicon valley', 'santa ana', 'fullerton', 'elk grove', 'hawthorne', 'south berkeley', 'foothill ranch'], 
        'Colorado, CO': ['colorado', 'denver', 'longmont', 'boulder', 'aurora', 'arvada', 'englewood', 'vail', 'broomfield', 'westminster',
        'greenwood village', 'centennial', 'applewood', 'lone tree', 'loveland', 'lakewood', 'greeley', 'louisville', 'littleton',
        'lone tree', 'fort collins', 'loveland'],
        'Connecticut, CT': ['connecticut'], 'Delaware, DE': ['delaware'], 'District of Columbia, DC': ['district of columbia'],
        'Florida, FL': ['florida', 'clearwater', 'fort lauderdale', 'miami lakes', 'miami', 'tampa', 'winter park', 'melbourne', 'tallahassee',
        'orlando', 'jacksonville', 'gainesville', 'west palm', 'boca raton', 'pensacola', 'bradenton', 'ocala', 'delray', 'deerfield',
        'saint petersburg', 'juno', 'lake mary', 'plantation', 'fort myers', 'medley', 'lake nona', 'pinellas park', 'palm', 'jupiter',
        'coral gables', 'davie', 'lakeland', 'dania', 'st petersburg', 'daytona', 'key west'], 
        'Georgia, GA': ['georgia'], 'Guam, GU': ['guam'], 'Hawaii, HI': ['hawaii'], 'Idaho, ID': ['idaho'], 
        'Illinois, IL': ['illinois', 'chicago', 'kingdom', 'peoria', 'hoopeston', 'bloomington', 'schaumburg', 'evanston', 'melrose park',
        'lemont', 'rosemont', 'northbrook', 'elmhurst', 'downers grove', 'rolling meadows', 'naperville', 'maywood', 'oak brook',
        'des plaines', 'hines', 'evergreen park', 'bridgeview', 'libertyville', 'round lake', 'glenview', 'dekalb', 'time', 'glenview',
        'warrenville', 'scott afb', 'buffalo grove', 'champaign', 'westchester', 'mettawa', 'lake county', 'kankakee', 'campus', 'waukegan',
        'lake bluff', 'batavia', 'bradley'], 'Indiana, IN': ['indiana'], 'Iowa, IA': ['iowa'],
        'Kansas, KS': ['kansas'], 'Kentucky, KY': ['kentucky'], 'Louisiana, LA': ['louisiana'],
        'Maine, ME': ['maine'], 'Maryland, MD': ['maryland', 'woodlawn', 'linthicum heights', 'baltimore'], 
        'Massachusetts, MA': ['massachusetts', 'cambridge', 'worcester', 'ipswich', 'marlborough', 'boston', 'framingham', 'north reading',
        'woburn', 'springfield', 'quincy', 'newton', 'natick', 'burlington', 'waltham', 'wellesley', 'watertown', 'holyoke', 'lakeville',
        'beverly', 'hanscom afb', 'stoughton', 'mansfield', 'newburyport', 'lynn', 'hingham', 'norwell', 'foxborough', 'attleboro',
        'roxbury', 'arlington', 'wilbraham', 'wakefield', 'southborough', 'somerville', 'northborough', 'rockland', 'billerica', 'westborough',
        'needham'],
        'Michigan, MI': ['michigan'], 'Minnesota, MN': ['minnesota', 'apple valley'], 'Mississippi, MS': ['mississippi'],
        'Missouri, MO': ['missouri'], 'Montana, MT': ['montana'], 'Nebraska, NE': ['nebraska'],
        'Nevada, NV': ['nevada'], 'New Hampshire, NH': ['new hampshire'], 'New Jersey, NJ': ['new jersey', 'fort lee', 'newark'],
        'New Mexico, NM': ['new mexino'], 'New York, NY': ['new york', 'new hyde park,'], 'North Carolina, NC': ['north carolina'],
        'North Dakota, ND': ['north dakota'], 'Northern Mariana Islands, MP': ['northern mariana islands'],
        'Ohio, OH': ['ohio', 'bedford', 'brunswick', 'mentor', 'oincinnati', 'richfield', 'dayton', 'beavercreek', 'columbus', 'gahanna',
        'cincinnati', 'westlake', 'west jefferson, oh', 'westerville', 'lockbourne', 'slough', 'blacklick', 'hilliard', 'norwalk', 'brooklyn, oh',
        'cleveland', 'mayfield', 'valley view', 'mason', 'north canton', 'grove city', 'marysville, oh', 'piketon', 'colerain',
        'wright-patterson afb', 'fairfield', 'wickliffe', 'findlay', 'west chester, oh', 'athens', 'akron', 'franklin', 'ironton', 'toledo'], 
        'Oklahoma, OK': ['oklahoma'], 'Oregon, OR': ['oregon', 'medford'], 
        'Pennsylvania, PA': ['Pennsylvania', 'philadelphia', 'coraopolis', 'pittsburgh', 'harrisburg', 'phila', 'collegeville', 'west reading',
        'king of prussia', 'newtown', 'blue bell', 'exton', 'valley forge', 'bala cynwyd', 'conshohocken', 'west chester', 'wayne', 'horsham',
        'boothwyn', 'bensalem', 'feasterville trevose', 'malvern', 'paoli', 'allegheny west', 'radnor', 'plymouth meeting', 'bristol', 'doylestown',
        'chester township', 'ridley park', 'west point', 'warminster', 'media', 'berwyn', 'norristown', 'north wales', 'oaks', 'audubon', 'trevose',
        'clearfield', 'flourtown', 'annville', 'warrendale', 'alburtis', 'ambler', 'bala-cynwyd', 'irwin', 'chester', 'etters', 'hershey', 'wexford',
        'mechanicsburg', 'state college', 'allison park', 'ivyland', 'langhorne', 'catawissa', 'elizabethtown', 'bryn mawr', 'danville', 'canonsburg'],
        'Puerto Rico, PR': ['puerto rico'], 'Rhode Island, RI': ['rhode island'], 'South Carolina, SC': ['southc carolina'],
        'South Dakota, SD': ['south dakota'], 'Tennessee, TN': ['tennessee'], 
        'Texas, TX': ['texas', 'dallas', 'plano', 'austin', 'houston', 'fort worth', 'burleson', 'irving', 'san antonio', 'alvin', 'stafford',
        'pearland', 'spring', 'sugar land', 'webster', 'baytown', 'nassau bay', 'west university place', 'bellaire', 'schertz', 'lackland afb',
        'allen', 'richardson', 'grapevine', 'grand prairie', 'addison', 'lewisville', 'coppell', 'farmers branch', 'carrollton', 'southlake',
        'round rock', 'west lake hills', 'cedar park', 'kyle', 'frisco', 'el paso', 'lufkin', 'garland', 'la porte', 'abilene', 'college station',
        'georgetown', 'italy', 'belton', 'laredo', 'forney', 'canyon'],
        'Trust Territories, TT': ['trust territories'], 'Utah, UT': ['utah'], 'Vermont, VT': ['vermont'],
        'Virginia, VA': ['virginia'], 'Virgin Islands, VI': ['virgin islands'], 'Washington, WA': ['washington', 'richland', 'seattle'],
        'West Virginia, WV': ['west virginia'], 'Wisconsin, WI': ['wisconsin'], 'Wyoming, WY': ['wyoming']
    }

    location_lower = location.lower()
    for state, keywords_list in state_mapping.items():
        if any(keyword in location_lower for keyword in keywords_list):
            return state
    return location


df_locations = df['Location'].unique()
resultados = [get_state(location) for location in df_locations]
resultados_series = pd.Series(resultados).unique()
df_resultados = pd.DataFrame(resultados_series, columns=['States'])

df_resultados
pd.set_option('display.max_rows', None)
print(df_resultados)


                                                States
0                                         Maryland, MD
1                                           Oregon, OR
2                                          Florida, FL
3                                         New York, NY
4                                       Washington, WA
5                                       California, CA
6                                        Minnesota, MN
7                                       New Jersey, NJ
8                                         Illinois, IL
9                                            Texas, TX
10                                   Massachusetts, MA
11                                       Chantilly, VA
12                                             Toronto
13                                        Colorado, CO
14                                            Montreal
15                                         Herndon, VA
16                                         Detroit, MI
17        

In [11]:
# Gerando a classe do dataframe que utilizaremos no projeto
class Science:
    def __init__(self, name, company, location, description, salary):
        self.name = name
        self.company = company
        self.location = location
        self.description = description
        self.salary = salary
    
    @staticmethod
    def showById(lst, idx):
        if 0 <= idx < len(lst):
            print(f'Recuperado da lista o objeto de id: {idx}')
            item = lst[idx]
            print(f"Name: {item.name}," +
                  f"\nCompany: {item.company}," +
                  f"\nLocation: {item.location}," +
                  f"\nDescription: {item.description}," +
                  f"\nSalary: {item.salary}")
        else:
            print(f"O index informado ({idx}) não existe, por favor, selecione algo entre 0 e {len(lst)-1}.")

    @staticmethod
    def showAll(lst, top = None):
        if top is not None:
            print(f'Mostrando o(s) {top} primeiro(s) registro(s).')
            lst = lst[:top]
            
        for idx, item in enumerate(lst):
            print(f"ID: {idx}, Name: {item.name}, Company: {item.company}, Location: {item.location}, Description: {item.description}, Salary: {item.salary}")

list_science = [Science(row['Name'], row['Company'], row['Location'], row['Description'], row['Salary']) for index, row in df.iterrows()]

Science.showById(list_science, 20000)
Science.showAll(list_science, 2)

O index informado (20000) não existe, por favor, selecione algo entre 0 e 14818.
Mostrando o(s) 2 primeiro(s) registro(s).
ID: 0, Name: data scientist, Company: tantus technologies, inc., Location: woodlawn, md, Description: tantus technologies, inc, tantus, - recognized by the washington post as a top workplace - is seeking a data scientist with focus on artifical intellenge, machine learning, nlp and federal government healthcare space to help tantus build solutions for our customers that enable them to extract value from their data. drive the development of ai, ml, nlp solutions for tantus emerging technologies teamassist the team in identifying, building and optimizing machine learning algorithms and models, annotations, entity recognition and custom classifier models, assist with defining and driving ai strategiesbachelor’s or master’s degree in data science or artificial intelligence and machine learning, strongly preferred, ; bachelor’s or master’s degree in applied mathematics 