In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv("./data/salaries.csv", sep=";")
df

Unnamed: 0,neighborhood,no_salary,one_salary,between_one_and_three,between_three_and_five,between_five_ten,more_than_ten
0,Água Verde,3.85,1.79,20.45,25.43,34.0,14.48
1,Badenfurt,0.73,2.84,29.13,31.49,29.42,6.4
2,Boa Vista,1.02,2.72,19.18,30.56,34.8,11.72
3,Bom Retiro,0.74,2.22,18.19,17.94,29.98,30.96
4,Centro,1.36,4.13,26.39,25.52,24.97,17.64
5,Da Glória,1.91,3.82,28.96,32.4,28.01,4.91
6,Do Salto,0.83,3.39,25.47,28.79,27.13,14.39
7,Escola Agrícola,1.61,3.07,22.96,25.86,30.03,16.47
8,Fidélis,0.44,3.13,30.3,35.82,25.99,4.32
9,Fortaleza,3.58,2.54,23.67,28.8,31.89,9.52


In [3]:
df.describe()

Unnamed: 0,no_salary,one_salary,between_one_and_three,between_three_and_five,between_five_ten,more_than_ten
count,35.0,35.0,35.0,35.0,35.0,35.0
mean,1.700286,2.558286,22.777429,27.782857,30.013429,15.168
std,1.26643,0.840009,6.201638,6.545384,3.790997,12.640356
min,0.44,0.4,4.29,7.37,19.25,2.29
25%,0.735,2.08,19.025,24.97,28.235,6.23
50%,1.36,2.5,24.16,29.56,30.73,11.72
75%,2.31,3.015,25.6,32.57,32.405,18.965
max,6.18,4.43,37.68,36.56,38.37,66.15


### Agrupando bairros com base na renda

In [4]:
from typing import List


def get_range(brute_salary: float) -> List[str]:

    if brute_salary < 0:
        return ["no_salary"]

    minimum_salary = 1302

    proximities = {}
    categories = {"no_salary": 0, "one_salary": 1, "between_one_and_three": 3, "between_three_and_five": 5, "between_five_ten": 10, "more_than_ten": 20}

    for category, value in categories.items():
        max_of_range = value * minimum_salary

        if category == "more_than_ten" and brute_salary > max_of_range:
            proximity = 100

        elif brute_salary == 0:
            proximity = 0

        elif max_of_range < brute_salary:
            proximity = (max_of_range / brute_salary) * 100
        
        else:
            proximity = (brute_salary / max_of_range) * 100

        proximities[category] = round(proximity, 2)


    return proximities

In [5]:
get_range(5000)

{'no_salary': 0.0,
 'one_salary': 26.04,
 'between_one_and_three': 78.12,
 'between_three_and_five': 76.8,
 'between_five_ten': 38.4,
 'more_than_ten': 19.2}

Busca bairros com base nas 2 categorias com mais similiaridade

In [6]:
def get_neighbors_by_salary_similarity(salary: float, top: int=5):
    ranges = get_range(salary)

    best_fits = [(None, 0), (None, 0)]

    for category, proximity in ranges.items():
        for fit in best_fits:
            if fit[1] < proximity and category != best_fits[0][0] and category != best_fits[1][0]:
                best_fits.insert(0, (category, proximity))

            if len(best_fits) > 2:
                best_fits.pop()

    return df.sort_values(by=[best_fits[0][0], best_fits[1][0]], ascending=False).iloc[:top, :]

In [7]:
get_neighbors_by_salary_similarity(3000, 5)

Unnamed: 0,neighborhood,no_salary,one_salary,between_one_and_three,between_three_and_five,between_five_ten,more_than_ten
15,Itoupavazinha,0.67,2.5,27.03,36.56,28.69,4.57
24,Testo Salto,0.61,1.97,24.78,36.11,30.73,5.81
8,Fidélis,0.44,3.13,30.3,35.82,25.99,4.32
20,Pogresso,0.68,3.21,26.32,34.54,29.8,5.44
29,Velha Grande,2.62,3.85,37.68,34.32,19.25,2.29


: 