In [1]:
import numpy as np
from matplotlib import pyplot as plt
from dataclasses import dataclass, asdict, field
from itertools import count
import random, itertools, statistics

In [13]:
class Board:
    PERCENTAGE_ATTRIBUTES = ['white', 'latino', 'black', 'white_salary', 'latino_salary', 'black_salary', 'children', 'pets']
    POPULATION = ['population']
    
    def __init__(self, board_size) -> None:
        self.board = self.generate_board(board_size)
        self.groups = {}

    @staticmethod
    def generate_board(board_size):
        return np.matrix([[District() for _ in range(board_size)] for _ in range (board_size)])
    
    def get_int_board(self):
        int_board = np.zeros((self.board.shape[0], self.board.shape[1]))
        for i, j in itertools.product(range(self.board.shape[0]), range(self.board.shape[1])):
            int_board[i, j] = repr(self.board[i, j])
        return int_board
        
    def count_unique(self):
        unique = set()
        for i, j in itertools.product(range(self.board.shape[0]), range(self.board.shape[1])):
            unique.add(self.board[i,j].identifier)
        return len(unique)

    def merge_units(self, u_1xy, u_2xy):
        unit_1_x, unit_1_y = u_1xy
        unit_1 = self.board[unit_1_x, unit_1_y]
        unit_2_x, unit_2_y = u_2xy
        unit_2 = self.board[unit_2_x, unit_2_y]
        
        if unit_1.identifier not in self.groups.keys() and unit_2.identifier not in self.groups.keys():
            group = Group()
            group.add_unit(unit_1)
            group.add_unit(unit_2)
            self.groups[group.identifier] = group
            self.board[unit_1_x, unit_1_y] = group
            self.board[unit_2_x, unit_2_y] = group
        
        if unit_1.identifier in self.groups.keys() and unit_2.identifier not in self.groups.keys():
            self.groups[unit_1.identifier].add_unit(unit_2)
            self.board[unit_2_x, unit_2_y] = self.groups[unit_1.identifier]

        if unit_1.identifier not in self.groups.keys() and unit_2.identifier in self.groups.keys():
            self.groups[unit_2.identifier].add_unit(unit_1)
            self.board[unit_1_x, unit_1_y] = self.groups[unit_2.identifier]
            
            
        if unit_1.identifier in self.groups.keys() and unit_2.identifier in self.groups.keys():
            self.groups[unit_1.identifier].add_unit(unit_2)
            for i, j in itertools.product(range(self.board.shape[0]), range(self.board.shape[1])):
                if self.board[i, j] == unit_2 or self.board[i, j] == unit_1: self.board[i, j] = self.groups[unit_1.identifier]
            del self.groups[unit_2.identifier]
            
    def clusters_raport(self):
        used_ids = []
        
        print("Clusters: ")
        for i, j in itertools.product(range(self.board.shape[0]), range(self.board.shape[1])):
            if self.board[i, j].__class__.__name__ == 'Group' and self.board[i, j].identifier not in used_ids:
                used_ids.append(self.board[i, j].identifier)
                print(f"Amount of units: {len(self.board[i, j].units)}, average cluster information: {str(self.board[i, j])}")
                print(f"Standard deviation: {self.board[i, j].group_standard_deviation()}")
                for unit in self.board[i, j].units:
                    print(f"District data: {str(unit)}")
                print("\n")
                
        print("Single districts: ")
        for i, j in itertools.product(range(self.board.shape[0]), range(self.board.shape[1])):
            if self.board[i, j].__class__.__name__ == 'District' and self.board[i, j].identifier not in used_ids:
                used_ids.append(self.board[i, j].identifier)
                print(f"District data: {str(self.board[i, j])}")
        
    def show_board(self) -> None:
        plt.figure()
        plt.imshow(self.get_int_board(), cmap='gray')
        plt.show()

    @staticmethod
    def bergman_divergence(p, q):
        pd = asdict(p)
        qd = asdict(q)
        result = 0
        for characteristic in Board.PERCENTAGE_ATTRIBUTES:
            result += abs(pd[characteristic] - qd[characteristic])
        return result

    @staticmethod
    def kullback_leibler_divergence(p, q):
        pd = asdict(p)
        qd = asdict(q)
        result = 0
        for characteristic in Board.PERCENTAGE_ATTRIBUTES:
            try:
                result += pd[characteristic] * np.log2(pd[characteristic]/qd[characteristic])
            except ZeroDivisionError:
                ...
        return result
    
    def agglompartition(self, k, divergence_function):
        def neighbours(x, y):
            _min_divergence = np.Inf
            min_x, min_y = 0, 0
            
            for i, j in itertools.product([-1, 0, 1], [-1, 0, 1]):
                if i == 0 and j == 0: continue
                if not (0 <= x+i < self.board.shape[0] and 0 <= y+j < self.board.shape[1]): continue
                if self.board[x, y].identifier == self.board[x+i, y+j].identifier: continue
                divergence = divergence_function(self.board[x, y], self.board[x+i, y+j])
                
                if divergence < _min_divergence:
                    _min_divergence = divergence
                    min_x, min_y = x+i, y+j
                    
            return _min_divergence, min_x, min_y
        
        R_size = self.board.shape[0]
        unique = self.count_unique()
        
        while unique > k:
            min_divergence = np.Inf
            min_x, min_y = np.Inf, np.Inf
            min_x1, min_y1 = np.Inf, np.Inf
            
            for x, y in itertools.product(range(R_size), range(R_size)):
                divergence_min_temp, min_x_temp, min_y_temp = neighbours(x, y)
                if divergence_min_temp < min_divergence:
                    min_divergence = divergence_min_temp
                    min_x, min_y = min_x_temp, min_y_temp
                    min_x1, min_y1 = x, y
                    
            # print(f"""to merge: 
            #       divergence: {min_divergence}
            #       {min_x, min_y}: id {b.board[min_x, min_y].identifier, b.board[min_x, min_y].latino, b.board[min_x, min_y].black, b.board[min_x, min_y].white}
            #       {min_x1, min_y1}: id {b.board[min_x1, min_y1].identifier, b.board[min_x1, min_y1].latino, b.board[min_x1, min_y1].black, b.board[min_x1, min_y1].white}
            #       """)
            # print(b.board, "\n\n")
            
            self.merge_units((min_x, min_y), (min_x1, min_y1))
            unique = self.count_unique()
            
        print(self.clusters_raport())
        print(self.board)
            
        
@dataclass(frozen=False)
class District:
    identifier: int = field(default_factory=count().__next__, init=False, repr=True)
    __min_salary: int = 3010
    __max_salary: int = 6156
    latino: int = field(init=False)
    black: int = field(init=False)
    white: int = field(init=False)
    latino_salary: int = field(init=False)
    black_salary: int = field(init=False)
    white_salary: int = field(init=False)
    children: int = field(init=False)
    pets: int = field(init=False)
    population: int = field(init=False)
    
    def __post_init__(self):
        self.latino = random.randint(0,100)
        self.white = random.randint(0,100-self.latino)
        self.black = 100-self.latino-self.white
        self.latino_salary = int(random.randint(self.__min_salary, self.__max_salary) / self.__max_salary * 100)
        self.black_salary = int(random.randint(self.__min_salary, self.__max_salary) / self.__max_salary * 100)
        self.white_salary = int(random.randint(self.__min_salary, self.__max_salary) / self.__max_salary * 100)
        self.children = random.randint(0, 100)
        self.pets = random.randint(0, 100)
        self.population = random.randint(1000, 10000)
        
    def __repr__(self) -> str:
        return str(self.identifier)
    
    def __str__(self) -> str:
        return f"{self.__class__.__name__} id: {self.identifier}, black: {self.black}%, white: {self.white}%, latino: {self.latino}%, latino salary: {self.latino_salary}%, black salary: {self.black_salary}%, white salary: {self.white_salary}%, pets: {self.pets}%, children: {self.children}% population: {self.population}"
       
@dataclass(frozen=False)
class Group(District):
    def __post_init__(self):
        self.units = []
        self.latino: int = 0
        self.black: int = 0
        self.white: int = 0
        self.latino_salary: int = 0
        self.black_salary: int = 0
        self.white_salary: int = 0
        self.children = int = 0
        self.pets = int = 0
        self.population: int = 0
    
    def add_unit(self, unit):
        if unit.__class__.__name__ == 'District':
            self.units.append(unit)
            self.update_group_info(unit)
        elif unit.__class__.__name__ == "Group":
            self.units.extend(unit.units)
            for unit in unit.units:
               self.update_group_info(unit)
               
    def __str__(self) -> str:
        return super().__str__()
        
    def __repr__(self):
        return super().__repr__()
    
    def update_group_info(self, unit) -> None:
        for characteristic in Board.PERCENTAGE_ATTRIBUTES:
            exec(f"self.{characteristic} = int(((self.{characteristic} * self.population / 100 + unit.{characteristic} * unit.population / 100) / (unit.population + self.population) ) * 100) ")
        self.population += unit.population
        
    def group_standard_deviation(self):
        result = {k: [] for k in Board.PERCENTAGE_ATTRIBUTES}
        for unit in self.units:
            for characteristic in Board.PERCENTAGE_ATTRIBUTES:
                result[characteristic].append(eval(f"unit.{characteristic}"))
        for characteristic in Board.PERCENTAGE_ATTRIBUTES:
            result[characteristic] = statistics.stdev(result[characteristic])
        return result



In [15]:
b = Board(3)
print(b.board)
b.agglompartition(4, b.bergman_divergence)


[[12 13 14]
 [15 16 17]
 [18 19 20]]
Clusters: 
Amount of units: 5, average cluster information: Group id: 21, black: 14%, white: 18%, latino: 65%, latino salary: 70%, black salary: 93%, white salary: 73%, pets: 54%, children: 18% population: 28870
Standard deviation: {'white': 12.116104984688768, 'latino': 11.344602240713423, 'black': 7.981227975693966, 'white_salary': 11.717508267545622, 'latino_salary': 13.0, 'black_salary': 7.5960516059331775, 'children': 38.494155400528015, 'pets': 21.568495543268657}
District data: District id: 14, black: 13%, white: 42%, latino: 45%, latino salary: 81%, black salary: 90%, white salary: 60%, pets: 39%, children: 48% population: 1232
District data: District id: 13, black: 3%, white: 28%, latino: 69%, latino salary: 82%, black salary: 94%, white salary: 71%, pets: 26%, children: 31% population: 5959
District data: District id: 16, black: 20%, white: 18%, latino: 62%, latino salary: 62%, black salary: 95%, white salary: 62%, pets: 48%, children: 4% 

Group id: 26, black: 6%, white: 6%, latino: 83%, latino salary: 70%, black salary: 82%, white salary: 67%, pets: 42%, children: 28% population: 18801
{'white': [], 'latino': [], 'black': [], 'white_salary': [], 'latino_salary': [], 'black_salary': [], 'children': [], 'pets': []}
{'white': [18, 24, 4, 7, 0, 5], 'latino': [67, 73, 90, 85, 100, 58], 'black': [15, 3, 6, 8, 0, 37], 'white_salary': [75, 81, 68, 49, 64, 68], 'latino_salary': [71, 89, 76, 64, 58, 63], 'black_salary': [98, 98, 82, 66, 85, 54], 'children': [35, 34, 28, 6, 41, 22], 'pets': [4, 43, 35, 16, 84, 57]}
{'white': 9.266426855410161, 'latino': 15.61302874738488, 'black': 13.48703080740902, 'white_salary': 10.894952959971878, 'latino_salary': 11.19672571186178, 'black_salary': 17.592612085759182, 'children': 12.43650540411842, 'pets': 28.743115117653247}


In [10]:
b = Board(5)
print(b.board)
agglompartition(b, 15, b.kullback_leibler_divergence)

[[0 1 2 3 4]
 [5 6 7 8 9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]
Clusters: 
Amount of units: 5, average cluster information: Group id: 27, black: 18%, white: 10%, latino: 68%, latino salary: 75%, black salary: 67%, white salary: 74%, pets: 68%, children: 51% population: 34324
District data: District id: 0, black: 0%, white: 2%, latino: 98%, latino salary: 76%, black salary: 63%, white salary: 92%, pets: 84%, children: 94% population: 9655
District data: District id: 1, black: 0%, white: 12%, latino: 88%, latino salary: 88%, black salary: 58%, white salary: 88%, pets: 20%, children: 77% population: 6122
District data: District id: 3, black: 54%, white: 7%, latino: 39%, latino salary: 92%, black salary: 98%, white salary: 50%, pets: 87%, children: 39% population: 5755
District data: District id: 2, black: 29%, white: 25%, latino: 46%, latino salary: 61%, black salary: 65%, white salary: 65%, pets: 88%, children: 4% population: 8943
District data: District id: 8, black: 25

  result += pd[characteristic] * np.log2(pd[characteristic]/qd[characteristic])
  result += pd[characteristic] * np.log2(pd[characteristic]/qd[characteristic])
