In [1]:
import numpy as np
import pandas as pd


In [2]:
# strong scales (absolute, ratio, interval)
def d_strong(column, i, l):
    """Distance between two values of an attribute measured on a strong scale.

    The absolute difference of the two values is normalised by the range
    (max - min) of the whole column.

    Args:
        column: all values of the attribute; only its max and min are used.
        i: value of the attribute for the first object.
        l: value of the attribute for the second object.

    Returns:
        ``|i - l| / (max(column) - min(column))``, or ``0.0`` when the column
        has zero range.
    """
    value_range = np.max(column) - np.min(column)
    if value_range == 0:
        # A constant column cannot tell objects apart; dividing by the zero
        # range would otherwise produce NaN/inf and poison every sum that
        # includes this attribute.
        return 0.0
    return np.abs(i - l) / value_range


In [3]:
# order scale
def d_order_k(i, l, k):
    """Contribution of one reference value ``k`` to the order-scale distance.

    Args:
        i: value of the attribute for the first object.
        l: value of the attribute for the second object.
        k: value of the attribute for the reference object.

    Returns:
        1 when ``k`` lies strictly between ``i`` and ``l``, 0.5 when exactly
        one of ``i`` and ``l`` equals ``k``, and 0 otherwise.
    """
    strictly_between = i < k < l or i > k > l
    if strictly_between:
        return 1
    # True when exactly one of the two values coincides with k.
    exactly_one_matches = (i == k) != (l == k)
    return 0.5 if exactly_one_matches else 0


def d_order(column, i, l):
    """Distance between two values of an attribute measured on an order scale.

    Sums ``d_order_k(i, l, k)`` over every value ``k`` in ``column`` and
    divides the total by ``len(column) - 1``.

    Args:
        column: all values of the attribute.
        i: value of the attribute for the first object.
        l: value of the attribute for the second object.

    Returns:
        The normalised sum, or ``0.0`` when the column holds fewer than two
        values (the normaliser would be zero or negative).
    """
    normaliser = len(column) - 1
    if normaliser <= 0:
        # With at most one object there is nothing to compare against;
        # avoid dividing by zero.
        return 0.0
    # Builtin sum over a generator instead of a local named `sum`, which
    # shadowed the builtin inside this function.
    total = sum(d_order_k(i, l, k) for k in column)
    return total / normaliser


In [4]:
# nominal scale
def d_name_k(i, l, k):
    """Contribution of one reference value ``k`` to the nominal-scale distance.

    Args:
        i: value of the attribute for the first object.
        l: value of the attribute for the second object.
        k: value of the attribute for the reference object.

    Returns:
        1 when exactly one of ``i`` and ``l`` equals ``k``, otherwise 0.
    """
    exactly_one_matches = (i == k) != (l == k)
    return 1 if exactly_one_matches else 0


def d_name(column, i, l):
    """Distance between two values of an attribute measured on a nominal scale.

    Args:
        column: all values of the attribute.
        i: value of the attribute for the first object.
        l: value of the attribute for the second object.

    Returns:
        The sum of ``d_name_k(i, l, k)`` over every ``k`` in ``column``,
        divided by ``len(column)``.
    """
    mismatch_total = sum(d_name_k(i, l, k) for k in column)
    return mismatch_total / len(column)


In [5]:
# absolute=1, ratio=2, interval=3, order=4, nominal=5
def get_d_by_scale(scale):
    """Select the distance function that matches a measurement-scale code.

    Args:
        scale: numeric scale code; values up to 3 select the strong-scale
            distance, 4 the order-scale distance, 5 the nominal-scale distance.

    Returns:
        One of ``d_strong``, ``d_order`` or ``d_name``.

    Raises:
        ValueError: if ``scale`` matches none of the branches above.
    """
    if scale <= 3:
        return d_strong
    if scale == 4:
        return d_order
    if scale == 5:
        return d_name
    # Fail here with a clear message instead of returning None, which would
    # only surface later as "'NoneType' object is not callable".
    raise ValueError(f"unknown scale code: {scale!r}")


def distance_between_objects(scale, column, first, second):
    """Distance between two attribute values, using the metric for ``scale``.

    Args:
        scale: scale code passed to ``get_d_by_scale``.
        column: all values of the attribute.
        first: value of the attribute for the first object.
        second: value of the attribute for the second object.

    Returns:
        Whatever the selected distance function returns for
        ``(column, first, second)``.
    """
    return get_d_by_scale(scale)(column, first, second)


In [6]:
# read the table
# Every attribute name is listed next to the scale code used for it.
_columns_with_scales = [
    ("country", 5),
    ("geo_zone", 5),
    ("area", 2),
    ("population", 2),
    ("continent(desc area)", 4),
    ("continent(desc population)", 4),
    ("language", 5),
    ("religion", 5),
    ("num of vert stripe", 1),
    ("num of horiz stripes", 1),
    ("num of dif colors", 1),
    ("hasRed", 5),
    ("hasGreen", 5),
    ("hasBlue", 5),
    ("hasGold", 5),
    ("hasWhite", 5),
    ("hasBlack", 5),
    ("hasOrange", 5),
    ("predominant color", 5),
]
column_names = [name for name, _ in _columns_with_scales]
column_scale = [scale for _, scale in _columns_with_scales]
# The names are supplied explicitly, so every line of the file is read as data.
flags_table = pd.read_csv("Flags.csv", delimiter=";", names=column_names)
flags_table

Unnamed: 0,country,geo_zone,area,population,continent(desc area),continent(desc population),language,religion,num of vert stripe,num of horiz stripes,num of dif colors,hasRed,hasGreen,hasBlue,hasGold,hasWhite,hasBlack,hasOrange,predominant color
0,Afghanistan,1,648,16,1,1,10,2,0,3,5,1,1,0,1,1,1,0,green
1,Albania,1,29,3,5,3,6,6,0,0,3,1,0,0,1,0,1,0,red
2,Algeria,1,2388,20,2,2,8,2,2,0,3,1,1,0,0,1,0,0,green
3,American-Samoa,3,0,0,6,6,1,1,0,0,5,1,0,1,1,1,0,1,blue
4,Andorra,1,0,0,5,3,6,0,3,0,3,1,0,1,1,0,0,0,gold
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,Western-Samoa,3,3,0,6,6,1,1,0,0,3,1,0,1,0,1,0,0,red
190,Yugoslavia,1,256,22,5,3,6,6,0,3,4,1,0,1,1,1,0,0,red
191,Zaire,2,905,28,2,2,10,5,0,0,4,1,1,0,1,0,0,1,green
192,Zambia,2,753,6,2,2,10,5,3,0,4,1,1,0,0,0,1,1,green


In [7]:
# Pairwise distance between objects, combined over all attributes:
# the square root of the mean squared per-attribute distance.
object_count = len(flags_table)
# Running sum of squared per-attribute distances for every pair of objects.
squared_distances = np.zeros((object_count, object_count))
for attr_name, attr_scale in zip(column_names, column_scale):
    column = flags_table[attr_name].to_numpy()
    for i in range(object_count):
        # Start at i + 1 so the diagonal cell is not visited: visiting it
        # would add the same term to matrix[i][i] twice. The diagonal stays
        # at its initial zero.
        for j in range(i + 1, object_count):
            squared = distance_between_objects(attr_scale, column, column[i], column[j]) ** 2
            # The distance is symmetric, so one evaluation fills both cells.
            squared_distances[i, j] += squared
            squared_distances[j, i] += squared
# Vectorised normalisation: sqrt of each sum divided by sqrt of the attribute count.
matrix = np.sqrt(squared_distances) / np.sqrt(len(column_names))
print(matrix)

[[0.         0.4195319  0.36397822 ... 0.43925376 0.4603077  0.18934247]
 [0.4195319  0.         0.50556826 ... 0.46778386 0.48755686 0.4271681 ]
 [0.36397822 0.50556826 0.         ... 0.44634712 0.43927953 0.40393829]
 ...
 [0.43925376 0.46778386 0.44634712 ... 0.         0.35248208 0.4149467 ]
 [0.4603077  0.48755686 0.43927953 ... 0.35248208 0.         0.43714261]
 [0.18934247 0.4271681  0.40393829 ... 0.4149467  0.43714261 0.        ]]


In [8]:
# find the object whose total distance to all objects is the largest
def sum_distance(row):
    """Return the sum of all entries of ``row``.

    Args:
        row: iterable of numbers.

    Returns:
        The result of the builtin ``sum`` applied to ``row``.
    """
    total = sum(row)
    return total


# Scan the rows of the distance matrix and remember the row with the largest total.
max_elem, best_row = -1, -1
for row_index, distances in enumerate(matrix):
    row_total = sum_distance(distances)
    if row_total > max_elem:
        max_elem, best_row = row_total, row_index
# A one-row slice keeps the selected object as a DataFrame; when no row
# beat the initial value, obj keeps the -1 placeholder.
obj = flags_table[best_row:best_row + 1] if best_row >= 0 else -1
print(max_elem, obj)

103.2193711130614        country  geo_zone  area  population  continent(desc area)  \
157  Sri-Lanka         1    66          15                     1   

     continent(desc population)  language  religion  num of vert stripe  \
157                           1        10         3                   2   

     num of horiz stripes  num of dif colors  hasRed  hasGreen  hasBlue  \
157                     0                  4       0         1        0   

     hasGold  hasWhite  hasBlack  hasOrange predominant color  
157        1         0         0          1              gold  


In [9]:
# distance between two order-scale attributes
def d_il_attrs_order(i, l, x1, x2):
    """Disagreement of two attributes on the ordering of objects ``i`` and ``l``.

    Args:
        i: index of the first object.
        l: index of the second object.
        x1: values of the first attribute, indexable by object index.
        x2: values of the second attribute, indexable by object index.

    Returns:
        1 when the two attributes order the pair in opposite directions,
        0.5 when exactly one attribute has equal values for the pair,
        and 0 otherwise.
    """
    first_i, first_l = x1[i], x1[l]
    second_i, second_l = x2[i], x2[l]

    opposite_order = (first_i > first_l and second_i < second_l) or (
        first_i < first_l and second_i > second_l
    )
    if opposite_order:
        return 1

    # True when the pair is tied in exactly one of the two attributes.
    tied_in_exactly_one = (first_i == first_l) != (second_i == second_l)
    return 0.5 if tied_in_exactly_one else 0


def order_attrs_distance(column1, column2):
    """Distance between two order-scale attributes over all pairs of objects.

    Sums ``d_il_attrs_order`` over every pair of distinct object indices
    ``i < l`` and divides by the number of such pairs, ``m * (m - 1) / 2``.

    Args:
        column1: values of the first attribute, one per object.
        column2: values of the second attribute, indexed like ``column1``.

    Returns:
        The normalised sum, or ``0.0`` when there are fewer than two objects
        (no pairs exist, so the normaliser would be zero).
    """
    # len() instead of .size so plain sequences work as well as 1-D arrays.
    object_count = len(column1)
    if object_count < 2:
        return 0.0

    total = 0
    for i in range(object_count - 1):
        # Only distinct pairs: a pair (i, i) always contributes 0 and is not
        # counted in the normaliser either.
        for l in range(i + 1, object_count):
            total += d_il_attrs_order(i, l, column1, column2)

    pair_count = (object_count - 1) * object_count / 2
    return total / pair_count


# Compare the two attributes at positions 4 and 5 of column_names.
column1, column2 = (flags_table[name].to_numpy() for name in column_names[4:6])
print(order_attrs_distance(column1, column2))

0.08973879600448693
