In [356]:
import numpy as np
import json
import pandas as pd
import pymysql.cursors
from sklearn.metrics import mutual_info_score as mi
import kmedoids

In [357]:
connection = pymysql.connect("localhost", "root", "password", "test")

In [358]:
try:
    with connection.cursor() as cursor:
        sql = "SELECT * from ResearcherRelatedArea"
        cursor.execute(sql)
        result = cursor.fetchall()
        print(result)
finally:
    connection.close()

(('q1', 'a1'), ('q1', 'a2'), ('q1', 'a3'), ('q2', 'a2'), ('q3', 'a2'), ('q3', 'a4'), ('q4', 'a1'), ('q4', 'a2'), ('q5', 'a1'), ('q5', 'a3'), ('q5', 'a4'))


In [359]:
raw = pd.DataFrame(list(result),columns = ['Researcher_id','Area_id'])

In [360]:
rid = list(raw.groupby('Researcher_id').groups.keys())

In [361]:
data = list(raw.groupby('Researcher_id').apply(lambda x: list(x['Area_id'].values)))

In [362]:
data

[['a1', 'a2', 'a3'], ['a2'], ['a2', 'a4'], ['a1', 'a2'], ['a1', 'a3', 'a4']]

In [363]:
unique_area_idx = {}

In [364]:
idx = 0
for d in data:
    for area in d:
        if area not in unique_area_idx:
            unique_area_idx[area] = idx
            idx += 1

In [365]:
freq = np.zeros((len(unique_area_idx),len(unique_area_idx)))

In [366]:
unique_area_idx

{'a1': 0, 'a2': 1, 'a3': 2, 'a4': 3}

In [367]:
for d in data:
    for a1 in d:
        for a2 in d:
            freq[unique_area_idx[a1],unique_area_idx[a2]] += 1

In [368]:
freq

array([[3., 2., 2., 1.],
       [2., 4., 1., 1.],
       [2., 1., 2., 1.],
       [1., 1., 1., 2.]])

In [369]:
area_dist = 1-freq/np.diag(freq)

In [370]:
np.fill_diagonal(area_dist,0)

In [371]:
area_dist = (area_dist + area_dist.T) /2 * 100

In [372]:
def manhattan(r1,r2):
    dist = 0
    for e1 in r1:
        e1min = 100
        for e2 in r2:
            if area_dist[unique_area_idx[e1],unique_area_idx[e2]] < e1min:
                e1min = area_dist[unique_area_idx[e1],unique_area_idx[e2]]
        dist += e1min
    return dist

In [373]:
area_dist

array([[ 0.        , 41.66666667, 16.66666667, 58.33333333],
       [41.66666667,  0.        , 62.5       , 62.5       ],
       [16.66666667, 62.5       ,  0.        , 50.        ],
       [58.33333333, 62.5       , 50.        ,  0.        ]])

In [374]:
researcher_dist = np.ones((len(data),len(data)))

In [375]:
for x1,d1 in enumerate(data):
    for x2,d2 in enumerate(data):
        researcher_dist[x1,x2] = manhattan(d1,d2)

In [376]:
manhattan(data[0],data[4])

41.66666666666667

In [377]:
print(C)

{0: array([1]), 1: array([0, 2, 3, 4])}


In [378]:
researcher_dist = np.maximum(researcher_dist,researcher_dist.T)

In [379]:
researcher_dist 

array([[  0.        , 104.16666667,  91.66666667,  16.66666667,
         50.        ],
       [104.16666667,   0.        ,  62.5       ,  41.66666667,
        166.66666667],
       [ 91.66666667,  62.5       ,   0.        ,  58.33333333,
         91.66666667],
       [ 16.66666667,  41.66666667,  58.33333333,   0.        ,
         75.        ],
       [ 50.        , 166.66666667,  91.66666667,  75.        ,
          0.        ]])

In [380]:
medoids, clusters = kmedoids.kMedoids(researcher_dist, 2)
print(clusters)

{0: array([0, 1, 2, 3]), 1: array([4])}


In [387]:
output = {}

In [390]:
for c in clusters.values():
    group = []
    for d in c:
        group += [rid[d]]
    for d in c:
        output[rid[d]] = list(set(group) - set([rid[d]]))

In [391]:
output

{'q1': ['q4', 'q2', 'q3'],
 'q2': ['q4', 'q1', 'q3'],
 'q3': ['q4', 'q2', 'q1'],
 'q4': ['q3', 'q2', 'q1'],
 'q5': []}