In [6]:
import pandas as pd
import matplotlib.pyplot as plt
#sample data load
df = pd.read_csv('./data/sample_data_100.csv', dtype=str)

# df = pd.read_csv('./data/my_data/source1_attr.csv', dtype=str)

In [7]:
df

Unnamed: 0,measurement_concept_id,value_source_value
0,FEA29VE,neg -
1,HJ1UHH1,+
2,HJ1UHH1,+
3,U23G5CX,neg -
4,U23G5CX,pos +
...,...,...
576003,QWK9ZKD,norm -
576004,PWDTJ7F,NEGATIVE
576005,ZVYQICG,Non-Reactive(0.16)
576006,QWK9ZKD,norm -


In [3]:
count = df.groupby(['value_source_value']).size()

In [4]:
count

value_source_value
+                    16239
-                       68
0.00(0)                  5
0.00(Neg)                5
0.01(0)                  5
                     ...  
positive                 4
straw                    1
weakD                    1
°ñ¼öÆÇµ¶°á°ú ÂüÁ¶       51
º°ÁöÂüÁ¶                17
Length: 2337, dtype: int64

In [8]:
# 1) Data preparation
from collections import defaultdict
import re

def _general_preprocessing(df):
    
    # window user_encoding
    df['value_source_value'] = df['value_source_value'].str.encode('utf8', errors = 'strict').str.decode('utf8', errors = 'strict')
    
    # lower, strip space, \d\+ restore
    df['value_source_value'] = df['value_source_value'].str.lower()
    df['value_source_value'] = df['value_source_value'].str.strip()
    df['value_source_value'] = df['value_source_value'].str.replace(r'(\d)(\+)', lambda x: x.groups()[1]*int(x.groups()[0]))
    df['value_source_value'] = df['value_source_value'].str.replace(r'(\d+)(\s+)(\++)', lambda x: x.groups()[2])
    df['value_source_value'] = df['value_source_value'].str.replace(r'(\++)(\d+)(\s+)', lambda x: x.groups()[0])
    
    #replace + to Rh+ only for 'HJ1UHH1'(Rh type laboratory code) 
    _rh_pos_df = df[df['measurement_concept_id'] == 'BL7002']['value_source_value'].str.replace('+','rh+')
    _rh_neg_df = df[df['measurement_concept_id'] == 'BL7002']['value_source_value'].str.replace('-','rh-')
    df.loc[df.measurement_concept_id == 'HJ1UHH1', 'value_source_value'] = _rh_pos_df 
    df.loc[df.measurement_concept_id == 'HJ1UHH1', 'value_source_value'] = _rh_neg_df
    return df

df_with_freq = df.groupby(df.columns.tolist()).size().reset_index(name = 'Freq')
df_with_freq_sort = df_with_freq.sort_values(by = 'Freq',ascending = False)
df_cleaned = _general_preprocessing(df_with_freq_sort)
df_cleaned_regrouped = df_cleaned.groupby(['measurement_concept_id','value_source_value'])
df_cleaned_sort = df_cleaned_regrouped.sum().reset_index().sort_values(by = 'Freq', ascending = False)

dist_dicts = defaultdict(dict)

for cid, value, freq in df_cleaned_sort.values:
    dist_dicts[cid][value] = freq
    
dist_dicts = dict(dist_dicts)
for table_cid, dist_dict in dist_dicts.items():
    print(table_cid, dist_dict)


U23G5CX {'neg  -': 41611, 'pos  +': 1316, 'neg -': 16, 'neg   -': 7, 'pos   +': 5, 'posi +': 1, 'pos +': 1}
XK14TFL {'clear': 41435, 'cloudy': 1530, 'hazy': 1}
ECBQ7XK {'neg  -': 41216, '+': 1328, '++': 208, '+++': 178, 'neg -': 16, 'neg   -': 7}
0B2KNQM {'neg  -': 38872, '5    tr': 1779, '+': 820, '++': 581, '+++': 442, '5  tr': 436, 'neg -': 15, 'neg   -': 7}
QWK9ZKD {'norm -': 35351, 'norm  -': 5272, '1    tr': 876, '1  tr': 737, '+': 505, '++': 160, '+++': 49, 'neg  -': 1}
7HTHG0I {'neg  -': 34862, '25   tr': 3184, '+': 1938, '++': 1494, '+++': 953, '25  tr': 503, 'neg -': 11, 'neg   -': 4, '25 tr': 1}
A4K5XQW {'neg  -': 34664, '+': 4106, '++': 2516, '+++': 1643, 'neg -': 14, 'neg   -': 5}
VJNLF45 {'norm -': 29865, 'norm  -': 5645, 'neg  -': 3378, '+': 981, '50   tr': 974, '++': 838, '++++': 627, '+++': 493, '50  tr': 150}
YWIURH9 {'straw': 29834, 'yellow': 7293, 'amber': 5105, 'brown': 426, 'reddish': 126, 'orange': 119, 'red': 42, 'other': 9, 'green': 1, 'colorless': 1}
FEA29VE {

  df['value_source_value'] = df['value_source_value'].str.replace(r'(\d)(\+)', lambda x: x.groups()[1]*int(x.groups()[0]))
  df['value_source_value'] = df['value_source_value'].str.replace(r'(\d+)(\s+)(\++)', lambda x: x.groups()[2])
  df['value_source_value'] = df['value_source_value'].str.replace(r'(\++)(\d+)(\s+)', lambda x: x.groups()[0])
  _rh_pos_df = df[df['measurement_concept_id'] == 'BL7002']['value_source_value'].str.replace('+','rh+')


In [24]:
# 2) Laboratory tests Categorization
import re
import collections, re


def get_majors_and_minors(dist_dict, acc_thr=99.5, diff_thr=100, n_thr=9):

    # print(len(dist_dict))
    print("the dictionary is")
    print(dist_dict)
    print("__---------------")

    if len(dist_dict) < 5:
        return dist_dict

    total = sum(dist_dict.values())

    # print(total)
    
    tmp = {}
    sorted_dist_dict = sorted(dist_dict.items(), key=lambda t: -t[1])
    # print("sorted list")
    # print(sorted_dist_dict)
    # print("-----------------")
    # print("-----------------")
    
    for idx, (k, v) in enumerate(sorted_dist_dict):
        print("-----------------")
        print("the idx is")
        print(idx)
        print("the k is")
        print(k)
        print("the v is")
        print(v)
        print("----------------")

        
        if idx==0:
            prev = v
            prev_acc = 0

        acc_ratio = (prev_acc + v) / total * 100
        print("the ratio for "+" "+ k +" in perc: "+str(acc_ratio))
        print("-----------------")
        print("-----------------")

        #Escape Rules
        if (acc_ratio >= acc_thr):
            tmp.update({k:v})
            print("the updated temp after escape: ")
            print(tmp)
            print("--------------")

            break

        if (idx>n_thr):
            break

        tmp.update({k:v})
        # print("the updated: ")
        # print(tmp)
        # print("--------------")
        prev = v
        prev_acc += v


    return tmp


def get_groups(majors,
    rules_A = ['straw','clear','cloudy','yellow','amber','brown','orange'],
    rules_B = ['++', '+++', '++++'],
    rules_C = ['po.*', 'ne.*'],
    rules_D = ['ab','rh+'],
    rules_E = ['rea.*', 'non.*']):

    group_A = [] 
    group_B = [] 
    group_C = [] 
    group_D = []  
    group_E = []  

    message_form = '\n'.join(['Rules_%s: {}' % c for c in 'ABCDE'])
    print(message_form.format(rules_A, rules_B, rules_C,rules_D, rules_E))
    
    def is_group_A(major_dict):
        return sum([1 for k in major_dict.keys() if k in rules_A])
    def is_group_B(major_dict):
        return sum([1 for k in major_dict.keys() if k in rules_B])
    def is_group_C(major_dict):
        return sum([1 for s in major_dict.keys() for r in rules_C if re.findall(r, s)])
    def is_group_D(major_dict):
        return sum([1 for k in major_dict.keys() if k in rules_D])
    def is_group_E(major_dict):
        return sum([1 for s in major_dict.keys() for r in rules_E if re.findall(r,s)])

    for table_cid, major_dict in majors.items():
        
        if is_group_A(major_dict): 
            group_A.append(table_cid)
            
        elif is_group_B(major_dict):
            group_B.append(table_cid)
            
        elif is_group_C(major_dict):
            group_C.append(table_cid)
            
        elif is_group_D(major_dict): 
            group_D.append(table_cid)
            
        elif is_group_E(major_dict):
            group_E.append(table_cid)
            

    merge_group_set = set(group_A + group_B + group_C + group_D + group_E)
    group_others = [table_cid for table_cid in majors.keys() if table_cid not in merge_group_set]
    return [group_A, group_B, group_C, group_D, group_E, group_others]


def get_group_dict(group, regex=lambda x:x):
    
    group_dict = defaultdict(int)
               
    for table_cid in group:
        for k, v in dist_dicts[table_cid].items():
            group_dict[regex(re.sub(r'\s+', ' ', k))] += v
    
    return group_dict


majors = {table_cid:get_majors_and_minors(dist_dict, acc_thr=99.5) 
          for table_cid, dist_dict in dist_dicts.items()}
print(majors)
# print("The majors is: ")
# print("--------------------------")
# print(majors)


# print("The group is: ")
# print("--------------------------")


groups = get_groups(majors)
# print(groups)


groups_dict = [get_group_dict(group) for group in groups] 
groups_sample = [group_dict.keys() for group_dict in groups_dict]
               

the dictionary is
{'neg  -': 41611, 'pos  +': 1316, 'neg -': 16, 'neg   -': 7, 'pos   +': 5, 'posi +': 1, 'pos +': 1}
__---------------
-----------------
the idx is
0
the k is
neg  -
the v is
41611
----------------
the ratio for  neg  - in perc: 96.8666340759364
-----------------
-----------------
-----------------
the idx is
1
the k is
pos  +
the v is
1316
----------------
the ratio for  pos  + in perc: 99.9301627208604
-----------------
-----------------
the updated temp after escape: 
{'neg  -': 41611, 'pos  +': 1316}
--------------
the dictionary is
{'clear': 41435, 'cloudy': 1530, 'hazy': 1}
__---------------
the dictionary is
{'neg  -': 41216, '+': 1328, '++': 208, '+++': 178, 'neg -': 16, 'neg   -': 7}
__---------------
-----------------
the idx is
0
the k is
neg  -
the v is
41216
----------------
the ratio for  neg  - in perc: 95.95604497939608
-----------------
-----------------
-----------------
the idx is
1
the k is
+
the v is
1328
----------------
the ratio for  + in perc: 

In [29]:
from itertools import chain
import pandas as pd
from scipy import spatial


class Mapper():
    def __init__(self, samples, ref_list):
        print("sample list")
        print(samples)
        print("ref list")
        print(ref_list)

        self.samples = samples
        self.ref_list = ref_list
        
    def create_feature_matrix(self, samples1, samples2):
        def _create_feature_matrix(samples, refs):
            # print(samples, refs)
            #refs는 feature list
            result = []
            row_idx = []
            for r_idx, ng_str in enumerate(samples):
                result.append([0]*len(refs))
                for ng_char in ng_str:
                    result[r_idx][refs.index(ng_char)] += 1
            return result

        samples1 = [s.lower() for s in samples1]
        samples2 = [s.lower() for s in samples2]
        refs = list(set(chain.from_iterable(samples1+samples2)))
        print(samples1)
        print(refs)
        matrix1 = _create_feature_matrix(samples1, refs)
        print(matrix1)
        matrix2 = _create_feature_matrix(samples2, refs)

        return matrix1, matrix2
    
    def assign_to_ref(self, sample_vectors, ref_vectors, sample_list, ref_list):
        def cosine_similarity_matrix(matrix1, matrix2):

            distance = []
            for s_idx, sample in enumerate(matrix1):
                distance.append([])
                for ref in matrix2:
                    distance[s_idx].append(
                        round(1-spatial.distance.cosine(sample, ref),2)
                    )
            return distance
        
        def lp_dist_matrix(matrix1, matrix2):
            distance = []
            for s_idx, sample in enumerate(matrix1):
                distance.append([])
                for ref in matrix2:
                    distance[s_idx].append(
                        round(spatial.distance.euclidean(sample, ref),2)
                    )
            return distance
        
        def levenshtein_measure(sample_list, ref_list):
            def levenshtein(s1, s2):
            # based on Wikipedia/Levenshtein_distance
                if len(s1) < len(s2):
                    return levenshtein(s2, s1)

                if len(s2) == 0:
                    return len(s1)
    
                previous_row = range(len(s2) + 1)
                for i, c1 in enumerate(s1):
                    current_row = [i + 1]
                    for j, c2 in enumerate(s2):
                        insertions = previous_row[j + 1] + 1
                        deletions = current_row[j] + 1
                        substitutions = previous_row[j] + (c1 != c2)
                        current_row.append(min(insertions, deletions, substitutions))
                    previous_row = current_row
    
                return previous_row[-1]
    
            distance = []
            for s_idx, sample in enumerate(sample_list):
                distance.append([])
                for ref in ref_list:
                    distance[s_idx].append(levenshtein(sample, ref))
            return distance

        
        sample_list = list(sample_list)

        cos_scores = cosine_similarity_matrix(sample_vectors, ref_vectors)
        lev_scores  = levenshtein_measure(sample_list, ref_list)
        lp_dist_scores = lp_dist_matrix(sample_vectors, ref_vectors)
        
        result = []
        outlier = []
        for s_idx, sample in enumerate(sample_vectors):
            max_cos_score = max(cos_scores[s_idx])
            max_cos_idx = cos_scores[s_idx].index(max_cos_score)
            max_cos_cnt = cos_scores[s_idx].count(max_cos_score)
            min_lp_dist_score = min(lp_dist_scores[s_idx])
            min_lp_dist_idx = lp_dist_scores[s_idx].index(min_lp_dist_score)
            min_lp_dist_cnt = lp_dist_scores[s_idx].count(min_lp_dist_score)
            min_lev_score = min(lev_scores[s_idx])
            min_lev_idx = lev_scores[s_idx].index(min_lev_score)
            
            if max_cos_cnt == len(cos_scores[s_idx]):
                outlier.append(
                    [sample_list[s_idx],
                     ref_list[max_cos_idx],
                     cos_scores[s_idx]]
                )
            else:
                result.append(
                    [sample_list[s_idx],
                     ref_list[max_cos_idx],
                     cos_scores[s_idx],
                     ref_list[min_lp_dist_idx],
                     lp_dist_scores[s_idx],
                     ref_list[min_lev_idx],
                     lev_scores[s_idx],
                     'cosine' if max_cos_cnt <= 1 else 'Eucl',
                     ref_list[max_cos_idx] if max_cos_cnt <= 1 else ref_list[min_lp_dist_idx]]
                )

        return result, outlier
    
    def get_report_df(self, results):

        report_df = pd.DataFrame(
            results,
            columns=[
                'Data',
                'cos_word',
                'cos_score',
                'euc_word',
                'euc_score',
                'lev_word',
                'lev_score',
                'Measure',
                'Final'
            ]
        )
        report_df = report_df[['Data','Final']]
        return report_df
    
    def analyze(self):
        self._sample_vectors, self._ref_vectors = self.create_feature_matrix(
            self.samples, self.ref_list)
        
        self._cluster_results, self._outlier = self.assign_to_ref(
            self._sample_vectors, self._ref_vectors, self.samples, self.ref_list)
        
        self.report_df= self.get_report_df(self._cluster_results)

        
ref_list_A = ['straw', 'amber', 'brown', 'green', 'yellow', 'orange', 'black','blue', 'red', 'other', 'clear', 'cloudy', 'hazy', 'turbid', 'bloody']
ref_list_B = ['neg', 'trace', '+', '++', '+++', '++++']
ref_list_C = ['posi', 'neg-','weak-pos']
ref_list_D = ['a','b','c','ab','cisab','rh+','rh-','partiald','weakd','variantd'] 
ref_list_E = ['reac', 'non-reac','weak-reac']

groups_agents = [
    Mapper(sample, ref_list) for sample, ref_list in zip(
        groups_sample[:-1],
        [ref_list_A, ref_list_B, ref_list_C,ref_list_D,ref_list_E]
    )
]

for agent in groups_agents:
    agent.analyze()

sample list
dict_keys(['clear', 'cloudy', 'hazy', 'straw', 'yellow', 'amber', 'brown', 'reddish', 'orange', 'red', 'other', 'green', 'colorless'])
ref list
['straw', 'amber', 'brown', 'green', 'yellow', 'orange', 'black', 'blue', 'red', 'other', 'clear', 'cloudy', 'hazy', 'turbid', 'bloody']
sample list
dict_keys(['neg -', '+', '++', '+++', '5 tr', '25 tr', 'norm -', '50 tr', '++++', '10 tr', '1 tr', 'norm', 'neg'])
ref list
['neg', 'trace', '+', '++', '+++', '++++']
sample list
dict_keys(['neg -', 'pos +', 'posi +', 'negative', 'pos(11.00)', 'pos(6.6)', 'pos(6.9)', 'pos(7.0)', 'pos(7.2)', 'pos(7.1)', 'pos(6.8)', 'pos(6.0)', 'pos(6.1)', 'pos(6.2)', 'pos(6.3)', 'pos(6.4)', 'pos(6.5)', 'pos(6.7)', 'pos(7.3)', 'pos(7.4)', 'pos(7.5)', 'pos(7.6)', 'pos(7.7)', 'pos(7.8)', 'pos(7.9)', 'pos(8.0)', 'positive', 'neg(0.22)', 'neg(0.23)', 'neg(0.24)', 'neg(0.25)', 'neg(0.26)', 'neg(0.21)', 'neg(0.27)', 'neg(0.28)', 'neg(0.20)', 'neg(0.29)', 'neg(0.19)', 'neg(0.30)', 'neg(0.31)', 'neg(0.32)', 'neg(

In [14]:
# 4) Result table
groups_agents[0].report_df

Unnamed: 0,Data,Final
0,clear,clear
1,cloudy,cloudy
2,hazy,hazy
3,straw,straw
4,yellow,yellow
5,amber,amber
6,brown,brown
7,reddish,red
8,orange,orange
9,red,red


In [15]:
groups_agents[1].report_df

Unnamed: 0,Data,Final
0,neg -,neg
1,+,+
2,++,++
3,+++,+++
4,5 tr,trace
5,25 tr,trace
6,norm -,neg
7,50 tr,trace
8,++++,++++
9,10 tr,trace


In [16]:
groups_agents[2].report_df

Unnamed: 0,Data,Final
0,neg -,neg-
1,pos +,posi
2,posi +,posi
3,negative,neg-
4,pos(11.00),posi
...,...,...
2031,pos(4),posi
2032,pos(3),posi
2033,pos,posi
2034,positive (>300.0),posi


In [17]:
groups_agents[3].report_df

Unnamed: 0,Data,Final
0,a,a
1,b,b
2,ab,ab
3,bmt,b
4,a2b,ab
5,a2b3,ab
6,a1b3,ab
7,a3,a
8,b3,b


In [18]:
groups_agents[4].report_df

Unnamed: 0,Data,Final
0,non-reactive(0.01),non-reac
1,non-reactive(0.02),non-reac
2,non-reactive(0.03),non-reac
3,non-reactive(0.05),non-reac
4,non-reactive(0.04),non-reac
...,...,...
170,w-rea(1:1),weak-reac
171,react(1:1),reac
172,non-reactive(csf),non-reac
173,weakly-reactive(1:1),weak-reac
