In [5]:
import pandas as pd

#sample data load
df = pd.read_csv('./data/sample_data_100.csv', dtype=str)

In [24]:
# 1) Data preparation
from collections import defaultdict
import re

def _general_preprocessing(df):
    
    # window user_encoding
    df['value_source_value'] = df['value_source_value'].str.encode('utf8', errors = 'strict').str.decode('utf8', errors = 'strict')
    
    # lower, strip space, \d\+ restore
    df['value_source_value'] = df['value_source_value'].str.lower()
    df['value_source_value'] = df['value_source_value'].str.strip()
    df['value_source_value'] = df['value_source_value'].str.replace(r'(\d)(\+)', lambda x: x.groups()[1]*int(x.groups()[0]))
    df['value_source_value'] = df['value_source_value'].str.replace(r'(\d+)(\s+)(\++)', lambda x: x.groups()[2])
    df['value_source_value'] = df['value_source_value'].str.replace(r'(\++)(\d+)(\s+)', lambda x: x.groups()[0])
    
    #replace + to Rh+ only for 'HJ1UHH1'(Rh type laboratory code) 
    _rh_pos_df = df[df['measurement_concept_id'] == 'BL7002']['value_source_value'].str.replace('+','rh+')
    _rh_neg_df = df[df['measurement_concept_id'] == 'BL7002']['value_source_value'].str.replace('-','rh-')
    df.loc[df.measurement_concept_id == 'HJ1UHH1', 'value_source_value'] = _rh_pos_df 
    df.loc[df.measurement_concept_id == 'HJ1UHH1', 'value_source_value'] = _rh_neg_df
    return df

df_with_freq = df.groupby(df.columns.tolist()).size().reset_index(name = 'Freq')
df_with_freq_sort = df_with_freq.sort_values(by = 'Freq',ascending = False)
df_cleaned = _general_preprocessing(df_with_freq_sort)
df_cleaned_regrouped = df_cleaned.groupby(['measurement_concept_id','value_source_value'])
df_cleaned_sort = df_cleaned_regrouped.sum().reset_index().sort_values(by = 'Freq', ascending = False)

dist_dicts = defaultdict(dict)

for cid, value, freq in df_cleaned_sort.values:
    dist_dicts[cid][value] = freq
    
dist_dicts = dict(dist_dicts)

  df['value_source_value'] = df['value_source_value'].str.replace(r'(\d)(\+)', lambda x: x.groups()[1]*int(x.groups()[0]))
  df['value_source_value'] = df['value_source_value'].str.replace(r'(\d+)(\s+)(\++)', lambda x: x.groups()[2])
  df['value_source_value'] = df['value_source_value'].str.replace(r'(\++)(\d+)(\s+)', lambda x: x.groups()[0])
  _rh_pos_df = df[df['measurement_concept_id'] == 'BL7002']['value_source_value'].str.replace('+','rh+')


In [25]:
# 2) Laboratory tests Categorization
import re
import collections, re


def get_majors_and_minors(dist_dict, acc_thr=99.5, diff_thr=100, n_thr=9):

    if len(dist_dict) < 5:
        return dist_dict

    total = sum(dist_dict.values())
    
    tmp = {}
    sorted_dist_dict = sorted(dist_dict.items(), key=lambda t: -t[1])
    
    for idx, (k, v) in enumerate(sorted_dist_dict):
        
        if idx==0:
            prev = v
            prev_acc = 0

        acc_ratio = (prev_acc + v) / total * 100
        
        #Escape Rules
        if (acc_ratio >= acc_thr): 
            tmp.update({k:v})
            break
            
        if (idx>n_thr): 
            break
            
        tmp.update({k:v})
        prev = v
        prev_acc += v
        
    return tmp


def get_groups(majors,
    rules_A = ['straw','clear','cloudy','yellow','amber','brown','orange'],
    rules_B = ['++', '+++', '++++'],
    rules_C = ['po.*', 'ne.*'],
    rules_D = ['ab','rh+'],
    rules_E = ['rea.*', 'non.*']):

    group_A = [] 
    group_B = [] 
    group_C = [] 
    group_D = []  
    group_E = []  

    message_form = '\n'.join(['Rules_%s: {}' % c for c in 'ABCDE'])
    print(message_form.format(rules_A, rules_B, rules_C,rules_D, rules_E))
    
    def is_group_A(major_dict):
        return sum([1 for k in major_dict.keys() if k in rules_A])
    def is_group_B(major_dict):
        return sum([1 for k in major_dict.keys() if k in rules_B])
    def is_group_C(major_dict):
        return sum([1 for s in major_dict.keys() for r in rules_C if re.findall(r, s)])
    def is_group_D(major_dict):
        return sum([1 for k in major_dict.keys() if k in rules_D])
    def is_group_E(major_dict):
        return sum([1 for s in major_dict.keys() for r in rules_E if re.findall(r,s)])

    for table_cid, major_dict in majors.items():
        
        if is_group_A(major_dict): 
            group_A.append(table_cid)
            
        elif is_group_B(major_dict):
            group_B.append(table_cid)
            
        elif is_group_C(major_dict):
            group_C.append(table_cid)
            
        elif is_group_D(major_dict): 
            group_D.append(table_cid)
            
        elif is_group_E(major_dict):
            group_E.append(table_cid)
            

    merge_group_set = set(group_A + group_B + group_C + group_D + group_E)
    group_others = [table_cid for table_cid in majors.keys() if table_cid not in merge_group_set]
    return [group_A, group_B, group_C, group_D, group_E, group_others]


def get_group_dict(group, regex=lambda x:x):
    
    group_dict = defaultdict(int)
               
    for table_cid in group:
        for k, v in dist_dicts[table_cid].items():
            group_dict[regex(re.sub(r'\s+', ' ', k))] += v
    
    return group_dict


majors = {table_cid:get_majors_and_minors(dist_dict, acc_thr=99.5) 
          for table_cid, dist_dict in dist_dicts.items()}
groups = get_groups(majors)
groups_dict = [get_group_dict(group) for group in groups] 
groups_sample = [group_dict.keys() for group_dict in groups_dict]
               

Rules_A: ['straw', 'clear', 'cloudy', 'yellow', 'amber', 'brown', 'orange']
Rules_B: ['++', '+++', '++++']
Rules_C: ['po.*', 'ne.*']
Rules_D: ['ab', 'rh+']
Rules_E: ['rea.*', 'non.*']


In [22]:
from itertools import chain
import pandas as pd
from scipy import spatial


class Mapper():
    def __init__(self, samples, ref_list):
        self.samples = samples
        self.ref_list = ref_list
        
    def create_feature_matrix(self, samples1, samples2):
        def _create_feature_matrix(samples, refs):
            #refs는 feature list
            result = []
            row_idx = []
            for r_idx, ng_str in enumerate(samples):
                result.append([0]*len(refs))
                for ng_char in ng_str:
                    result[r_idx][refs.index(ng_char)] += 1
            return result

        samples1 = [s.lower() for s in samples1]
        samples2 = [s.lower() for s in samples2]
        refs = list(set(chain.from_iterable(samples1+samples2)))
        matrix1 = _create_feature_matrix(samples1, refs)
        matrix2 = _create_feature_matrix(samples2, refs)
        return matrix1, matrix2
    
    def assign_to_ref(self, sample_vectors, ref_vectors, sample_list, ref_list):
        def cosine_similarity_matrix(matrix1, matrix2):

            distance = []
            for s_idx, sample in enumerate(matrix1):
                distance.append([])
                for ref in matrix2:
                    distance[s_idx].append(
                        round(1-spatial.distance.cosine(sample, ref),2)
                    )
            return distance
        
        def lp_dist_matrix(matrix1, matrix2):
            distance = []
            for s_idx, sample in enumerate(matrix1):
                distance.append([])
                for ref in matrix2:
                    distance[s_idx].append(
                        round(spatial.distance.euclidean(sample, ref),2)
                    )
            return distance
        
        def levenshtein_measure(sample_list, ref_list):
            def levenshtein(s1, s2):
            # based on Wikipedia/Levenshtein_distance
                if len(s1) < len(s2):
                    return levenshtein(s2, s1)

                if len(s2) == 0:
                    return len(s1)
    
                previous_row = range(len(s2) + 1)
                for i, c1 in enumerate(s1):
                    current_row = [i + 1]
                    for j, c2 in enumerate(s2):
                        insertions = previous_row[j + 1] + 1
                        deletions = current_row[j] + 1
                        substitutions = previous_row[j] + (c1 != c2)
                        current_row.append(min(insertions, deletions, substitutions))
                    previous_row = current_row
    
                return previous_row[-1]
    
            distance = []
            for s_idx, sample in enumerate(sample_list):
                distance.append([])
                for ref in ref_list:
                    distance[s_idx].append(  )
            return distance

        sample_list = list(sample_list)

        cos_scores = cosine_similarity_matrix(sample_vectors, ref_vectors)
        lev_scores  = levenshtein_measure(sample_list, ref_list)
        lp_dist_scores = lp_dist_matrix(sample_vectors, ref_vectors)
        
        
        result = []
        outlier = []
        for s_idx, sample in enumerate(sample_vectors):
            max_cos_score = max(cos_scores[s_idx])
            max_cos_idx = cos_scores[s_idx].index(max_cos_score)
            max_cos_cnt = cos_scores[s_idx].count(max_cos_score)
            min_lp_dist_score = min(lp_dist_scores[s_idx])
            min_lp_dist_idx = lp_dist_scores[s_idx].index(min_lp_dist_score)
            min_lp_dist_cnt = lp_dist_scores[s_idx].count(min_lp_dist_score)
            min_lev_score = min(lev_scores[s_idx])
            min_lev_idx = lev_scores[s_idx].index(min_lev_score)
            
            if max_cos_cnt == len(cos_scores[s_idx]):
                outlier.append(
                    [sample_list[s_idx],
                     ref_list[max_cos_idx],
                     cos_scores[s_idx]]
                )
            else:
                result.append(
                    [sample_list[s_idx],
                     ref_list[max_cos_idx],
                     cos_scores[s_idx],
                     ref_list[min_lp_dist_idx],
                     lp_dist_scores[s_idx],
                     ref_list[min_lev_idx],
                     lev_scores[s_idx],
                     'cosine' if max_cos_cnt <= 1 else 'Eucl',
                     ref_list[max_cos_idx] if max_cos_cnt <= 1 else ref_list[min_lp_dist_idx]]
                )
        result_df = pd.DataFrame(result, columns=['smp','ref','cos','ref_lp','lp','ref_lev','lv','ecul','refe'])
        result_df.to_csv("result_df.csv")
        return result, outlier
    
    def get_report_df(self, results):

        report_df = pd.DataFrame(
            results,
            columns=[
                'Data',
                'cos_word',
                'cos_score',
                'euc_word',
                'euc_score',
                'lev_word',
                'lev_score',
                'Measure',
                'Final'
            ]
        )
        report_df.to_csv("r.csv")
        report_df = report_df[['Data','Final']]
        
        return report_df
    
    def analyze(self):
        self._sample_vectors, self._ref_vectors = self.create_feature_matrix(
            self.samples, self.ref_list)
        
        self._cluster_results, self._outlier = self.assign_to_ref(
            self._sample_vectors, self._ref_vectors, self.samples, self.ref_list)
        
        self.report_df= self.get_report_df(self._cluster_results)

        
ref_list_A = ['straw', 'amber', 'brown', 'green', 'yellow', 'orange', 'black','blue', 'red', 'other', 'clear', 'cloudy', 'hazy', 'turbid', 'bloody']
ref_list_B = ['neg', 'trace', '+', '++', '+++', '++++']
ref_list_C = ['posi', 'neg-','weak-pos']
ref_list_D = ['a','b','c','ab','cisab','rh+','rh-','partiald','weakd','variantd'] 
ref_list_E = ['reac', 'non-reac','weak-reac']

groups_agents = [
    Mapper(sample, ref_list) for sample, ref_list in zip(
        groups_sample[:-1],
        [ref_list_A, ref_list_B, ref_list_C,ref_list_D,ref_list_E]
    )
]

for agent in groups_agents:
    agent.analyze()

In [13]:
from itertools import chain
import pandas as pd
from scipy import spatial


class Mapper():
    def __init__(self, samples, ref_list):
        self.samples = samples
        self.ref_list = ref_list
        
    def create_feature_matrix(self, samples1, samples2):
        def _create_feature_matrix(samples, refs):
            #refs는 feature list
            result = []
            row_idx = []
            for r_idx, ng_str in enumerate(samples):
                result.append([0]*len(refs))
                for ng_char in ng_str:
                    result[r_idx][refs.index(ng_char)] += 1
            return result

        samples1 = [s.lower() for s in samples1]
        samples2 = [s.lower() for s in samples2]
        refs = list(set(chain.from_iterable(samples1+samples2)))
        matrix1 = _create_feature_matrix(samples1, refs)
        matrix2 = _create_feature_matrix(samples2, refs)
        return matrix1, matrix2
    
    def assign_to_ref(self, sample_vectors, ref_vectors, sample_list, ref_list):
#         def cosine_similarity_matrix(matrix1, matrix2):

#             distance = []
#             for s_idx, sample in enumerate(matrix1):
#                 distance.append([])
#                 for ref in matrix2:
#                     distance[s_idx].append(
#                         round(1-spatial.distance.cosine(sample, ref),2)
#                     )
#             return distance
        
        def lp_dist_matrix(matrix1, matrix2):
            distance = []
            for s_idx, sample in enumerate(matrix1):
                distance.append([])
                for ref in matrix2:
                    distance[s_idx].append(
                        round(spatial.distance.euclidean(sample, ref),2)
                    )
            return distance
        
        def levenshtein_measure(sample_list, ref_list):
            def levenshtein(s1, s2):
            # based on Wikipedia/Levenshtein_distance
                if len(s1) < len(s2):
                    return levenshtein(s2, s1)

                if len(s2) == 0:
                    return len(s1)
    
                previous_row = range(len(s2) + 1)
                for i, c1 in enumerate(s1):
                    current_row = [i + 1]
                    for j, c2 in enumerate(s2):
                        insertions = previous_row[j + 1] + 1
                        deletions = current_row[j] + 1
                        substitutions = previous_row[j] + (c1 != c2)
                        current_row.append(min(insertions, deletions, substitutions))
                    previous_row = current_row
    
                return previous_row[-1]
    
            distance = []
            for s_idx, sample in enumerate(sample_list):
                distance.append([])
                for ref in ref_list:
                    distance[s_idx].append(levenshtein(sample, ref))
            return distance
        # Python3 implementation of above approach
        from math import floor
        def jaro_winkler_measures(sample_list, ref_list):
            # Function to calculate the
            # Jaro Similarity of two strings
            def jaro_distance(s1, s2) :

                # If the strings are equal
                if (s1 == s2) :
                    return 1.0;

                # Length of two strings
                len1 = len(s1);
                len2 = len(s2);

                if (len1 == 0 or len2 == 0) :
                    return 0.0;

                # Maximum distance upto which matching
                # is allowed
                max_dist = (max(len(s1), len(s2)) // 2 ) - 1;

                # Count of matches
                match = 0;

                # Hash for matches
                hash_s1 = [0] * len(s1) ;
                hash_s2 = [0] * len(s2) ;

                # Traverse through the first string
                for i in range(len1) :

                    # Check if there is any matches
                    for j in range( max(0, i - max_dist),
                                min(len2, i + max_dist + 1)) :

                        # If there is a match
                        if (s1[i] == s2[j] and hash_s2[j] == 0) :
                            hash_s1[i] = 1;
                            hash_s2[j] = 1;
                            match += 1;
                            break;

                # If there is no match
                if (match == 0) :
                    return 0.0;

                # Number of transpositions
                t = 0;

                point = 0;

                # Count number of occurrences
                # where two characters match but
                # there is a third matched character
                # in between the indices
                for i in range(len1) :
                    if (hash_s1[i]) :

                        # Find the next matched character
                        # in second string
                        while (hash_s2[point] == 0) :
                            point += 1;

                        if (s1[i] != s2[point]) :
                            point += 1;
                            t += 1;
                        else :
                            point += 1;

                    t /= 2;

                # Return the Jaro Similarity
                return ((match / len1 + match / len2 +
                        (match - t) / match ) / 3.0);

            # Jaro Winkler Similarity
            def jaro_Winkler(s1, s2) :

                jaro_dist = jaro_distance(s1, s2);

                # If the jaro Similarity is above a threshold
                if (jaro_dist > 0.7) :

                    # Find the length of common prefix
                    prefix = 0;

                    for i in range(min(len(s1), len(s2))) :

                        # If the characters match
                        if (s1[i] == s2[i]) :
                            prefix += 1;

                        # Else break
                        else :
                            break;

                    # Maximum of 4 characters are allowed in prefix
                    prefix = min(4, prefix);

                    # Calculate jaro winkler Similarity
                    jaro_dist += 0.1 * prefix * (1 - jaro_dist);

                return jaro_dist;
            
            distance = []
            for s_idx, sample in enumerate(sample_list):
                distance.append([])
                for ref in ref_list:
                    distance[s_idx].append(jaro_Winkler(sample, ref))
            return distance



        
        sample_list = list(sample_list)

        # cos_scores = cosine_similarity_matrix(sample_vectors, ref_vectors)
        lev_scores  = levenshtein_measure(sample_list, ref_list)
        lp_dist_scores = lp_dist_matrix(sample_vectors, ref_vectors)
        jw_scores  = jaro_winkler_measures(sample_list, ref_list)
        
        result = []
        outlier = []
        for s_idx, sample in enumerate(sample_vectors):
            # max_cos_score = max(cos_scores[s_idx])
            # max_cos_idx = cos_scores[s_idx].index(max_cos_score)
            # max_cos_cnt = cos_scores[s_idx].count(max_cos_score)
            max_jw_score = max(jw_scores[s_idx])
            max_jw_idx = jw_scores[s_idx].index(max_jw_score)
            max_jw_cnt = jw_scores[s_idx].count(max_jw_score )
            min_lp_dist_score = min(lp_dist_scores[s_idx])
            min_lp_dist_idx = lp_dist_scores[s_idx].index(min_lp_dist_score)
            min_lp_dist_cnt = lp_dist_scores[s_idx].count(min_lp_dist_score)
            min_lev_score = min(lev_scores[s_idx])
            min_lev_idx = lev_scores[s_idx].index(min_lev_score)
            
            if max_cos_cnt == len(cos_scores[s_idx]):
                outlier.append(
                    [sample_list[s_idx],
                     ref_list[max_cos_idx],
                     cos_scores[s_idx]]
                )
            else:
                result.append(
                    [sample_list[s_idx],
                     ref_list[max_cos_idx],
                     cos_scores[s_idx],
                     ref_list[min_lp_dist_idx],
                     lp_dist_scores[s_idx],
                     ref_list[min_lev_idx],
                     lev_scores[s_idx],
                     'cosine' if max_cos_cnt <= 1 else 'Eucl',
                     ref_list[max_cos_idx] if max_cos_cnt <= 1 else ref_list[min_lp_dist_idx]]
                )

        return result, outlier
    
    def get_report_df(self, results):

        report_df = pd.DataFrame(
            results,
            columns=[
                'Data',
                'cos_word',
                'cos_score',
                'euc_word',
                'euc_score',
                'lev_word',
                'lev_score',
                'Measure',
                'Final'
            ]
        )
        report_df.to_csv("r.csv")
        report_df = report_df[['Data','Final']]
        
        return report_df
    
    def analyze(self):
        self._sample_vectors, self._ref_vectors = self.create_feature_matrix(
            self.samples, self.ref_list)
        
        self._cluster_results, self._outlier = self.assign_to_ref(
            self._sample_vectors, self._ref_vectors, self.samples, self.ref_list)
        
        self.report_df= self.get_report_df(self._cluster_results)

        
ref_list_A = ['straw', 'amber', 'brown', 'green', 'yellow', 'orange', 'black','blue', 'red', 'other', 'clear', 'cloudy', 'hazy', 'turbid', 'bloody']
ref_list_B = ['neg', 'trace', '+', '++', '+++', '++++']
ref_list_C = ['posi', 'neg-','weak-pos']
ref_list_D = ['a','b','c','ab','cisab','rh+','rh-','partiald','weakd','variantd'] 
ref_list_E = ['reac', 'non-reac','weak-reac']

groups_agents = [
    Mapper(sample, ref_list) for sample, ref_list in zip(
        groups_sample[:-1],
        [ref_list_A, ref_list_B, ref_list_C,ref_list_D,ref_list_E]
    )
]

for agent in groups_agents:
    agent.analyze()

In [17]:
from itertools import chain
import pandas as pd
from scipy import spatial


class Mapper():
    def __init__(self, samples, ref_list):
        self.samples = samples
        self.ref_list = ref_list
        
    def create_feature_matrix(self, samples1, samples2):
        def _create_feature_matrix(samples, refs):
            #refs는 feature list
            result = []
            row_idx = []
            for r_idx, ng_str in enumerate(samples):
                result.append([0]*len(refs))
                for ng_char in ng_str:
                    result[r_idx][refs.index(ng_char)] += 1
            return result

        samples1 = [s.lower() for s in samples1]
        samples2 = [s.lower() for s in samples2]
        refs = list(set(chain.from_iterable(samples1+samples2)))
        matrix1 = _create_feature_matrix(samples1, refs)
        matrix2 = _create_feature_matrix(samples2, refs)
        return matrix1, matrix2
    
    def assign_to_ref(self, sample_vectors, ref_vectors, sample_list, ref_list):
        def cosine_similarity_matrix(matrix1, matrix2):

            distance = []
            for s_idx, sample in enumerate(matrix1):
                distance.append([])
                for ref in matrix2:
                    distance[s_idx].append(
                        round(1-spatial.distance.cosine(sample, ref),2)
                    )
            return distance
        from math import floor
        def jaro_winkler_measures(sample_list, ref_list):
            # Function to calculate the
            # Jaro Similarity of two strings
            def jaro_distance(s1, s2) :

                # If the strings are equal
                if (s1 == s2) :
                    return 1.0;

                # Length of two strings
                len1 = len(s1);
                len2 = len(s2);

                if (len1 == 0 or len2 == 0) :
                    return 0.0;

                # Maximum distance upto which matching
                # is allowed
                max_dist = (max(len(s1), len(s2)) // 2 ) - 1;

                # Count of matches
                match = 0;

                # Hash for matches
                hash_s1 = [0] * len(s1) ;
                hash_s2 = [0] * len(s2) ;

                # Traverse through the first string
                for i in range(len1) :

                    # Check if there is any matches
                    for j in range( max(0, i - max_dist),
                                min(len2, i + max_dist + 1)) :

                        # If there is a match
                        if (s1[i] == s2[j] and hash_s2[j] == 0) :
                            hash_s1[i] = 1;
                            hash_s2[j] = 1;
                            match += 1;
                            break;

                # If there is no match
                if (match == 0) :
                    return 0.0;

                # Number of transpositions
                t = 0;

                point = 0;

                # Count number of occurrences
                # where two characters match but
                # there is a third matched character
                # in between the indices
                for i in range(len1) :
                    if (hash_s1[i]) :

                        # Find the next matched character
                        # in second string
                        while (hash_s2[point] == 0) :
                            point += 1;

                        if (s1[i] != s2[point]) :
                            point += 1;
                            t += 1;
                        else :
                            point += 1;

                    t /= 2;

                # Return the Jaro Similarity
                return ((match / len1 + match / len2 +
                        (match - t) / match ) / 3.0);

            # Jaro Winkler Similarity
            def jaro_Winkler(s1, s2) :

                jaro_dist = jaro_distance(s1, s2);

                # If the jaro Similarity is above a threshold
                if (jaro_dist > 0.7) :

                    # Find the length of common prefix
                    prefix = 0;

                    for i in range(min(len(s1), len(s2))) :

                        # If the characters match
                        if (s1[i] == s2[i]) :
                            prefix += 1;

                        # Else break
                        else :
                            break;

                    # Maximum of 4 characters are allowed in prefix
                    prefix = min(4, prefix);

                    # Calculate jaro winkler Similarity
                    jaro_dist += 0.1 * prefix * (1 - jaro_dist);

                return jaro_dist;
            
            distance = []
            for s_idx, sample in enumerate(sample_list):
                distance.append([])
                for ref in ref_list:
                    distance[s_idx].append(jaro_Winkler(sample, ref))
            return distance
        
        def lp_dist_matrix(matrix1, matrix2):
            distance = []
            for s_idx, sample in enumerate(matrix1):
                distance.append([])
                for ref in matrix2:
                    distance[s_idx].append(
                        round(spatial.distance.euclidean(sample, ref),2)
                    )
            return distance
        
        def levenshtein_measure(sample_list, ref_list):
            def levenshtein(s1, s2):
            # based on Wikipedia/Levenshtein_distance
                if len(s1) < len(s2):
                    return levenshtein(s2, s1)

                if len(s2) == 0:
                    return len(s1)
    
                previous_row = range(len(s2) + 1)
                for i, c1 in enumerate(s1):
                    current_row = [i + 1]
                    for j, c2 in enumerate(s2):
                        insertions = previous_row[j + 1] + 1
                        deletions = current_row[j] + 1
                        substitutions = previous_row[j] + (c1 != c2)
                        current_row.append(min(insertions, deletions, substitutions))
                    previous_row = current_row
    
                return previous_row[-1]
    
            distance = []
            for s_idx, sample in enumerate(sample_list):
                distance.append([])
                for ref in ref_list:
                    distance[s_idx].append(levenshtein(sample, ref))
            return distance

        
        sample_list = list(sample_list)

        cos_scores = cosine_similarity_matrix(sample_vectors, ref_vectors)
        jw_scores = jaro_winkler_measures(sample_vectors, ref_vectors)
        lev_scores  = levenshtein_measure(sample_list, ref_list)
        lp_dist_scores = lp_dist_matrix(sample_vectors, ref_vectors)
        
        result = []
        outlier = []
        for s_idx, sample in enumerate(sample_vectors):
            max_cos_score = max(cos_scores[s_idx])
            max_cos_idx = cos_scores[s_idx].index(max_cos_score)
            max_cos_cnt = cos_scores[s_idx].count(max_cos_score)
            max_jw_score = max(jw_scores[s_idx])
            max_jw_idx = jw_scores[s_idx].index(max_jw_score)
            max_jw_cnt = jw_scores[s_idx].count(max_jw_score)
            min_lp_dist_score = min(lp_dist_scores[s_idx])
            min_lp_dist_idx = lp_dist_scores[s_idx].index(min_lp_dist_score)
            min_lp_dist_cnt = lp_dist_scores[s_idx].count(min_lp_dist_score)
            min_lev_score = min(lev_scores[s_idx])
            min_lev_idx = lev_scores[s_idx].index(min_lev_score)
            
            if max_jw_cnt == len(jw_scores[s_idx]):
                outlier.append(
                    [sample_list[s_idx],
                     ref_list[max_cos_idx],
                     jw_scores[s_idx]]
                )
            else:
                result.append(
                    [sample_list[s_idx],
                     ref_list[max_cos_idx],
                     jw_scores[s_idx],
                     ref_list[min_lp_dist_idx],
                     lp_dist_scores[s_idx],
                     ref_list[min_lev_idx],
                     lev_scores[s_idx],
                     'jw' if max_jw_cnt <= 1 else 'Eucl',
                     ref_list[max_jw_idx] if max_jw_cnt <= 1 else ref_list[min_lp_dist_idx]]
                )

        return result, outlier
    
    def get_report_df(self, results):

        report_df = pd.DataFrame(
            results,
            columns=[
                'Data',
                'jw_word',
                'jw_score',
                'euc_word',
                'euc_score',
                'lev_word',
                'lev_score',
                'Measure',
                'Final'
            ]
        )
        report_df = report_df[['Data','jw_word','jw_score','euc_word','euc_score','lev_word','lev_score','Measure','Final']]
        return report_df
    
    def analyze(self):
        self._sample_vectors, self._ref_vectors = self.create_feature_matrix(
            self.samples, self.ref_list)
        
        self._cluster_results, self._outlier = self.assign_to_ref(
            self._sample_vectors, self._ref_vectors, self.samples, self.ref_list)
        
        self.report_df= self.get_report_df(self._cluster_results)

        
ref_list_A = ['straw', 'amber', 'brown', 'green', 'yellow', 'orange', 'black','blue', 'red', 'other', 'clear', 'cloudy', 'hazy', 'turbid', 'bloody']
ref_list_B = ['neg', 'trace', '+', '++', '+++', '++++']
ref_list_C = ['posi', 'neg-','weak-pos']
ref_list_D = ['a','b','c','ab','cisab','rh+','rh-','partiald','weakd','variantd'] 
ref_list_E = ['reac', 'non-reac','weak-reac']

groups_agents = [
    Mapper(sample, ref_list) for sample, ref_list in zip(
        groups_sample[:-1],
        [ref_list_A, ref_list_B, ref_list_C,ref_list_D,ref_list_E]
    )
]

for agent in groups_agents:
    agent.analyze()

In [16]:
# 4) Result table
groups_agents[0].report_df

Unnamed: 0,Data,jw_word,jw_score,euc_word,euc_score,lev_word,lev_score,Measure,Final
0,clear,clear,"[0.9957737255096435, 0.9983007303873699, 0.993...",clear,"[2.45, 2.0, 2.83, 2.45, 2.65, 2.24, 2.0, 2.24,...",clear,"[4, 4, 5, 4, 5, 6, 4, 4, 4, 4, 0, 4, 5, 6, 5]",jw,clear
1,cloudy,cloudy,"[0.965404487074467, 0.9533089855261015, 0.9696...",cloudy,"[3.32, 3.32, 3.0, 3.61, 2.45, 3.16, 2.65, 2.45...",cloudy,"[6, 6, 5, 6, 6, 6, 5, 4, 5, 6, 4, 0, 5, 6, 2]",jw,cloudy
2,hazy,hazy,"[0.969675800925807, 0.9784476189864308, 0.9663...",hazy,"[2.65, 2.65, 3.0, 3.32, 3.16, 2.83, 2.65, 2.83...",hazy,"[5, 5, 5, 5, 6, 5, 4, 4, 4, 5, 5, 5, 0, 6, 5]",jw,hazy
3,straw,straw,"[1.0, 0.9938768100738525, 0.997392733891805, 0...",straw,"[0.0, 2.45, 2.45, 3.16, 3.32, 2.65, 2.83, 3.0,...",straw,"[0, 5, 4, 5, 5, 5, 5, 5, 4, 4, 4, 6, 5, 5, 6]",jw,straw
4,yellow,yellow,"[0.9619675786871659, 0.9662853140580027, 0.969...",yellow,"[3.32, 3.32, 3.0, 3.32, 0.0, 3.16, 3.0, 2.45, ...",yellow,"[5, 6, 5, 6, 0, 6, 5, 5, 5, 6, 5, 6, 6, 6, 5]",jw,yellow
5,amber,amber,"[0.9938768100738525, 1.0, 0.9957975864410401, ...",amber,"[2.45, 0.0, 2.45, 2.45, 3.32, 2.24, 2.45, 2.24...",amber,"[5, 0, 5, 4, 6, 5, 5, 4, 4, 3, 4, 6, 5, 5, 6]",jw,amber
6,brown,brown,"[0.997392733891805, 0.9957975864410401, 1.0, 0...",brown,"[2.45, 2.45, 0.0, 2.83, 3.0, 2.24, 2.83, 2.65,...",brown,"[4, 5, 0, 3, 5, 5, 4, 4, 4, 5, 5, 5, 5, 5, 4]",jw,brown
7,reddish,red,"[0.9735094492059004, 0.9610682778609426, 0.965...",red,"[3.16, 3.16, 3.46, 3.16, 3.87, 3.32, 3.74, 3.3...",red,"[7, 7, 7, 6, 6, 7, 7, 7, 4, 7, 7, 7, 7, 6, 7]",jw,straw
8,orange,orange,"[0.9661128830491451, 0.9556621183428847, 0.975...",orange,"[2.65, 2.24, 2.24, 1.73, 3.16, 0.0, 3.0, 2.83,...",orange,"[5, 5, 5, 5, 6, 0, 5, 5, 5, 5, 6, 6, 5, 6, 6]",jw,orange
9,red,red,"[0.9358082516988118, 0.9571988254123264, 0.928...",red,"[2.45, 2.0, 2.45, 2.0, 3.0, 2.24, 2.83, 2.24, ...",red,"[4, 4, 4, 3, 5, 5, 5, 4, 0, 4, 4, 5, 4, 4, 5]",jw,red


In [18]:
groups_agents[1].report_df

Unnamed: 0,Data,jw_word,jw_score,euc_word,euc_score,lev_word,lev_score,Measure,Final
0,neg -,neg,"[0.9463169642857142, 0.9880056381225586, 0.843...",neg,"[1.41, 2.83, 2.45, 3.0, 3.74, 4.58]",neg,"[2, 5, 5, 5, 5, 5]",jw,trace
1,+,+,"[0.9221540178571429, 0.8298543294270834, 1.0, ...",+,"[2.0, 2.45, 0.0, 1.0, 2.0, 3.0]",+,"[3, 5, 0, 1, 2, 3]",jw,+
2,++,+,"[0.8875, 0.7916666666666666, 0.975, 1.0, 0.975...",++,"[2.65, 3.0, 1.0, 0.0, 1.0, 2.0]",++,"[3, 5, 1, 0, 1, 2]",jw,++
3,+++,+,"[0.8875, 0.7916666666666666, 0.975, 0.975, 1.0...",+++,"[3.46, 3.74, 2.0, 1.0, 0.0, 1.0]",+++,"[3, 5, 2, 1, 0, 1]",jw,+++
4,5 tr,trace,"[0.9508489990234374, 0.9513668484157987, 0.917...",trace,"[2.65, 2.24, 2.24, 2.83, 3.61, 4.47]",neg,"[4, 5, 4, 4, 4, 4]",jw,trace
5,25 tr,trace,"[0.9138508387974331, 0.993138313293457, 0.8916...",trace,"[2.83, 2.45, 2.45, 3.0, 3.74, 4.58]",neg,"[5, 5, 5, 5, 5, 5]",jw,trace
6,norm -,neg,"[0.8810962383563703, 0.9556935628255209, 0.848...",neg,"[2.65, 3.0, 2.65, 3.16, 3.87, 4.69]",neg,"[5, 5, 6, 6, 6, 6]",jw,trace
7,50 tr,trace,"[0.9138547624860491, 0.9931421279907227, 0.855...",trace,"[2.83, 2.45, 2.45, 3.0, 3.74, 4.58]",neg,"[5, 5, 5, 5, 5, 5]",jw,trace
8,++++,+,"[0.8875, 0.7916666666666666, 0.975, 0.975, 0.9...",++++,"[4.36, 4.58, 3.0, 2.0, 1.0, 0.0]",++++,"[4, 5, 3, 2, 1, 0]",jw,++++
9,10 tr,trace,"[0.9135199410574777, 0.9983097712198893, 0.855...",trace,"[2.83, 2.45, 2.45, 3.0, 3.74, 4.58]",neg,"[5, 5, 5, 5, 5, 5]",jw,trace


In [23]:
groups_agents[2].report_df.to_csv("negpos.csv")

In [20]:
groups_agents[3].report_df

Unnamed: 0,Data,jw_word,jw_score,euc_word,euc_score,lev_word,lev_score,Measure,Final
0,a,a,"[1.0, 0.9826086956521739, 0.9999986731487772, ...",a,"[0.0, 1.41, 1.41, 1.0, 2.0, 2.0, 2.0, 2.65, 2....",a,"[0, 1, 1, 1, 4, 3, 3, 7, 4, 7]",jw,a
1,o,a,"[0.9999995559885883, 0.9710144927536232, 0.999...",a,"[1.41, 1.41, 1.41, 1.73, 2.45, 2.0, 2.0, 3.32,...",a,"[1, 1, 1, 2, 5, 3, 3, 8, 5, 8]",jw,a
2,b,b,"[0.9826086956521739, 1.0, 0.9994554602581521, ...",b,"[1.41, 0.0, 1.41, 1.0, 2.0, 2.0, 2.0, 3.32, 2....",b,"[1, 0, 1, 1, 4, 3, 3, 8, 5, 8]",jw,b
3,ab,ab,"[0.9826086956521739, 0.9809041501976286, 0.982...",ab,"[1.0, 1.0, 1.73, 0.0, 1.73, 2.24, 2.24, 2.83, ...",ab,"[1, 1, 2, 0, 3, 3, 3, 7, 4, 7]",jw,ab
4,bmt,b,"[0.9651984994702705, 0.9628178377329192, 0.965...",b,"[2.0, 1.41, 2.0, 1.73, 2.45, 2.45, 2.45, 3.32,...",b,"[3, 2, 3, 3, 5, 3, 3, 7, 5, 7]",jw,rh+
5,a2b,ab,"[0.9652173913043479, 0.9627620341614906, 0.965...",ab,"[1.41, 1.41, 2.0, 1.0, 2.0, 2.45, 2.45, 3.0, 2...",ab,"[2, 2, 3, 1, 4, 3, 3, 7, 4, 7]",jw,rh+
6,a2b3,ab,"[0.9478254384579866, 0.9427479619565217, 0.947...",ab,"[1.73, 1.73, 2.24, 1.41, 2.24, 2.65, 2.65, 3.1...",ab,"[3, 3, 4, 2, 5, 4, 4, 7, 5, 7]",jw,rh+
7,a1b3,ab,"[0.9478254384579866, 0.9428236455502718, 0.947...",ab,"[1.73, 1.73, 2.24, 1.41, 2.24, 2.65, 2.65, 3.1...",ab,"[3, 3, 4, 2, 5, 4, 4, 7, 5, 7]",jw,rh+
8,a3,a,"[0.982608106108051, 0.9652173913043479, 0.9826...",a,"[1.0, 1.73, 1.73, 1.41, 2.24, 2.24, 2.24, 2.83...",a,"[1, 2, 2, 1, 4, 3, 3, 7, 4, 7]",Eucl,a
9,b3,b,"[0.9826083835405794, 0.9809041501976286, 0.982...",b,"[1.73, 1.0, 1.73, 1.41, 2.24, 2.24, 2.24, 3.46...",b,"[2, 1, 2, 2, 5, 3, 3, 8, 5, 8]",jw,ab


In [22]:
groups_agents[4].report_df.to_csv("reac.csv")