### Coding Data for Deidentification

In [144]:
import itertools
import random
import pandas as pd

In [145]:
# Load the table, drop the first CSV column, and order rows by semester.
groundtruth=(
    pd.read_csv("c:/code/groundtruth.csv")
    .iloc[:,1:]
    .sort_values("trainingSemester")
)
# Print the schema summary, then display the distinct semester labels.
groundtruth.info()
groundtruth["trainingSemester"].unique()

<class 'pandas.core.frame.DataFrame'>
Index: 1304 entries, 0 to 875
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   idx                        1304 non-null   int64 
 1   trainingSemester           1303 non-null   object
 2   trainingSerie              1304 non-null   object
 3   trainingClass              1304 non-null   object
 4   trainingCompany            1304 non-null   object
 5   trainingPeriod             1304 non-null   object
 6   name                       1304 non-null   object
 7   klass                      1158 non-null   object
 8   trainingTeacher            1302 non-null   object
 9   trainingUnit               951 non-null    object
 10  address                    1304 non-null   object
 11  registerProblem            1304 non-null   bool  
 12  contact                    854 non-null    object
 13  isFulltimeTrainingTeacher  1304 non-null   bool  
dtypes: bool(2), in

array(['3-1', '3-1.5', '3-2', '3-2.5', '4-1', '4-1.5', nan], dtype=object)

In [146]:
# Columns whose cell contents are character-substituted below.
objectColumns=[
    "idx",
    "name",
    "trainingClass",
    "trainingCompany",
    "trainingUnit",
    "trainingTeacher",
    "address",
    "contact",
]

In [147]:
# Every distinct character that appears in the stringified unique values of
# the selected columns (column by column, value by value).
char=list({
    ch
    for col in objectColumns
    for value in groundtruth[col].astype(str).unique()
    for ch in value
})
# A random permutation of positions, and the alphabet rearranged by it.
sieveNumber=random.sample(range(len(char)),k=len(char))
charMix=[char[position] for position in sieveNumber]

In [148]:
# Sanity check: alphabet, shuffled alphabet and permutation must all have the same length.
if not (len(char)==len(charMix)==len(sieveNumber)):
    raise Exception()

In [149]:
# Lookup table: original character -> substitute character.
charMap={plain:mixed for plain,mixed in zip(char,charMix)}
# The same pairs as a two-column table, written to disk (the row index is written too).
charMapData=pd.DataFrame({"char":char,"charMix":charMix})
charMapData.to_csv("c:/code/charMapData.csv",encoding="utf-8")

In [150]:
# Substitute every character of each str/int cell; cells of other types pass through unchanged.
tx=groundtruth[objectColumns].map(
    lambda cell: (
        "".join(charMap[ch] for ch in str(cell))
        if isinstance(cell,(str,int))
        else cell
    )
)
# Show a random handful of the coded rows.
tx.sample(10)

Unnamed: 0,name,trainingClass,trainingCompany,trainingUnit,trainingTeacher,address,contact
728,을심혈,심울아위통충학근별,민시병숙0언,,늘1앙,시신,급환급습종허종종습허급급정
669,다제E,현엘통충학근별,산래현언우혈,협화,금,시신,급환급습맹구맹한습허구정구
373,향분시,적새통충학근별,료외우혈,허,미유앙,시신,급환급습금허금정습정화맹화
887,미간임,통충권빈학근별,공담하뇨혈,충M윤M소N3,을욱서,시명,
1299,을연서,7재통충학근별,시선하뇨혈,국권환한실찬소백새시K두,B현태,시명,
525,드연육,심울아위통충학근별,신앙병숙0언,,한,시명,급환급습구화맹맹습환허금정
364,층스유,적새통충학근별,민시채뇌우혈,금맹,B욱앙,시신,급환급습정금급허습종종정환
530,임자유,심울아위통충학근별,민시병숙0언,,늘1앙,시명,급환급습종환허금습금맹종급
849,민피연,반현06통충학근별,산래우혈,화정래권,택피심,시신,급환급습정한정급습맹환급금
72,앙대암,리래통충학근별,N매경우혈,,늘선박,시명,급환급습금허맹맹습정환종맹


In [151]:
# Reverse lookup: substitute character -> original character.
charMapInverse={mixed:plain for plain,mixed in charMap.items()}

In [None]:
# Undo the substitution with the reverse lookup; cells that are not str/int pass through unchanged.
tx.map(
    lambda cell: (
        "".join(charMapInverse[ch] for ch in str(cell))
        if isinstance(cell,(str,int))
        else cell
    )
)

### The Above as a Class
* Applicable to columns with object (i.e., str) values

In [None]:
class trainingData:
    """Reversible character-substitution coder for selected DataFrame columns.

    A random permutation of the characters found in the selected columns is
    kept as a three-column table (``charIn``, ``charOut``, ``sieve``).
    ``encode`` replaces every character of every ``str``/``int`` cell with its
    substitute and ``decode`` reverses that substitution. Cells of any other
    type (for example missing values) are left untouched.

    Notes:
        * ``data`` is modified in place (the coded columns are replaced on the
          very object that was passed in).
        * Coded columns hold strings afterwards, including columns that held
          integers before, so decoding yields ``"12"`` rather than ``12``.
        * A random permutation may map a character to itself.
    """

    def __init__(self,data,
        objectColumns=[
            "idx","name","trainingClass","trainingCompany","trainingUnit","trainingTeacher","address","contact"
        ],
        charMapData=False
    ):
        """Wrap ``data`` and optionally load an existing character map.

        Args:
            data: DataFrame holding the columns to code.
            objectColumns: Labels of the columns to code.
            charMapData: Path or file-like object of a CSV with the columns
                ``charIn``, ``charOut`` and ``sieve``. When given, ``data`` is
                treated as already encoded. When falsy, no map is loaded and
                ``getCharMap`` must be called before ``encode``.
        """
        self.data=data
        # Copy so neither the shared default list nor the caller's list is aliased.
        self.cols=list(objectColumns)

        self.charMapData=None       # DataFrame with charIn / charOut / sieve
        self.charMap=None           # dict: charIn -> charOut
        self.charMapInverse=None    # dict: charOut -> charIn
        self.sieve=None             # the permutation as a list of positions

        if charMapData:
            # Read the character columns as text: a bare digit would otherwise be
            # parsed as a number and no longer match the characters of a cell.
            self.charMapData=pd.read_csv(
                charMapData,
                dtype={"charIn":str,"charOut":str},
                keep_default_na=False,
            )
            # Flag first so getCharMap reuses the loaded table instead of
            # generating a fresh permutation.
            self.encoded=True
            self.getCharMap()
        else:
            # Must be a real False (not None): getCharMap only generates a new
            # map for data that is not encoded.
            self.encoded=False
        return

    def getCharMap(self):
        """Build the lookup dictionaries, generating a new map if data is not encoded.

        When the data is not encoded, a new random permutation of the characters
        occurring in the selected columns replaces any existing map. When the
        data is encoded, the already loaded ``charMapData`` is used as is.

        Raises:
            Exception: If no map is available for encoded data, or if the map
                is not a one-to-one correspondence.
        """
        if not self.encoded:
            # Missing values are never coded, so they contribute no characters.
            alphabet={
                ch
                for col in self.cols
                for value in self.data[col].dropna().astype(str).unique()
                for ch in str(value)
            }
            # Sorted so that a seeded ``random`` yields a reproducible map.
            charIn=sorted(alphabet)
            sieve=random.sample(range(len(charIn)),k=len(charIn))
            charOut=[charIn[position] for position in sieve]

            self.charMapData=pd.DataFrame(
                {"charIn":charIn,"charOut":charOut,"sieve":sieve})

        if self.charMapData is None:
            raise Exception("No character map available for encoded data.")

        self.charMap=dict(zip(self.charMapData.charIn,self.charMapData.charOut))
        self.charMapInverse=dict(zip(self.charMap.values(),self.charMap.keys()))
        # Duplicated characters on either side would make coding irreversible.
        if (len(self.charMap)!=len(self.charMapData)
                or len(self.charMapInverse)!=len(self.charMap)):
            raise Exception("Char numbers are unmatching")
        self.sieve=self.charMapData.sieve.tolist()
        return

    def saveCharMapData(self,path="c:/code/charMapData.csv"):
        """Write the character map as UTF-8 CSV without the row index.

        Args:
            path: Destination of the CSV file; defaults to the location this
                method has always written to.

        Raises:
            Exception: If no character map exists yet.
        """
        if self.charMapData is None:
            raise Exception("Run getCharMap first.")
        self.charMapData.to_csv(path,encoding="utf-8",index=False)
        return

    def _code(self,direction):
        """Translate the selected columns in place.

        Args:
            direction: ``"in"`` to encode, ``"out"`` to decode.

        Raises:
            Exception: On an unknown direction, when the requested direction
                contradicts the current state, or when no map is available.
            KeyError: If a cell contains a character that is not in the map.
        """
        if direction not in ("in","out"):
            raise Exception(f"Unknown direction {direction}")
        if direction=="in" and self.encoded:
            raise Exception("Data already been encoded.")
        if direction=="out" and not self.encoded:
            raise Exception("This isn't encoded data.")

        # Encoding applies charOut -> charIn and decoding applies charIn -> charOut.
        # This pairing is kept exactly as it was so that data coded earlier with
        # a saved map can still be decoded.
        sieve=self.charMap if direction=="out" else self.charMapInverse

        if sieve is None:
            raise Exception("Run getCharMap first.")

        def translate(cell):
            # Only str/int cells are coded; everything else passes through.
            if isinstance(cell,(str,int)):
                return "".join(sieve[ch] for ch in str(cell))
            return cell

        # Translate every column before touching the frame, so a failure part-way
        # through leaves the data unchanged.
        translated={col:self.data[col].map(translate) for col in self.cols}
        # Replace whole columns instead of writing into the existing ones: the
        # results are strings and would not fit an integer column.
        for col,values in translated.items():
            self.data[col]=values

        self.encoded=(direction=="in")
        return

    def encode(self):
        """Encode the selected columns; raises if the data is already encoded."""
        return self._code(direction="in")

    def decode(self):
        """Decode the selected columns; raises if the data is not encoded."""
        return self._code(direction="out")