#### 알파벳 기반 언어 식별 모델
- 데이터셋 : lang.zip 사용해서 생성 => csv, json, ...
- 학습방법 : 지도학습 > 분류 - 다중분류
- 알고리즘 : 다양하게 사용

[1] 모듈 로딩 및 데이터 준비

In [40]:
# 모듈로딩
import pandas as pd
import os
from collections import Counter

In [45]:
# 데이터 파일
TRAIN_PATH='../lang_train/'
TEST_PATH='../lang_test/'

# 테스트용으로 만듦. 존재하지 않는 파일
TRAIN_CSV='../DATA/lang_train.csv'
TEST_CSV='../lang_test.csv'

In [46]:
# 파일 읽기 => open() => read(), readline(), readlines() => close()
#          => with open() as 구문

if not os.path.exists(TRAIN_CSV):
    print(f'{TRAIN_CSV}파일이 없습니다.')
else:
    with open(TRAIN_CSV, mode='r') as f:
        alldata=f.read()
        print(f'[File Data] {alldata}')

[File Data] 


In [47]:
# 폴더 안의 파일을 가져오기
files=os.listdir(TRAIN_PATH)

for file in files:
    print(file, TRAIN_PATH+file, os.path.exists(TRAIN_PATH+file))
    label=file[:2]
    print(f'Label => {label}')

    with open(TRAIN_PATH+file, mode='r') as f:
        alldata=f.read()

    # 대소문자 통일
    alldata=alldata.lower()

    # 줄바꿈 제거
    alldata=alldata.replace('\n','')

    # 알파벳 아닌 문자들 제거
    for ch in alldata:
        if ord('a')> ord(ch) or ord(ch) > ord('z'):
            alldata=alldata.replace(ch,'')
    print(f'alldata => {len(alldata)}')
    
    with open(TRAIN_CSV, mode='a') as f :
        # a~z 개수 카운팅
        cc=dict(Counter(alldata))
        f.write(label+',')
        f.write(','.join([str(value) for value in cc.values()])+'\n')
    
    break

en-1.txt ../lang_train/en-1.txt True
Label => en
alldata => 4595


In [44]:
# 파일 쓰기 
# mode='w' : 파일이 존재하면 덮어쓰기, 파일이 없으면 생성 후 쓰기
# mode='a' : 파일이 존재하면 추가하기, 파일이 없으면 생성 후 쓰기
with open('../DATA/lang_train.csv', mode='r') as f:
    with open('backup.csv', mode='w') as wf:
        data=f.read()
        wf.write(data)

In [50]:
a_z=[]
for _ in range(ord('a'), ord('z')+1):
    cnt=alldata.count(chr(_))
    a_z.append(str(cnt/len(alldata)))

a_z

['0.07595212187159957',
 '0.012840043525571273',
 '0.04570184983677911',
 '0.04613710554951034',
 '0.10533188248095757',
 '0.015669205658324265',
 '0.019151251360174103',
 '0.043743199129488576',
 '0.07399347116430903',
 '0.0017410228509249185',
 '0.00544069640914037',
 '0.05375408052230685',
 '0.026332970620239392',
 '0.07747551686615888',
 '0.08966267682263329',
 '0.016539717083786723',
 '0.0',
 '0.07769314472252448',
 '0.061371055495103376',
 '0.08052230685527748',
 '0.02589771490750816',
 '0.009793253536452665',
 '0.014145810663764961',
 '0.0006528835690968443',
 '0.02002176278563656',
 '0.0004352557127312296']

In [54]:
# 함수화
def make_dataset(dir_path, file_path):
    files=os.listdir(dir_path)

    for file in files:
        label=file[:2]
        
        with open(dir_path+file, mode='r') as f: 
            alldata=f.read()
        
        #- 대소문자 통일
        alldata=alldata.lower()
        
        #- 알파벳 아닌 문자들 제거
        alldata=alldata.replace('\n', '')
        print(f'alldata => {len(alldata)}')
        
        for ch in alldata:
            if ord('a')> ord(ch) or ord(ch)> ord('z'): 
                alldata=alldata.replace(ch, '')
        print(f'alldata => {len(alldata)}')
        
        # 파일에 데이터 쓰기
        with open(file_path ,mode='a') as f:
            # - a ~ z 개수 카운팅  ==> 정렬, 전체 단어수로 나누기 
            a_z=[]
            for _ in range(ord('a'), ord('z')+1):
                cnt = alldata.count(chr(_))
                a_z.append(str(cnt/len(alldata)))
                
            f.write(label+',')
            f.write(','.join( a_z )+'\n')


In [52]:
make_dataset(TRAIN_PATH, TRAIN_CSV)
make_dataset(TEST_PATH, TEST_CSV)

alldata => 5358
alldata => 4595
alldata => 10782
alldata => 8387
alldata => 4397
alldata => 3615
alldata => 6815
alldata => 5304
alldata => 6146
alldata => 4566
alldata => 21300
alldata => 15596
alldata => 36732
alldata => 27749
alldata => 22105
alldata => 16915
alldata => 11142
alldata => 8058
alldata => 7016
alldata => 5130
alldata => 2634
alldata => 2007
alldata => 9938
alldata => 7936
alldata => 30287
alldata => 25451
alldata => 17324
alldata => 14905
alldata => 78297
alldata => 61604
alldata => 34630
alldata => 27338
alldata => 40515
alldata => 32871
alldata => 22366
alldata => 17848
alldata => 15555
alldata => 11568
alldata => 9000
alldata => 7259
alldata => 59001
alldata => 45619
alldata => 139001
alldata => 101952
alldata => 35425
alldata => 26566
alldata => 63459
alldata => 45301
alldata => 7930
alldata => 6154
alldata => 33000
alldata => 25641
alldata => 33673
alldata => 26235
alldata => 2052
alldata => 1652


In [55]:
col_names=[]
trainDF=pd.read_csv(TRAIN_CSV)
trainDF

Unnamed: 0,en,370,201,484,121,349,340,356,357,92,...,247,88,210,45,76,8,65,25,3,2
en,0.075952,0.01284,0.045702,0.046137,0.105332,0.015669,0.019151,0.043743,0.073993,0.001741,...,0.0,0.077693,0.061371,0.080522,0.025898,0.009793,0.014146,0.000653,0.020022,0.000435
en,0.084178,0.019912,0.030404,0.03887,0.136998,0.017408,0.031239,0.027423,0.075355,0.002623,...,0.005485,0.09014,0.071659,0.077739,0.030643,0.013712,0.01395,0.002027,0.010731,0.000596
en,0.071646,0.012172,0.045643,0.032642,0.120055,0.014661,0.025173,0.023513,0.094606,0.00249,...,0.00166,0.053942,0.087967,0.081051,0.029046,0.018811,0.011895,0.000553,0.017981,0.000553
en,0.07221,0.027715,0.029977,0.039593,0.121041,0.01678,0.023567,0.059012,0.065234,0.001508,...,0.000377,0.059201,0.073341,0.093703,0.024321,0.00509,0.019608,0.006033,0.017534,0.001697
en,0.073806,0.020368,0.031099,0.039641,0.141261,0.020368,0.020368,0.056943,0.065046,0.003285,...,0.000438,0.072492,0.059571,0.095488,0.024967,0.010731,0.023872,0.003066,0.014893,0.000657
fr,0.078738,0.010451,0.037253,0.053283,0.150487,0.016222,0.010708,0.015517,0.069377,0.002308,...,0.005899,0.075083,0.071621,0.077584,0.053475,0.014299,0.000705,0.003911,0.003655,0.000834
fr,0.077913,0.014919,0.035749,0.04483,0.149735,0.011784,0.011496,0.012613,0.072003,0.002126,...,0.006847,0.074741,0.082093,0.070561,0.054452,0.010631,0.004541,0.003892,0.005334,0.000468
fr,0.072717,0.013065,0.035412,0.04499,0.150754,0.010523,0.010582,0.011528,0.071416,0.003015,...,0.006621,0.076914,0.078333,0.065681,0.050902,0.012711,0.002601,0.004966,0.004848,0.000118
fr,0.072599,0.015761,0.039836,0.051129,0.157111,0.013031,0.013527,0.014396,0.085629,0.004344,...,0.005833,0.08203,0.066518,0.061057,0.04269,0.01514,0.000745,0.005088,0.004964,0.001986
fr,0.078947,0.011501,0.035283,0.051852,0.143275,0.012086,0.020078,0.019493,0.089864,0.003899,...,0.004678,0.077778,0.07232,0.063353,0.04386,0.014035,0.00039,0.003314,0.005263,0.00117
