## 시스템의 인코딩 방식을 확인하기

In [1]:
import sys

# Show the text encodings attached to standard input and standard output,
# one per line (stdin first, then stdout).
print(sys.stdin.encoding, sys.stdout.encoding, sep='\n')

utf-8
UTF-8


## 코드에서 인코딩을 설정해서 실행하는 주석

* Python 2: 기본 소스 인코딩이 ASCII이므로 한글 등을 사용하려면 인코딩 선언 주석이 필요함
* Python 3.0 버전 이상부터는 기본 소스 인코딩이 utf-8임 (PEP 3120)

In [2]:
# -*- coding: utf-8 -*- # most commonly used declaration (100%)
# -*- coding: latin-1 -*- # example of declaring a different source encoding (Latin-1)

## 기본 파이썬 패키지 활용

### 유니코드

ord() : 특정문자의 유니코드 값을 확인  
chr() : 유니코드 값을 문자로 확인

In [3]:
# ord() returns the integer Unicode code point of a single character.
print(ord('가'))

44032


In [4]:
# chr() is the inverse of ord(): it turns the code point 44032 back into its character.
chr(44032)

'가'

## 인코딩

In [5]:
# Start from an ordinary text string (str).
str_data = '가나다라'
print(type(str_data))

# str -> bytes: serialize the text with the UTF-8 codec, then show the
# resulting type and the raw byte sequence.
str_data_encoded = str_data.encode(encoding='utf-8')
print(type(str_data_encoded), str_data_encoded, sep='\n')

<class 'str'>
<class 'bytes'>
b'\xea\xb0\x80\xeb\x82\x98\xeb\x8b\xa4\xeb\x9d\xbc'


## 디코딩

In [6]:
# bytes -> str: interpret the UTF-8 byte sequence as text again, then show
# the resulting type and the recovered string.
str_data_decoded = str_data_encoded.decode(encoding='utf-8')
print(type(str_data_decoded), str_data_decoded, sep='\n')

<class 'str'>
가나다라


In [7]:
# Print the original string for comparison with the decoded result.
print(str_data)

가나다라


In [9]:
# Contrast the type of the integer 1 with the type of the string '1',
# and display the string itself alongside them.
(type(1), type('1'), '1')

(int, str, '1')

## 유니코드를 base64로 인코딩 및 디코딩하기

In [10]:
import base64

# Two sample sentences to push through the encoding pipeline.
text_a, text_b = "Hello World!", "Hello Python"
print(text_a, text_b, sep='\n')

# Step 1: str -> bytes using the UTF-8 codec.
a_encode_unicode, b_encode_unicode = (
    text.encode('utf-8') for text in (text_a, text_b)
)
print(a_encode_unicode, b_encode_unicode, sep='\n')

# Step 2: bytes -> base64-encoded bytes.
a_encode_base64, b_encode_base64 = map(
    base64.b64encode, (a_encode_unicode, b_encode_unicode)
)
print(a_encode_base64, b_encode_base64, sep='\n')

Hello World!
Hello Python
b'Hello World!'
b'Hello Python'
b'SGVsbG8gV29ybGQh'
b'SGVsbG8gUHl0aG9u'


In [12]:
# Step 1: turn the base64 bytes into str by decoding them as UTF-8.
a_decode_unicode, b_decode_unicode = (
    encoded.decode('utf-8') for encoded in (a_encode_base64, b_encode_base64)
)
print(a_decode_unicode, b_decode_unicode, sep='\n')

# Step 2: undo the base64 step; b64decode returns the original payload as bytes.
a_decode_base64, b_decode_base64 = map(
    base64.b64decode, (a_decode_unicode, b_decode_unicode)
)
print(a_decode_base64, b_decode_base64, sep='\n')

SGVsbG8gV29ybGQh
SGVsbG8gUHl0aG9u
b'Hello World!'
b'Hello Python'


## chardet 라이브러리 사용하기

* encoding : 인코딩 예측 결과
* confidence : 신뢰도(언어의 확률 분포에 기반하여 계산)
* language : 언어 종류

In [13]:
pip install chardet

Note: you may need to restart the kernel to use updated packages.


In [14]:
import urllib
import urllib.request  # `import urllib` alone does not load the `request` submodule

import chardet

In [15]:
# Korean Wikipedia article URL; the article title in the path is percent-encoded.
url_path = (
    'https://ko.wikipedia.org/wiki/'
    '%EC%9E%90%EC%97%B0%EC%96%B4_%EC%B2%98%EB%A6%AC'
)

In [16]:
# Download the page as raw, undecoded bytes. The context manager closes the
# HTTP response as soon as the body has been read, instead of leaving the
# connection open until garbage collection.
with urllib.request.urlopen(url_path) as response:
    raw_data = response.read()
raw_data

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-disabled vector-feature-client-preferences-disabled" lang="ko" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>\xec\x9e\x90\xec\x97\xb0\xec\x96\xb4 \xec\xb2\x98\xeb\xa6\xac - \xec\x9c\x84\xed\x82\xa4\xeb\xb0\xb1\xea\xb3\xbc, \xec\x9a\xb0\xeb\xa6\xac \xeb\xaa\xa8\xeb\x91\x90\xec\x9d\x98 \xeb\xb0\xb1\xea\xb3\xbc\xec\x82\xac\xec\xa0\x84</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-f

In [17]:
# Ask chardet to guess the encoding of the downloaded bytes; the result is
# displayed as a dict with 'encoding', 'confidence' and 'language' keys.
chardet.detect(raw_data)

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

In [18]:
# Run the same detection on the base64-encoded bytes for comparison:
# first the bytes themselves, then chardet's guess for them.
print(b_encode_base64, chardet.detect(b_encode_base64), sep='\n')

b'SGVsbG8gUHl0aG9u'
{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
