# 05

## 5.1 データの読み込み

### 5.1.1 CSV

In [None]:
# IPython shell escape: download exam.csv so the cells below can read it by relative path.
!wget https://raw.githubusercontent.com/taroyabuki/fromzero/master/data/exam.csv

#### 5.1.1.1 CSVの読み込み

In [None]:
import pandas as pd
# Load the local exam.csv into a DataFrame; the bare name on the last line displays it.
my_df = pd.read_csv('exam.csv')
my_df
#>   name  english  math gender
#> 0    A       60    70      f
#> 1    B       90    80      m
#> 2    C       70    90      m
#> 3    D       90   100      f

In [None]:
# Same CSV, read straight from a URL instead of a local file.
# The two adjacent string literals are joined into a single URL string.
my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/exam.csv')
my_df = pd.read_csv(my_url)

In [None]:
# Read the same file, using the 'name' column as the row index.
my_df2 = pd.read_csv('exam.csv',
    index_col='name')
my_df2
#>       english  math gender
#> name
#> A          60    70      f
#> B          90    80      m
#> C          70    90      m
#> D          90   100      f

#### 5.1.1.2 CSVファイルへの書き出し

In [None]:
# Write my_df to exam2.csv; index=False leaves the row index out of the file.
my_df.to_csv('exam2.csv', index=False)

In [None]:
# Write my_df2 to exam3.csv; index=False is not passed here, unlike the exam2.csv export.
my_df2.to_csv('exam3.csv')

### 5.1.2 文字コード

In [None]:
# Read exam.csv with the character encoding stated explicitly.
my_df = pd.read_csv('exam.csv',
    encoding='UTF-8')

In [None]:
# Write exam2.csv with the character encoding stated explicitly.
my_df.to_csv('exam2.csv', index=False, encoding='UTF-8')

### 5.1.3 ウェブ上の表

In [None]:
# Parse the HTML tables found on the page; the result is a list of DataFrames.
my_url = 'https://taroyabuki.github.io/fromzero/exam.html'
my_tables = pd.read_html(my_url)

In [None]:
# Display the whole list returned by read_html.
my_tables
#> [   Unnamed: 0 name  english ...
#>  0         NaN    A       60 ...
#>  1         NaN    B       90 ...
#>  2         NaN    C       70 ...
#>  3         NaN    D       90 ...]

In [None]:
# The first list element is the table itself; note the all-NaN 'Unnamed: 0' first column.
my_tables[0]
#>    Unnamed: 0 name  english ...
#> 0         NaN    A       60 ...
#> 1         NaN    B       90 ...
#> 2         NaN    C       70 ...
#> 3         NaN    D       90 ...

In [None]:
# Keep all rows and the columns from position 1 onward (drops the 'Unnamed: 0' column).
my_data = my_tables[0].iloc[:, 1:]
my_data
#>   name  english  math gender
#> 0    A       60    70      f
#> 1    B       90    80      m
#> 2    C       70    90      m
#> 3    D       90   100      f

### 5.1.4 JSONとXML

#### 5.1.4.1 JSONデータの読み込み

In [None]:
# Read the JSON data directly from the URL into a DataFrame.
my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/exam.json')
my_data = pd.read_json(my_url)
#my_data = pd.read_json('exam.json') # (when reading from a local file instead)
my_data
#>   name  english  math gender
#> 0    A       60    70      f
#> 1    B       90    80      m
#> 2    C       70    90      m
#> 3    D       90   100      f

#### 5.1.4.2 XMLデータの読み込み

In [None]:
import xml.etree.ElementTree as ET
from urllib.request import urlopen

# Fetch the XML document over HTTP and parse it into an ElementTree.
my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/exam.xml')
with urlopen(my_url) as f:
    my_tree = ET.parse(f)       # Parse the XML data from the open response

#my_tree = ET.parse('exam.xml') # (when reading from a local file instead)
my_ns = '{https://www.example.net/ns/1.0}' # Namespace in '{...}' form, as it appears in front of tag names

In [None]:
# Collect every 'record' element in the namespace, at any depth ('.//').
my_records = my_tree.findall(f'.//{my_ns}record')

In [None]:
def f(record, ns=None):
    """Convert one XML record element into a flat dictionary.

    Args:
        record: An ElementTree element. Its attributes and the tag/text of
            each direct child become the entries of the result.
        ns: Namespace prefix in the '{uri}' form to remove from the child
            tag names. When omitted, the notebook-level my_ns is used.

    Returns:
        A new dict holding the attributes first, followed by one entry per
        child (tag without namespace -> text). A child entry overrides an
        attribute that has the same key.
    """
    if ns is None:
        ns = my_ns  # Fall back to the namespace defined earlier in the notebook.
    # Pair each child's tag (namespace removed) with its text content.
    children = {child.tag.replace(ns, ''): child.text for child in record}
    # Merge into a fresh dict so that record.attrib itself is not modified.
    return {**record.attrib, **children}

In [None]:
# One row per record; the two score columns arrive as text and are
# converted to numbers while the frame is being built.
my_data = pd.DataFrame([f(rec) for rec in my_records]).assign(
    english=lambda d: pd.to_numeric(d['english']),
    math=lambda d: pd.to_numeric(d['math']))
my_data
#>    english  math gender name
#> 0       60    70      f    A
#> 1       90    80      m    B
#> 2       70    90      m    C
#> 3       90   100      f    D

## 5.2 データの変換

### 5.2.1 標準化

In [None]:
import numpy as np
from scipy.stats import zscore

x1 = [1, 2, 3]

# Standardize by hand: subtract the mean, divide by the sample
# standard deviation (ddof=1).
z1 = np.subtract(x1, np.mean(x1)) / np.std(x1, ddof=1)
# Or, equivalently, with a single library call:
z1 = zscore(x1, ddof=1)

z1
#> array([-1.,  0.,  1.])

In [None]:
# Check of the standardized values: mean and sample standard deviation.
np.mean(z1), z1.std(ddof=1)
#> (0.0, 1.0)

In [None]:
# Undo the standardization: scale by x1's standard deviation and add x1's mean.
np.mean(x1) + np.std(x1, ddof=1) * z1
#> array([1., 2., 3.])

In [None]:
x2 = [1, 3, 5]
# x2 is standardized with the mean and standard deviation of x1, not its own,
# so the result below has mean 1 and standard deviation 2 rather than 0 and 1.
z2 = np.subtract(x2, np.mean(x1)) / np.std(x1, ddof=1)
np.mean(z2), z2.std(ddof=1)
#> (1.0, 2.0)

### 5.2.2 ワンホットエンコーディング

In [None]:
import pandas as pd
import sklearn
from packaging import version
from sklearn.preprocessing import (
    OneHotEncoder)

my_df = pd.DataFrame({'id': [1, 2, 3],
                      'class': ['A', 'B', 'C']})

# Fit the encoder on the 'class' column and encode it in one step;
# toarray() turns the encoder's output into a dense array.
my_enc = OneHotEncoder()
tmp = my_enc.fit_transform(my_df[['class']]).toarray()

# scikit-learn 1.0 or later: get_feature_names_out(); older: get_feature_names().
# NOTE(review): the column labels shown below may differ between scikit-learn
# versions (e.g. a 'class_' prefix instead of 'x0_') - confirm.
my_names = (
    my_enc.get_feature_names_out()
    if version.parse(sklearn.__version__) >= version.parse("1.0")
    else my_enc.get_feature_names())
pd.DataFrame(tmp, columns=my_names)
#>    x0_A  x0_B  x0_C
#> 0   1.0   0.0   0.0
#> 1   0.0   1.0   0.0
#> 2   0.0   0.0   1.0

In [None]:
# New data is encoded with the already-fitted encoder: transform() only, no refit,
# and the column names obtained at fit time are reused.
my_df2 = pd.DataFrame({
    'id':    [ 4 ,  5,   6 ],
    'class': ['B', 'C', 'B']})
tmp = my_enc.transform(
    my_df2[['class']]).toarray()
pd.DataFrame(tmp, columns=my_names)
#>    x0_A  x0_B  x0_C
#> 0   0.0   1.0   0.0
#> 1   0.0   0.0   1.0
#> 2   0.0   1.0   0.0

#### 5.2.2.1 補足：冗長性の排除

In [None]:
# drop='first' leaves out the column of the first category; the output
# below has no x0_A column.
my_enc = OneHotEncoder(drop='first')

tmp = my_enc.fit_transform(
    my_df[['class']]).toarray()
# scikit-learn 1.0 or later: get_feature_names_out(); older: get_feature_names().
if version.parse(sklearn.__version__) >= version.parse("1.0"):
    my_names = my_enc.get_feature_names_out()
else:
    my_names = my_enc.get_feature_names()
# A notebook cell only shows its last expression automatically, so this
# intermediate table has to be displayed explicitly.
display(pd.DataFrame(tmp, columns=my_names))
#>    x0_B  x0_C
#> 0   0.0   0.0
#> 1   1.0   0.0
#> 2   0.0   1.0

# Encode the new data with the encoder fitted above (no refit).
tmp = my_enc.transform(
    my_df2[['class']]).toarray()
pd.DataFrame(tmp, columns=my_names)
#>    x0_B  x0_C
#> 0   1.0   0.0
#> 1   0.0   1.0
#> 2   1.0   0.0