[辻真吾・矢吹太朗『ゼロからはじめるデータサイエンス入門』（講談社, 2021）](https://github.com/taroyabuki/fromzero)

In [None]:
# Environment setup for Google Colaboratory:
# when the COLAB_GPU environment variable is present, install the extra
# packages with pip and keep only the last line of pip's output.
import os
if 'COLAB_GPU' in os.environ:
  !python -m pip install h2o pandarallel pca pmdarima | tail -n 1

## 5.1 データの読み込み

In [None]:
# Download exam.csv with wget so that it can be read as a local file.
!wget https://raw.githubusercontent.com/taroyabuki/fromzero/master/data/exam.csv

In [None]:
# Read the local file exam.csv into a DataFrame and display it.
import pandas as pd
my_df = pd.read_csv('exam.csv')
my_df

In [None]:
# Read the CSV directly from a URL instead of a local file.
# The two adjacent string literals are concatenated into one URL.
my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/exam.csv')
my_df = pd.read_csv(my_url)

In [None]:
# Read exam.csv again, this time using the 'name' column as the row index.
my_df2 = pd.read_csv('exam.csv',
    index_col='name')
my_df2

In [None]:
# Write my_df to exam2.csv without the row index.
my_df.to_csv('exam2.csv', index=False)

In [None]:
# Write my_df2 to exam3.csv; index is not disabled here, so the row index
# is written as well (the pandas default).
my_df2.to_csv('exam3.csv')

In [None]:
# Read exam.csv with the character encoding stated explicitly.
my_df = pd.read_csv('exam.csv',
    encoding='UTF-8')

In [None]:
# Write exam2.csv with the character encoding stated explicitly.
my_df.to_csv('exam2.csv', index=False, encoding='UTF-8')

In [None]:
# Parse the tables found on the HTML page; pd.read_html returns a list of
# DataFrames.
my_url = 'https://taroyabuki.github.io/fromzero/exam.html'
my_tables = pd.read_html(my_url)

In [None]:
# Display the whole list of parsed tables.
my_tables

In [None]:
# Display the first parsed table.
my_tables[0]

In [None]:
# Take the columns from position 1 onward (i.e. drop the first column).
my_data = my_tables[0].iloc[:, 1:]
my_data

In [None]:
# Read the JSON data directly from its URL and display the result.
my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/exam.json')
my_data = pd.read_json(my_url)
#my_data = pd.read_json('exam.json') # (when using a local file instead)
my_data

In [None]:
import xml.etree.ElementTree as ET
from urllib.request import urlopen

my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/exam.xml')
# The response is bound to my_response rather than f: this notebook later
# defines a helper function named f, and reusing that name here would
# replace the function with a closed response if this cell were re-run.
with urlopen(my_url) as my_response:
    my_tree = ET.parse(my_response)  # Load the XML data.

#my_tree = ET.parse('exam.xml') # (when using a local file instead)
my_ns = '{https://www.example.net/ns/1.0}' # XML namespace

In [None]:
# Collect every 'record' element in the my_ns namespace, at any depth of the tree.
my_records = my_tree.findall(f'.//{my_ns}record')

In [None]:
def f(record, ns=None):
    """Convert one XML record element into a flat dictionary.

    Args:
        record: An ``xml.etree.ElementTree.Element`` whose attributes and
            direct children hold the fields of one record.
        ns: Namespace prefix, in ElementTree's ``'{uri}'`` form, to remove
            from the child tag names. When omitted, the notebook-level
            ``my_ns`` is used.

    Returns:
        A new dict containing the element's attributes followed by one
        ``tag: text`` entry per direct child. If a child tag equals an
        attribute name, the child's value wins.
    """
    if ns is None:
        ns = my_ns  # Fall back to the namespace defined alongside the XML data.
    # Pair each child's tag (namespace removed) with its text content.
    # Iterating an Element yields its direct children, so no list() copy is needed.
    my_dic2 = {child.tag.replace(ns, ''): child.text for child in record}
    # Merge into a fresh dict so that record.attrib itself is not modified.
    return {**record.attrib, **my_dic2}

In [None]:
# Turn every record into a dict with f and build a DataFrame from them.
my_data = pd.DataFrame(list(map(f, my_records)))
# The XML values are text, so convert the two score columns to numbers.
my_data = my_data.assign(
    english=pd.to_numeric(my_data['english']),
    math=pd.to_numeric(my_data['math']))
my_data

## 5.2 データの変換

In [None]:
import numpy as np
from scipy.stats import zscore

x1 = [1, 2, 3]

# Standardize by hand: subtract the mean, then divide by the unbiased
# (ddof=1) standard deviation.
z1 = np.subtract(x1, np.mean(x1)) / np.std(x1, ddof=1)
# Alternatively, let SciPy perform the same computation.
z1 = zscore(x1, ddof=1)

z1

In [None]:
# Show the mean and the unbiased (ddof=1) standard deviation of z1.
z1.mean(), np.std(z1, ddof=1)

In [None]:
# Undo the standardization: multiply by x1's standard deviation and add x1's mean.
z1 * np.std(x1, ddof=1) + np.mean(x1)

In [None]:
x2 = [1, 3, 5]
# Standardize x2 with the mean and standard deviation of x1 (not of x2).
z2 = np.subtract(x2, np.mean(x1)) / np.std(x1, ddof=1)
# Show the mean and the unbiased (ddof=1) standard deviation of the result.
z2.mean(), np.std(z2, ddof=1)

In [None]:
import pandas as pd
from sklearn.preprocessing import (
    OneHotEncoder)

my_df = pd.DataFrame({
    'id':    [ 1 ,  2 ,  3 ],
    'class': ['A', 'B', 'C']})

# Fit the encoder on the 'class' column and one-hot encode it.
my_enc = OneHotEncoder()
tmp = my_enc.fit_transform(
    my_df[['class']]).toarray()
# Prefer get_feature_names_out(): get_feature_names() is deprecated in
# scikit-learn 1.0 and removed in 1.2, so it is kept only as a fallback
# for older versions that lack the newer method.
my_names = (my_enc.get_feature_names_out()
            if hasattr(my_enc, 'get_feature_names_out')
            else my_enc.get_feature_names())
pd.DataFrame(tmp, columns=my_names)

In [None]:
# New data to encode with the already-fitted encoder my_enc.
my_df2 = pd.DataFrame({
    'id':    [ 4 ,  5,   6 ],
    'class': ['B', 'C', 'B']})
# transform() (not fit_transform()) is called, so the encoder is not refitted.
tmp = my_enc.transform(
    my_df2[['class']]).toarray()
pd.DataFrame(tmp, columns=my_names)

In [None]:
# Same encoding as before, but with drop='first' passed to the encoder.
my_enc = OneHotEncoder(drop='first')

tmp = my_enc.fit_transform(
    my_df[['class']]).toarray()
# Prefer get_feature_names_out(): get_feature_names() is deprecated in
# scikit-learn 1.0 and removed in 1.2, so it is kept only as a fallback
# for older versions that lack the newer method.
my_names = (my_enc.get_feature_names_out()
            if hasattr(my_enc, 'get_feature_names_out')
            else my_enc.get_feature_names())
# Only the last expression of a cell is rendered automatically, so this
# intermediate frame has to be shown explicitly.
display(pd.DataFrame(tmp, columns=my_names))

# Encode the second data set with the encoder fitted on my_df.
tmp = my_enc.transform(
    my_df2[['class']]).toarray()
pd.DataFrame(tmp, columns=my_names)