# ３章 データ構造 

## 3.1 Pythonのネイティブなデータ構造

### 3.1.1 リスト

In [None]:
# Ten-element list (0-9) used as the "small" case in the timing cells.
small_list = [*range(10)]

In [None]:
%%timeit
# Time index access to the last element of the 10-element list.
last_element = small_list[-1]

In [None]:
# 10,000-element list (0-9999) used as the "large" case in the timing cells.
large_list = [*range(10_000)]

In [None]:
%%timeit
# Time index access to the last element of the 10,000-element list.
last_element = large_list[-1]

In [None]:
%%timeit
# Time a membership test on the small list; it holds 0-9, so 4200 is absent.
4200 in small_list   # is 4200 in small_list?

In [None]:
%%timeit
# Time the same membership test on the 10,000-element list.
4200 in large_list

### 3.1.3 辞書

##### メモ： 「`pip install Faker`」でインストール

In [None]:
from faker import Faker

# Faker instance; its name() and address() methods supply the dictionary
# keys and values in the cells below.
fake = Faker()

In [None]:
# Build a small name -> address dictionary from generated fake data.
# A repeated name overwrites the earlier entry, so the dictionary can end up
# with fewer than 10 items.
small_dict = {}
for _ in range(10):
    small_dict[fake.name()] = fake.address()
# Take the first key without building a list of all keys (any key would do).
name = next(iter(small_dict))

In [None]:
# Display the key selected from the dictionary.
name

In [None]:
%%timeit
# Time a key lookup in the small dictionary.
small_dict[name]

In [None]:
# Build a large name -> address dictionary from generated fake data.
# A repeated name overwrites the earlier entry, so the dictionary can end up
# with fewer than 10,000 items.
large_dict = {}
for _ in range(10_000):
    large_dict[fake.name()] = fake.address()
# Take the first key without building a list of all keys; this rebinds the
# `name` used by the lookup timing cell that follows.
name = next(iter(large_dict))

In [None]:
%%timeit
# Time a key lookup in the large dictionary.
large_dict[name]

### 3.1.4 セット（集合）

In [None]:
%%timeit
# Baseline for the set comparison: membership test on the 10,000-element list.
4200 in large_list

In [None]:
%%timeit
# The set is rebuilt from large_list inside the timed body, so this
# measurement includes the list-to-set conversion as well as the lookup.
large_set = set(large_list)
4200 in large_set

In [None]:
# Build the set once, outside any timed cell, so the next timing cell
# measures only the membership test.
large_set = {*large_list}

In [None]:
%%timeit
# Time only the membership test against the prebuilt set.
4592 in large_set

## 3.2 NumPy

### 3.2.1　NumPyの関数

In [None]:
# 3x3 matrix represented as a list of row lists.
python_2d_list = [
    [1, 3, 5],
    [2, 4, 6],
    [7, 9, 11],
]

In [None]:
# With nested lists there is no column slice: take element 0 of every row.
first_column = [row[0] for row in python_2d_list]

In [None]:
# Display the first column extracted from the nested list.
first_column

In [None]:
import numpy as np

# 3x3 NumPy array built from nested row lists.
np_2d_array = np.array(
    [
        [1, 3, 5],
        [2, 4, 6],
        [7, 9, 11],
    ]
)
# Slice: every row, column 0.
first_column = np_2d_array[:, 0]

In [None]:
# Display the first column obtained by slicing the NumPy array.
first_column

### 3.2.2　NumPyのパフォーマンスに関する考察

In [None]:
# A Python list holding a str, an int and a float side by side.
mixed_type_list = ["one", 2, 3.14]

In [None]:
# The same mixed values passed to np.array; the elements are expected to be
# coerced to one common dtype (strings here) - confirm with the printed output.
mixed_type_array = np.array(["one", 2, 3.14])

In [None]:
# Print the array to see how the mixed values were stored.
print(mixed_type_array)

In [None]:
# Array built from integers only; print the dtype NumPy selected for it.
integer_array = np.array([1, 2, 3])
print(integer_array.dtype)

##### ●Pythonのリストと比較して、NumPyを使うことでパフォーマンスが向上する例

In [None]:
# 100,000 random integers generated with low=1, high=100_000.
random_int_array = np.random.randint(1, 100_000, 100_000)
# tolist() converts the values to built-in Python ints. list(array) would
# instead hold NumPy scalar objects, so sum() over it would not be a
# plain-Python baseline and could overflow where the default integer is 32-bit.
random_int_list = random_int_array.tolist()

In [None]:
%%timeit -r 7 -n 100
# Time the built-in sum() over the Python list (7 repeats of 100 loops).
sum(random_int_list)

In [None]:
%%timeit -r 7 -n 100
# Time np.sum() over the NumPy array with the same timeit settings.
np.sum(random_int_array)


##### ●配列を適切なスペースで初期化する

In [None]:
# Allocate a 1000-element array of zeros up front, to be filled in later.
array_to_fill = np.zeros(1000)

In [None]:
# 100,000 random integers (low=1, high=100_000) used for the size checks below.
random_int_array = np.random.randint(1, 100_000, 100_000)

In [None]:
# Display the number of bytes used by the array's data.
random_int_array.nbytes

In [None]:
# Display the element dtype chosen by np.random.randint.
random_int_array.dtype

In [None]:
# Convert to 32-bit integers; the values were drawn with high=100_000, which
# fits within the int32 range.
random_int_array_32 = random_int_array.astype(np.int32)

In [None]:
# Display the byte size after the int32 conversion, for comparison.
random_int_array_32.nbytes

In [None]:
# Request a 16-bit integer dtype explicitly at creation time.
small_array = np.array([1, 3, 5], dtype=np.int16)

In [None]:
# Display the dtype to confirm the explicit int16 request.
small_array.dtype

### 3.2.3　Daskを使った配列演算

##### ●注意：環境によっては実行に非常に時間がかかり、メモリも大量に消費します。要素数は30億ではなく10億に減らしてもよいでしょう

In [None]:
# 30_0000_0000 == 3,000,000,000 elements (digits grouped in units of 10^4).
# The whole array is held in memory at once.
large_np_array = np.random.randint(1, 100000, 30_0000_0000)

In [None]:
%%timeit -r 1 -n 7
# Time np.max() over the full array (-r 1 -n 7: one repeat of seven loops).
np.max(large_np_array)

In [None]:
import dask.array as da

# Rebinds large_np_array to a newly generated 3,000,000,000-element array,
# using the same call as the earlier cell.
# NOTE(review): when the earlier cell has already run, the previous array is
# still referenced while this one is being created - confirm that the
# available memory can hold both, or skip this line.
large_np_array = np.random.randint(1, 100000, 30_0000_0000)

In [None]:
# Create a Dask array from the existing NumPy array.
large_dask_array = da.from_array(large_np_array)

In [None]:
%%timeit -r 1 -n 7
# Same timeit settings as the NumPy max cell. Both building the max() result
# and calling compute() on it are inside the timed body.
array_max = large_dask_array.max()
array_max.compute()

In [None]:
from dask.distributed import Client

# Create a Dask distributed client configured with 4 workers.
client = Client(n_workers=4)

In [None]:
# Display the client object in the notebook.
client

## 3.3 機械学習における配列

In [None]:
import numpy as np

# 4x4 array of random floats, the input for the tensor conversions below.
np_tensor = np.random.rand(4, 4)

In [None]:
# Display the 4x4 NumPy array.
np_tensor

##### メモ： 「`pip install tensorflow`」でインストール

In [None]:
import tensorflow as tf

# Convert the NumPy array into a TensorFlow tensor.
tf_tensor = tf.convert_to_tensor(np_tensor)

In [None]:
# Display the TensorFlow tensor.
tf_tensor

##### メモ： 「`pip install torch`」でインストール

In [None]:
import torch

# Create a PyTorch tensor from the same NumPy array.
pytorch_tensor = torch.from_numpy(np_tensor)

## 3.4 pandas

### 3.4.1 DataFrame の機能

In [None]:
import pandas as pd
# Series of four float values indexed by the year strings "2000"-"2003".
usa_data = pd.Series(
    data=[13.33, 14.02, 14.02, 14.25],
    index=["2000", "2001", "2002", "2003"],
)

In [None]:
# Display the Series with its year index.
usa_data

In [None]:
# Second Series, using the same year-string index as usa_data.
india_data = pd.Series(
    data=[9.02, 9.01, 8.84, 8.84],
    index=["2000", "2001", "2002", "2003"],
)
# Combine both Series into one DataFrame, one column per Series.
df = pd.DataFrame(data={"USA": usa_data, "India": india_data})

In [None]:
# Display the combined DataFrame.
df

### 3.4.2　DataFrameのパフォーマンス

In [None]:
%%timeit
# Column-wise division: the whole "India" column is divided in one expression.
# Every timed run (re)assigns the "India_fraction" column on df.
df["India_fraction"] = df["India"] / 100

In [None]:
%%timeit
# Same result computed by passing a lambda to apply() on the column.
df["India_fraction"] = df["India"].apply(lambda x: x / 100)

In [None]:
%%timeit
# Same result computed with a Python-level loop over the rows via iterrows().
df["India_fraction"] = [row['India'] / 100 for index, row in df.iterrows()]