In [None]:
在速度較慢的時候，可以先從哪邊開始檢查？先檢查有沒有使⽤到⾃訂義的函數，如果有可以查詢pandas中是否有
⽀援相關演算法，如果以上都沒辦法改善，那可以從資料讀取⽅式下⼿，將要
讀取的資料改成pkl檔

資料過⼤時應採取什麼⽅式讓記憶體占⽤量下降？將欄位中的型態降級

In [5]:
import pandas as pd
import numpy as np 
import time
score_df = pd.DataFrame([[1,50,80,70,'boy',1], 
              [2,60,45,50,'boy',2],
              [3,98,43,55,'boy',1],
              [4,70,69,89,'boy',2],
              [5,56,79,60,'girl',1],
              [6,60,68,55,'girl',2],
              [7,45,70,77,'girl',1],
              [8,55,77,76,'girl',2],
              [9,25,57,60,'girl',1],
              [10,88,40,43,'girl',3],
              [11,25,60,45,'boy',3],
              [12,80,60,23,'boy',3],
              [13,20,90,66,'girl',3],
              [14,50,50,50,'girl',3],
              [15,89,67,77,'girl',3]],columns=['student_id','math_score','english_score','chinese_score','sex','class'])
score_df

Unnamed: 0,student_id,math_score,english_score,chinese_score,sex,class
0,1,50,80,70,boy,1
1,2,60,45,50,boy,2
2,3,98,43,55,boy,1
3,4,70,69,89,boy,2
4,5,56,79,60,girl,1
5,6,60,68,55,girl,2
6,7,45,70,77,girl,1
7,8,55,77,76,girl,2
8,9,25,57,60,girl,1
9,10,88,40,43,girl,3


In [10]:
star_time = time.time()
score_df.groupby('class').agg('mean')
end_time = time.time()
end_time - star_time

0.001993894577026367

In [13]:
##agg使用自定義函式
star_time = time.time()
score_df.groupby('class').agg(lambda x: x.mean())
end_time = time.time()
end_time - star_time


0.011968374252319336

In [14]:
#transform使用Python的內建函式
star_time = time.time()
score_df.groupby('class').transform('mean')
end_time = time.time()
end_time - star_time

0.027924537658691406

In [15]:
#篩選出對應資料
#用list方式搜索
score_df1 = score_df.copy()
star_time = time.time()
score_df1['Pass_math'] = [i>=60 for i in score_df1.math_score]
end_time = time.time()
end_time - star_time

0.0020051002502441406

In [17]:
#用DataFrame column方式搜索
score_df1 = score_df.copy()
star_time = time.time()
score_df1['Pass_math'] = score_df1.math_score>=60
end_time = time.time()
end_time - star_time

0.0009644031524658203

In [19]:
#用自定義式搜索
score_df2 = score_df.copy()
star_time = time.time()
score_df2['Pass_math'] = score_df2.math_score.apply(lambda x : x>=60)
end_time = time.time()
end_time - star_time

0.0009970664978027344

In [21]:
#用isin()
score_df3 = score_df.copy()
star_time = time.time()
score_df3['Pass_math'] = score_df3.math_score.isin(range(60, 100))
end_time = time.time()
end_time - star_time

0.0019617080688476562

In [24]:
#遇到大資料集時，常有記憶體不足的問題
#首先先生成大資料，因為改善部分不同所以分成浮點數float與整數int的資料集，可以看到不管浮點數還是整數都佔了800128bytes
float_data = pd.DataFrame(np.random.uniform(0,5,100000).reshape(1000,100))
int_data = pd.DataFrame(np.random.randint(0,1000,100000).reshape(1000,100))
int_data.memory_usage(deep=True).sum(), float_data.memory_usage(deep=True).sum()

(400128, 800128)

In [30]:
#整數型態int改成uint減少記憶體正用空間，使用前800128bytes，使用後剩下200128bytes
downcast_int = int_data.apply(pd.to_numeric,downcast='unsigned')
int_data.memory_usage(deep=True).sum(),downcast_int.memory_usage(deep=True).sum()

(400128, 200128)

In [32]:
#浮點數型態float64改成float32減少記憶體正用空間，使用前800128bytes，使用後剩下400128bytes
downcast_float = float_data.apply(pd.to_numeric,downcast='float')
float_data.memory_usage(deep=True).sum(),downcast_float.memory_usage(deep=True).sum()

(800128, 400128)

In [33]:
#原本有100個欄位是float64，經過downcast變成了100個欄位的float32
compare_int = pd.concat([float_data.dtypes,downcast_float.dtypes],axis=1)
compare_int.columns = ['before','after']
compare_int.apply(pd.value_counts)

Unnamed: 0,before,after
float32,,100.0
float64,100.0,
