# Pandas for Bitcoin (part 2)
DataFrame shift、 diff、pct_change函数  
如何删除列、 还有数据排序、合并DataFrame、删除重复数据、reset_index重置索引、set_index设置索引  
http://pandas.pydata.org/pandas-docs/stable/api.html

In [1]:
# import the dependencies
import pandas as pd
import os

def _four_decimal_places(value):
    """Render a float in fixed-point notation with four decimal places."""
    return '%.4f' % value


# Keep wide frames on a single line instead of wrapping the columns.
pd.options.display.expand_frame_repr = False
# pd.set_option('display.max_rows', 1000)  # maximum number of rows to display
# pd.set_option('precision', 6)  # floating-point precision
# Show floats without scientific notation, rounded to four decimal places.
pd.options.display.float_format = _four_decimal_places

In [2]:
# Load one raw 1-minute kline file; the file name is its epoch-millisecond stamp.
sample_csv = './asset/1560038820000.csv'
df = pd.read_csv(sample_csv)
df.head()

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_volume,trades,taker_base_volue,taker_quote_volume,ignore
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324,1559978879999,95142.1412,165,3.7865,29943.6224,0
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694,1559978939999,78841.6356,122,5.6658,44810.5527,0
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231,1559978999999,164734.7344,194,14.6195,115657.9911,0
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264,1559979059999,56411.7199,130,4.1728,33031.4692,0
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748,1559979119999,82919.0848,113,4.8062,38046.2986,0


In [3]:
# Keep only the timestamp, the OHLC prices and the volume.
kline_columns = ['open_time', 'open', 'high', 'low', 'close', 'volume']
df = df[kline_columns]
df.head()

Unnamed: 0,open_time,open,high,low,close,volume
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748


In [4]:
# Shift the close column so neighbouring rows' closes sit on the same row.
close_prices = df['close']
df['next_close'] = close_prices.shift(periods=-1)  # close of the following row
df['last_close'] = close_prices.shift(periods=1)  # close of the preceding row
df[['close', 'next_close', 'last_close']].head()

Unnamed: 0,close,next_close,last_close
0,7906.25,7910.13,
1,7910.13,7914.33,7906.25
2,7914.33,7917.43,7910.13
3,7917.43,7914.09,7914.33
4,7914.09,7919.0,7917.43


In [5]:
# Removing a column, approach 1: the `del` statement drops it from df in place.
obsolete_column = 'last_close'
del df[obsolete_column]
df.head()

Unnamed: 0,open_time,open,high,low,close,volume,next_close
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324,7910.13
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694,7914.33
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231,7917.43
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264,7914.09
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748,7919.0


In [6]:
# Removing a column, approach 2: `drop` returns a new frame and leaves df untouched.
new_df = df.drop(columns='next_close')
new_df.head()

Unnamed: 0,open_time,open,high,low,close,volume
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748


## calculate price fluctuation
Using `diff` is a simpler way to calculate the price change between consecutive rows.

In [7]:
# Absolute change of the close versus the previous row (first row is NaN).
close_change = df['close'].diff(periods=1)
df['fluc'] = close_change
df.head()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324,7910.13,
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694,7914.33,3.88
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231,7917.43,4.2
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264,7914.09,3.1
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748,7919.0,-3.34


In [8]:
# Relative change of the close versus the previous row, as a fraction (not x100).
close_return = df['close'].pct_change(periods=1)
df['fluc%'] = close_return
df.head()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc,fluc%
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324,7910.13,,
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694,7914.33,3.88,0.0005
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231,7917.43,4.2,0.0005
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264,7914.09,3.1,0.0004
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748,7919.0,-3.34,-0.0004


In [9]:
# Running total of the row-to-row changes, i.e. the net move since the first row.
running_change = df['fluc'].cumsum()
df['final_fluc'] = running_change
df.head()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc,fluc%,final_fluc
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324,7910.13,,,
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694,7914.33,3.88,0.0005,3.88
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231,7917.43,4.2,0.0005,8.08
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264,7914.09,3.1,0.0004,11.18
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748,7919.0,-3.34,-0.0004,7.84


## sorting

In [10]:
# Interpret open_time as milliseconds since the Unix epoch and store it as datetimes.
open_timestamps = pd.to_datetime(df['open_time'], unit='ms')
df['utc'] = open_timestamps
df.head()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc,fluc%,final_fluc,utc
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324,7910.13,,,,2019-06-08 07:27:00
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694,7914.33,3.88,0.0005,3.88,2019-06-08 07:28:00
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231,7917.43,4.2,0.0005,8.08,2019-06-08 07:29:00
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264,7914.09,3.1,0.0004,11.18,2019-06-08 07:30:00
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748,7919.0,-3.34,-0.0004,7.84,2019-06-08 07:31:00


In [11]:
# Order rows by index label, largest first, and preview the top of the result.
newest_first = df.sort_index(ascending=False)
newest_first.head()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc,fluc%,final_fluc,utc
999,1560038760000,7915.71,7917.87,7913.18,7915.18,3.0263,,0.09,0.0,8.93,2019-06-09 00:06:00
998,1560038700000,7916.98,7919.78,7913.18,7915.09,21.7319,7915.18,-0.95,-0.0001,8.84,2019-06-09 00:05:00
997,1560038640000,7916.93,7918.35,7913.69,7916.04,11.3783,7915.09,-2.26,-0.0003,9.79,2019-06-09 00:04:00
996,1560038580000,7915.74,7922.0,7914.79,7918.3,26.8552,7916.04,2.73,0.0003,12.05,2019-06-09 00:03:00
995,1560038520000,7912.24,7919.0,7910.03,7915.57,14.191,7918.3,4.4,0.0006,9.32,2019-06-09 00:02:00


In [12]:
# Order rows by the 'fluc' column, largest rise first, and preview the top rows.
largest_moves = df.sort_values(by='fluc', ascending=False)
largest_moves.head()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc,fluc%,final_fluc,utc
902,1560032940000,7889.54,7914.39,7888.34,7910.33,57.3234,7906.59,22.14,0.0028,4.08,2019-06-08 22:29:00
487,1560008040000,7852.36,7873.67,7825.0,7873.67,355.3471,7865.62,21.3,0.0027,-32.58,2019-06-08 15:34:00
612,1560015540000,7785.5,7799.52,7782.16,7798.84,29.9038,7814.99,20.43,0.0026,-107.41,2019-06-08 17:39:00
506,1560009180000,7878.29,7900.88,7877.47,7898.84,22.2818,7896.54,20.38,0.0026,-7.41,2019-06-08 15:53:00
366,1560000780000,7922.46,7952.49,7922.46,7942.29,46.6482,7949.33,19.99,0.0025,36.04,2019-06-08 13:33:00


## dataframe append


In [13]:
# Stack df on top of itself, keeping each row's original index label (so every
# label appears twice). `DataFrame.append` was deprecated in pandas 1.4 and
# removed in 2.0; `pd.concat` is the equivalent that works on every version.
df_total = pd.concat([df, df])
df_total.tail()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc,fluc%,final_fluc,utc
995,1560038520000,7912.24,7919.0,7910.03,7915.57,14.191,7918.3,4.4,0.0006,9.32,2019-06-09 00:02:00
996,1560038580000,7915.74,7922.0,7914.79,7918.3,26.8552,7916.04,2.73,0.0003,12.05,2019-06-09 00:03:00
997,1560038640000,7916.93,7918.35,7913.69,7916.04,11.3783,7915.09,-2.26,-0.0003,9.79,2019-06-09 00:04:00
998,1560038700000,7916.98,7919.78,7913.18,7915.09,21.7319,7915.18,-0.95,-0.0001,8.84,2019-06-09 00:05:00
999,1560038760000,7915.71,7917.87,7913.18,7915.18,3.0263,,0.09,0.0,8.93,2019-06-09 00:06:00


In [14]:
# Stack df on top of itself and renumber the rows 0..n-1 instead of keeping the
# duplicated labels. `DataFrame.append` was deprecated in pandas 1.4 and removed
# in 2.0; `pd.concat` is the equivalent that works on every version.
df_total = pd.concat([df, df], ignore_index=True)
df_total.tail()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc,fluc%,final_fluc,utc
1995,1560038520000,7912.24,7919.0,7910.03,7915.57,14.191,7918.3,4.4,0.0006,9.32,2019-06-09 00:02:00
1996,1560038580000,7915.74,7922.0,7914.79,7918.3,26.8552,7916.04,2.73,0.0003,12.05,2019-06-09 00:03:00
1997,1560038640000,7916.93,7918.35,7913.69,7916.04,11.3783,7915.09,-2.26,-0.0003,9.79,2019-06-09 00:04:00
1998,1560038700000,7916.98,7919.78,7913.18,7915.09,21.7319,7915.18,-0.95,-0.0001,8.84,2019-06-09 00:05:00
1999,1560038760000,7915.71,7917.87,7913.18,7915.18,3.0263,,0.09,0.0,8.93,2019-06-09 00:06:00


In [15]:
# Remove duplicated rows.
# subset: the columns compared to decide whether two rows are duplicates;
#         when omitted, every column is compared.
# keep:   'first' keeps the earliest occurrence, 'last' keeps the latest one,
#         False discards every row that has a duplicate.
deduplicated = df_total.drop_duplicates(subset=['open_time'], keep='first')
deduplicated.tail()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc,fluc%,final_fluc,utc
995,1560038520000,7912.24,7919.0,7910.03,7915.57,14.191,7918.3,4.4,0.0006,9.32,2019-06-09 00:02:00
996,1560038580000,7915.74,7922.0,7914.79,7918.3,26.8552,7916.04,2.73,0.0003,12.05,2019-06-09 00:03:00
997,1560038640000,7916.93,7918.35,7913.69,7916.04,11.3783,7915.09,-2.26,-0.0003,9.79,2019-06-09 00:04:00
998,1560038700000,7916.98,7919.78,7913.18,7915.09,21.7319,7915.18,-0.95,-0.0001,8.84,2019-06-09 00:05:00
999,1560038760000,7915.71,7917.87,7913.18,7915.18,3.0263,,0.09,0.0,8.93,2019-06-09 00:06:00


## miscellaneous

In [16]:
# Promote an existing column to be the index; it stops being a regular column.
index_column = 'open_time'
df = df.set_index(index_column)
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,next_close,fluc,fluc%,final_fluc,utc
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324,7910.13,,,,2019-06-08 07:27:00
1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694,7914.33,3.88,0.0005,3.88,2019-06-08 07:28:00
1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231,7917.43,4.2,0.0005,8.08,2019-06-08 07:29:00
1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264,7914.09,3.1,0.0004,11.18,2019-06-08 07:30:00
1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748,7919.0,-3.34,-0.0004,7.84,2019-06-08 07:31:00


In [17]:
# Go back to a 0..n-1 index. With the default drop=False the old index is kept
# as a regular column; pass drop=True to discard it instead.
df = df.reset_index()
df.head()

Unnamed: 0,open_time,open,high,low,close,volume,next_close,fluc,fluc%,final_fluc,utc
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324,7910.13,,,,2019-06-08 07:27:00
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694,7914.33,3.88,0.0005,3.88,2019-06-08 07:28:00
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231,7917.43,4.2,0.0005,8.08,2019-06-08 07:29:00
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264,7914.09,3.1,0.0004,11.18,2019-06-08 07:30:00
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748,7919.0,-3.34,-0.0004,7.84,2019-06-08 07:31:00


In [18]:
# Rename selected columns; columns not listed in the mapping keep their names.
column_aliases = {'close': 'CLOSE', 'open': 'OPEN'}
df_new = df.rename(columns=column_aliases)
df_new.head()

Unnamed: 0,open_time,OPEN,high,low,CLOSE,volume,next_close,fluc,fluc%,final_fluc,utc
0,1559978820000,7907.46,7910.61,7905.03,7906.25,12.0324,7910.13,,,,2019-06-08 07:27:00
1,1559978880000,7906.42,7912.09,7905.1,7910.13,9.9694,7914.33,3.88,0.0005,3.88,2019-06-08 07:28:00
2,1559978940000,7908.07,7917.71,7907.0,7914.33,20.8231,7917.43,4.2,0.0005,8.08,2019-06-08 07:29:00
3,1559979000000,7914.99,7918.59,7912.92,7917.43,7.1264,7914.09,3.1,0.0004,11.18,2019-06-08 07:30:00
4,1559979060000,7916.52,7918.7,7913.16,7914.09,10.4748,7919.0,-3.34,-0.0004,7.84,2019-06-08 07:31:00


## batch csv process 
automatically read csv files in the directory, then concatenate them

In [19]:
# Recursively collect the path of every .csv file found under ./asset.
# Directories without files simply contribute nothing to the list.
csv_file_paths = [
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('./asset')
    for file_name in file_names
    if file_name.endswith('.csv')
]
csv_file_paths

['./asset\\1560038820000.csv',
 './asset\\1560098820000.csv',
 './asset\\1560158820000.csv',
 './asset\\1560218820000.csv',
 './asset\\binance_btc_1min.csv']

In [20]:
# The merged result is saved into the same directory that was scanned for
# inputs, so it has to be skipped here. Both sides are normalised before the
# comparison so the check works with either path separator ('/' or '\\').
merged_csv_path = os.path.normpath('./asset/binance_btc_1min.csv')

raw_frames = []
for file in csv_file_paths:
    if os.path.normpath(file) == merged_csv_path:
        continue
    print(file)
    # Load one raw kline file.
    raw_frames.append(pd.read_csv(file))

# Concatenate once at the end: `DataFrame.append` was removed in pandas 2.0, and
# growing a frame inside the loop re-copies all accumulated rows on every pass.
all_df = pd.concat(raw_frames, ignore_index=True) if raw_frames else pd.DataFrame()
all_df.describe()

./asset\1560038820000.csv
./asset\1560098820000.csv
./asset\1560158820000.csv
./asset\1560218820000.csv


Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_volume,trades,taker_base_volue,taker_quote_volume,ignore
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,1560098790000.0,7816.3927,7820.8918,7811.433,7816.3994,21.6195,1560098849999.0,168600.1369,226.271,11.4429,89311.3584,0.0
std,69290692.0156,120.724,120.0813,121.3815,120.7605,30.1904,69290692.0156,235745.1291,191.0118,17.6675,138594.3099,0.0
min,1559978820000.0,7515.15,7527.79,7506.66,7515.15,1.7626,1559978879999.0,13883.2888,50.0,0.7498,5890.1068,0.0
25%,1560038805000.0,7700.5775,7705.0,7697.0725,7700.6675,8.462,1560038864999.0,66258.188,126.0,4.1544,32682.4517,0.0
50%,1560098790000.0,7865.765,7869.435,7862.065,7866.05,13.9645,1560098849999.0,109154.0611,176.0,7.0706,55008.1978,0.0
75%,1560158775000.0,7913.7825,7918.32,7909.795,7913.9,23.9906,1560158834999.0,186365.8039,263.0,12.6363,97803.2136,0.0
max,1560218760000.0,8016.38,8020.0,7997.69,8016.37,634.6248,1560218819999.0,5077170.9983,3031.0,424.8686,3399184.5352,0.0


In [21]:
# Drop rows whose open_time was already seen, keeping the first occurrence.
all_df = all_df.drop_duplicates(subset=['open_time'], keep='first')
all_df.describe()

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_volume,trades,taker_base_volue,taker_quote_volume,ignore
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,1560098790000.0,7816.3927,7820.8918,7811.433,7816.3994,21.6195,1560098849999.0,168600.1369,226.271,11.4429,89311.3584,0.0
std,69290692.0156,120.724,120.0813,121.3815,120.7605,30.1904,69290692.0156,235745.1291,191.0118,17.6675,138594.3099,0.0
min,1559978820000.0,7515.15,7527.79,7506.66,7515.15,1.7626,1559978879999.0,13883.2888,50.0,0.7498,5890.1068,0.0
25%,1560038805000.0,7700.5775,7705.0,7697.0725,7700.6675,8.462,1560038864999.0,66258.188,126.0,4.1544,32682.4517,0.0
50%,1560098790000.0,7865.765,7869.435,7862.065,7866.05,13.9645,1560098849999.0,109154.0611,176.0,7.0706,55008.1978,0.0
75%,1560158775000.0,7913.7825,7918.32,7909.795,7913.9,23.9906,1560158834999.0,186365.8039,263.0,12.6363,97803.2136,0.0
max,1560218760000.0,8016.38,8020.0,7997.69,8016.37,634.6248,1560218819999.0,5077170.9983,3031.0,424.8686,3399184.5352,0.0


In [22]:
# Put the merged rows in chronological order (oldest open_time first).
all_df = all_df.sort_values(by='open_time', ascending=True)

In [23]:
# Convert the epoch-millisecond open_time values into datetimes.
all_df['open_time'] = pd.to_datetime(all_df['open_time'], unit='ms')
# Keep the timestamp plus OHLCV columns and index the frame by timestamp.
ohlcv_columns = ['open_time', 'open', 'high', 'low', 'close', 'volume']
all_df = all_df[ohlcv_columns].set_index('open_time')
# Persist the merged data set; the index is written as the first CSV column.
all_df.to_csv('./asset/binance_btc_1min.csv')

## time sampling 
filter the data according to the time stamp

In [24]:
# Reload the merged 1-minute data set from disk.
merged_csv = './asset/binance_btc_1min.csv'
df = pd.read_csv(merged_csv)
df.head()

Unnamed: 0,open_time,open,high,low,close,volume
0,2019-06-07 14:47:00,7928.75,7932.03,7925.59,7925.6,14.0975
1,2019-06-07 14:48:00,7926.16,7926.66,7912.14,7915.22,30.8772
2,2019-06-07 14:49:00,7915.22,7921.15,7911.0,7919.45,27.407
3,2019-06-07 14:50:00,7918.45,7919.12,7903.76,7910.96,22.0303
4,2019-06-07 14:51:00,7910.96,7915.52,7907.11,7915.52,17.3316


In [25]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7939 entries, 0 to 7938
Data columns (total 6 columns):
open_time    7939 non-null object
open         7939 non-null float64
high         7939 non-null float64
low          7939 non-null float64
close        7939 non-null float64
volume       7939 non-null float64
dtypes: float64(5), object(1)
memory usage: 372.2+ KB


In [26]:
# Parse the open_time strings into datetime values so they can be compared and resampled.
parsed_open_time = pd.to_datetime(df['open_time'])
df['open_time'] = parsed_open_time
df.head()

Unnamed: 0,open_time,open,high,low,close,volume
0,2019-06-07 14:47:00,7928.75,7932.03,7925.59,7925.6,14.0975
1,2019-06-07 14:48:00,7926.16,7926.66,7912.14,7915.22,30.8772
2,2019-06-07 14:49:00,7915.22,7921.15,7911.0,7919.45,27.407
3,2019-06-07 14:50:00,7918.45,7919.12,7903.76,7910.96,22.0303
4,2019-06-07 14:51:00,7910.96,7915.52,7907.11,7915.52,17.3316


In [27]:
# Print the frame summary again to check the dtype now reported for open_time.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7939 entries, 0 to 7938
Data columns (total 6 columns):
open_time    7939 non-null datetime64[ns]
open         7939 non-null float64
high         7939 non-null float64
low          7939 non-null float64
close        7939 non-null float64
volume       7939 non-null float64
dtypes: datetime64[ns](1), float64(5)
memory usage: 372.2 KB


In [28]:
# Keep only rows at or after the cut-off time. The comparison produces a boolean
# Series (a mask); indexing df with it selects the rows where the mask is True.
cutoff_time = pd.to_datetime('2019-06-07 15:00:00')
at_or_after_cutoff = df['open_time'] >= cutoff_time
df = df[at_or_after_cutoff]
df.head()

Unnamed: 0,open_time,open,high,low,close,volume
13,2019-06-07 15:00:00,7916.0,7924.6,7916.0,7920.96,22.1088
14,2019-06-07 15:01:00,7920.96,7924.7,7920.96,7921.07,14.5387
15,2019-06-07 15:02:00,7921.07,7930.0,7921.07,7926.22,12.5767
16,2019-06-07 15:03:00,7927.44,7928.93,7924.15,7926.38,9.0886
17,2019-06-07 15:04:00,7924.68,7930.9,7924.07,7930.9,9.6222


### resample method1


In [29]:
# Index the frame by open_time; plain `resample` calls operate on the index.
df = df.set_index('open_time')
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-06-07 15:00:00,7916.0,7924.6,7916.0,7920.96,22.1088
2019-06-07 15:01:00,7920.96,7924.7,7920.96,7921.07,14.5387
2019-06-07 15:02:00,7921.07,7930.0,7921.07,7926.22,12.5767
2019-06-07 15:03:00,7927.44,7928.93,7924.15,7926.38,9.0886
2019-06-07 15:04:00,7924.68,7930.9,7924.07,7930.9,9.6222


In [30]:
# Period conversion, approach 1: resample every column on its own.
# rule_cycle is the length of the target bar, e.g. '5min', '15min' or '1D' (one day).
rule_cycle = '1D'
# (column, aggregation) pairs, listed in the column order of the result.
column_rules = (
    ('close', 'last'),   # close  = last value inside the period
    ('open', 'first'),   # open   = first value inside the period
    ('high', 'max'),     # high   = highest value inside the period
    ('low', 'min'),      # low    = lowest value inside the period
    ('volume', 'sum'),   # volume = total traded inside the period
)
cycle_df = pd.DataFrame()
for column_name, how in column_rules:
    cycle_df[column_name] = df[column_name].resample(rule=rule_cycle).agg(how)
cycle_df

Unnamed: 0_level_0,close,open,high,low,volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-06-07,7980.53,7916.0,8100.0,7890.0,11512.6344
2019-06-08,7893.62,7978.94,8044.65,7751.0,22657.3296
2019-06-09,7628.13,7895.28,7935.0,7506.66,31568.4652
2019-06-10,7982.75,7627.57,8020.0,7511.0,36756.0785
2019-06-11,7884.9,7981.0,8010.0,7692.23,30334.9994
2019-06-12,8127.64,7884.9,8200.0,7788.99,41597.0826
2019-06-13,8082.76,8127.64,8147.26,8026.0,3275.2374


### resample method2

In [31]:
# Period conversion, approach 2: resample the whole DataFrame in one call.
# df must already be indexed by open_time (see the set_index call hinted below).
# '5min' replaces the alias '5T', which pandas 2.2 deprecated; '5min' is
# accepted by old and new pandas versions alike.
rule_cycle = '5min'
# df.set_index('open_time', inplace=True)
# How each column is collapsed into one value per 5-minute bar.
ohlcv_aggregation = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
}
cycle_df_min = df.resample(rule=rule_cycle).agg(ohlcv_aggregation)
cycle_df_min.head()

Unnamed: 0_level_0,open,high,low,close,volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-06-07 15:00:00,7916.0,7930.9,7916.0,7930.9,67.9351
2019-06-07 15:05:00,7929.68,7935.5,7925.0,7926.24,60.7766
2019-06-07 15:10:00,7927.82,7929.75,7908.23,7919.79,101.2947
2019-06-07 15:15:00,7919.7,7925.0,7918.18,7925.0,71.815
2019-06-07 15:20:00,7924.98,7924.98,7915.0,7919.41,92.0102


### resample method3

In [32]:
# Period conversion, approach 3: resample on a regular column via `on=`.
# '5min' replaces the alias '5T', which pandas 2.2 deprecated; '5min' is
# accepted by old and new pandas versions alike.
rule_cycle = '5min'
# Move open_time out of the index and back into a regular column so that it can
# be passed through the `on` argument.
df = df.reset_index(drop=False)
# How each column is collapsed into one value per 5-minute bar.
ohlcv_aggregation = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
}
cycle_df1 = df.resample(rule=rule_cycle, on='open_time').agg(ohlcv_aggregation)

# cycle_df1 = cycle_df1[['open', 'high', 'low', 'close', 'volume']]

# Discard periods that contain no source rows at all (their open is NaN).
cycle_df1 = cycle_df1.dropna(subset=['open'])
# Discard periods in which nothing was traded.
has_volume = cycle_df1['volume'] > 0
cycle_df1 = cycle_df1[has_volume]
cycle_df1.head()

Unnamed: 0_level_0,open,high,low,close,volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-06-07 15:00:00,7916.0,7930.9,7916.0,7930.9,67.9351
2019-06-07 15:05:00,7929.68,7935.5,7925.0,7926.24,60.7766
2019-06-07 15:10:00,7927.82,7929.75,7908.23,7919.79,101.2947
2019-06-07 15:15:00,7919.7,7925.0,7918.18,7925.0,71.815
2019-06-07 15:20:00,7924.98,7924.98,7915.0,7919.41,92.0102


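The list of frequency aliases (offset strings) accepted by `resample`.
Note: pandas 2.2 deprecated several of these spellings in favour of new ones (for example `T` → `min`, `H` → `h`, `S` → `s`, `L` → `ms`, `M` → `ME`, `Q` → `QE`, `A` → `YE`), so check the alias table for the pandas version you are running.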
'''
    B       business day frequency
    C       custom business day frequency (experimental)
    D       calendar day frequency
    W       weekly frequency
    M       month end frequency
    SM      semi-month end frequency (15th and end of month)
    BM      business month end frequency
    CBM     custom business month end frequency
    MS      month start frequency
    SMS     semi-month start frequency (1st and 15th)
    BMS     business month start frequency
    CBMS    custom business month start frequency
    Q       quarter end frequency
    BQ      business quarter end frequency
    QS      quarter start frequency
    BQS     business quarter start frequency
    A       year end frequency
    BA      business year end frequency
    AS      year start frequency
    BAS     business year start frequency
    BH      business hour frequency
    H       hourly frequency
    T       minutely frequency
    S       secondly frequency
    L       milliseconds
    U       microseconds
    N       nanoseconds
'''