In [39]:
import numpy as np
import pandas as pd
import random

# pandas dataframe 修改 index 和 columns

In [42]:
# 生成10名同学，5门功课的数据
score = np.random.randint(40, 100, (10, 5))
# 使用Pandas中的数据结构
score_df = pd.DataFrame(score)
# 构造行索引序列
subjects = ["语文", "数学", "英语", "政治", "体育"]

# 构造列索引序列
stu = ['同学' + str(i) for i in range(score_df.shape[0])]

# 添加行索引
data = pd.DataFrame(score, columns=subjects, index=stu)
data

Unnamed: 0,语文,数学,英语,政治,体育
同学0,55,52,55,81,57
同学1,57,61,73,46,41
同学2,61,43,88,81,54
同学3,91,63,48,47,98
同学4,61,82,68,72,98
同学5,71,83,81,95,99
同学6,97,79,57,64,87
同学7,65,76,62,65,52
同学8,51,51,70,52,43
同学9,95,86,95,95,83


# 重设索引
reset_index(drop=False)

drop:默认为False，不删除原来索引，如果为True,删除原来的索引值

In [44]:
# 重置索引,drop=False
data.reset_index(drop=False)

Unnamed: 0,index,语文,数学,英语,政治,体育
0,同学0,55,52,55,81,57
1,同学1,57,61,73,46,41
2,同学2,61,43,88,81,54
3,同学3,91,63,48,47,98
4,同学4,61,82,68,72,98
5,同学5,71,83,81,95,99
6,同学6,97,79,57,64,87
7,同学7,65,76,62,65,52
8,同学8,51,51,70,52,43
9,同学9,95,86,95,95,83


# 以某列值设置为新的索引
- set_index(keys, drop=True)
- keys : 列索引名成或者列索引名称的列表
- drop : boolean, default True.当做新的索引，删除原来的列

In [46]:
df = pd.DataFrame({'month': [1, 4, 7, 10],
                    'year': [2012, 2014, 2013, 2014],
                    'sale':[55, 40, 84, 31]})
df.set_index('month', drop=True)

Unnamed: 0_level_0,year,sale
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2012,55
4,2014,40
7,2013,84
10,2014,31


In [47]:
#设置多个索引，以年和月份
df = pd.DataFrame({'month': [1, 4, 7, 10],
                    'year': [2012, 2014, 2013, 2014],
                    'sale':[55, 40, 84, 31]})
df = df.set_index(['year', 'month'], drop=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2012,1,55
2014,4,40
2013,7,84
2014,10,31


In [34]:
df.index

MultiIndex(levels=[[2012, 2013, 2014], [1, 4, 7, 10]],
           labels=[[0, 2, 1, 2], [0, 1, 2, 3]],
           names=['year', 'month'])

# multiIndex的创建

In [48]:
arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))


MultiIndex(levels=[[1, 2], ['blue', 'red']],
           labels=[[0, 0, 1, 1], [1, 0, 1, 0]],
           names=['number', 'color'])

# panel的创建
class pandas.Panel(data=None, items=None, major_axis=None, minor_axis=None)

作用：存储3维数组的Panel结构

参数：

data : ndarray或者dataframe

items : 索引或类似数组的对象，axis=0

major_axis : 索引或类似数组的对象，axis=1

minor_axis : 索引或类似数组的对象，axis=2

In [49]:
p = pd.Panel(data=np.arange(24).reshape(4,3,2),
                 items=list('ABCD'),
                 major_axis=pd.date_range('20130101', periods=3),
                 minor_axis=['first', 'second'])
p[:,:,"first"]

Unnamed: 0,A,B,C,D
2013-01-01,0,6,12,18
2013-01-02,2,8,14,20
2013-01-03,4,10,16,22


In [50]:
data = pd.read_csv("./data/stock_day.csv")
# 删除一些列，让数据更简单些，再去做后面的操作
data = data.drop(["ma5","ma10","ma20","v_ma5","v_ma10","v_ma20"], axis=1)
data.head()

Unnamed: 0,open,high,close,low,volume,price_change,p_change,turnover
2018-02-27,23.53,25.88,24.16,23.53,95578.03,0.63,2.68,2.39
2018-02-26,22.8,23.78,23.53,22.8,60985.11,0.69,3.02,1.53
2018-02-23,22.88,23.37,22.82,22.71,52914.01,0.54,2.42,1.32
2018-02-22,22.25,22.76,22.28,22.02,36105.01,0.36,1.64,0.9
2018-02-14,21.49,21.99,21.92,21.48,23331.04,0.44,2.05,0.58


In [19]:
# 直接使用行列索引名字的方式（先列后行）
data['open']['2018-02-27']

23.53

# loc
使用loc:只能指定行列索引的名字

In [22]:
# 使用loc:只能指定行列索引的名字
data.loc['2018-02-27':'2018-02-22', ['open', 'low']]

Unnamed: 0,open,low
2018-02-27,23.53,23.53
2018-02-26,22.8,22.8
2018-02-23,22.88,22.71
2018-02-22,22.25,22.02


# iloc
使用iloc可以通过索引的下标去获取

In [36]:
# 使用iloc可以通过索引的下标去获取
# 获取前3天数据,前5列的结果
data.iloc[:3, :5]

Unnamed: 0,open,high,close,low,volume
2018-02-27,23.53,25.88,24.16,23.53,95578.03
2018-02-26,22.8,23.78,23.53,22.8,60985.11
2018-02-23,22.88,23.37,22.82,22.71,52914.01


# ix 组合索引
获取行第1天到第4天，['open', 'close', 'high', 'low']这个四个指标的结果




In [26]:
data.ix[0:4, ['open', 'close', 'high', 'low']]
# 推荐使用loc和iloc来获取的方式
data.loc[data.index[0:4], ['open', 'close', 'high', 'low']]
data.iloc[0:4, data.columns.get_indexer(['open', 'close', 'high', 'low'])]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,open,close,high,low
2018-02-27,23.53,24.16,25.88,23.53
2018-02-26,22.8,23.53,23.78,22.8
2018-02-23,22.88,22.82,23.37,22.71
2018-02-22,22.25,22.28,22.76,22.02


In [36]:
np.arange(24)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])