In [2]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np

# Pandas中的数据类型：Series 和 DataFrame

## Series：竖起来的List
默认创建索引为0到N-1的整数型索引

可以通过values和index属性获取其数组表示形式和索引对象

可以将Dict转化为Series对象 

In [2]:
D={"python":8000,"c++":8100,"php":8200}
Series(D)

c++       8100
php       8200
python    8000
dtype: int64

isnull()和notnull()可用于检测缺失数据

运算中自动对齐不同索引的数据（广播）

Series对象本身及其索引都有一个name属性

## DataFrame: 表格型的数据结构
可以通过values,index和columns属性获取其数组表示形式和索引对象(index对象不可修改) 

In [4]:
data = {"state":["Ohio","Ohio","Ohio","Nevada","Nevada"],
        "year":[2000,2001,2002,2001,2002],
        "pop":[1.5,1.7,3.6,2.4,2.9]}
DataFrame(data,columns=["year","state","pop","debt"],index=["one","two","three","four","five"])

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


将列表或数组赋值给某列时，长度必须与DataFrame的长度相匹配

如果赋值的是一个Series，就会精确匹配DataFrame的索引，空位填充NA

# Series和DataFrame的基本功能 

## 索引修改 

### 调整现有索引：reindex（Series） 

In [5]:
obj = Series([4.5,7.2,-5.3,3.6],index=["d","b","a","c"])
obj.reindex(["a","b","c","d","e"],fill_value=0) 

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [6]:
#插值：ffill或pad（前向填充）；bfill（后向填充）
obj = Series(["blue","purple","yellow"],index=[0,2,4])
obj.reindex(range(6),method="ffill")    

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

### 将现有列设为索引：set_index （DataFrame）
set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False)
> drop：删除原index对应的列

In [7]:
frame = DataFrame(np.arange(8).reshape(2,4),index=["three","one"],columns=list("dabc"))
frame.set_index("a")

Unnamed: 0_level_0,d,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,2,3
5,4,6,7


### 重置索引：reset_index (DataFrame)

In [5]:
frame = DataFrame(np.arange(8).reshape(2,4),columns=list("dabc"))
frame.reset_index()

Unnamed: 0,index,d,a,b,c
0,0,0,1,2,3
1,1,4,5,6,7


### 修改列名和索引：rename 
> df.rename(index={1:”A”,2:”B”},columns={“区域”:”新区域”})

### 带有重复值的轴索引：index.is_unique

In [82]:
obj = Series(range(5),index=["a","a","b","b","c"])
obj.index.is_unique

False

## 轴修改 

### 丢弃指定轴：drop 

data.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')

>若要在数据源上修改，需添加inplace=True

In [5]:
data = DataFrame(np.arange(16).reshape((4,4)),
                index=["Ohio","Colorado","Utah","New York"],
                columns=["one","two","three","four"])
data.drop(["Colorado","Ohio"])
#或者data.drop(index=data.index[:2])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [11]:
data.drop(["two","three"],axis=1)
#或者data.drop(data.columns[1:3],axis=1)
#或者data.drop(columns=data.columns[1:3])

Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11
New York,12,15


### 增加指定轴上的内容 
>添加列：直接创建或insert或者assign

>添加行：先切割再append或concat

In [25]:
#np.where
data = DataFrame(np.arange(16).reshape((4,4)),
                index=["Ohio","Colorado","Utah","New York"],
                columns=["one","two","three","four"])
data["judge"]=np.where(data["four"]>10,"positive","negative")
data

Unnamed: 0,one,two,three,four,judge
Ohio,0,1,2,3,negative
Colorado,4,5,6,7,negative
Utah,8,9,10,11,positive
New York,12,13,14,15,positive


insert(loc, column, value, allow_duplicates=False) 

In [31]:
#insert
data = DataFrame(np.arange(16).reshape((4,4)),
                index=["Ohio","Colorado","Utah","New York"],
                columns=["one","two","three","four"])
data.insert(4,"five",data["four"])
data

Unnamed: 0,one,two,three,four,five
Ohio,0,1,2,3,3
Colorado,4,5,6,7,7
Utah,8,9,10,11,11
New York,12,13,14,15,15


In [32]:
#assign
data = DataFrame(np.arange(16).reshape((4,4)),
                index=["Ohio","Colorado","Utah","New York"],
                columns=["one","two","three","four"])
data = data.assign(accum=lambda x:x["one"]+x["two"]+x["three"]+x["four"])
data

Unnamed: 0,one,two,three,four,accum
Ohio,0,1,2,3,6
Colorado,4,5,6,7,22
Utah,8,9,10,11,38
New York,12,13,14,15,54


In [35]:
#append
data = DataFrame(np.arange(16).reshape((4,4)),
                index=["Ohio","Colorado","Utah","New York"],
                columns=["one","two","three","four"])
insertRow=DataFrame([["a","b","c","d"]],columns=["one","two","three","four"])
newdata=data[:2].append(insertRow,ignore_index=True).append(data[2:],ignore_index=True)  #ignore_index=True：忽略原索引，重新排列
newdata

Unnamed: 0,one,two,three,four
0,0,1,2,3
1,4,5,6,7
2,a,b,c,d
3,8,9,10,11
4,12,13,14,15


In [36]:
#concat
pd.concat([data[:2],insertRow,data[2:]],ignore_index=True)

Unnamed: 0,one,two,three,four
0,0,1,2,3
1,4,5,6,7
2,a,b,c,d
3,8,9,10,11
4,12,13,14,15


## 选取和过滤 

### 对Series
利用标签的切片运算末端封闭，和普通python切片不同（仅对Series成立）

In [37]:
obj = Series(np.arange(4),index=["a","b","c","d"])
obj[obj<2]

a    0
b    1
dtype: int32

In [38]:
obj["b":"c"]

b    1
c    2
dtype: int32

### 对DataFrame 

In [39]:
#取列：无法进行切片运算
data = DataFrame(np.arange(16).reshape((4,4)), 
                index=["Ohio","Colorado","Utah","New York"], 
                columns=["one","two","three","four"])
data[["two","three"]]

Unnamed: 0,two,three
Ohio,1,2
Colorado,5,6
Utah,9,10
New York,13,14


In [40]:
#取行：切片或布尔型数组
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [41]:
data[data["three"]>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#### 使用ix（被loc替代，但ix使用更随心所欲）

In [6]:
data.ix[2]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [59]:
data.ix[["Colorado","Utah"],[3,0,1]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [57]:
#data.ix[:"Utah","two"]
data.loc[:"Utah","two"]

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

#### iloc 

In [45]:
data.iloc[1:3,2:4]

Unnamed: 0,three,four
Colorado,6,7
Utah,10,11


## 层次化索引 
>if-then

>if-then-else

>And(&)
>>[(condition1) & (condition2)]

>Or(|)
>>[(condition1) | (condition2)]

In [67]:
df = pd.DataFrame(np.arange(12).reshape(4,3),columns=["AAA","BBB","CCC"])
df.loc[df.AAA >= 5,['BBB','CCC']] = 555
df

Unnamed: 0,AAA,BBB,CCC
0,0,1,2
1,3,4,5
2,6,555,555
3,9,555,555


In [68]:
df = pd.DataFrame(np.arange(12).reshape(4,3),columns=["AAA","BBB","CCC"])
df_mask = pd.DataFrame({'AAA' : [True] * 4, 'BBB' : [False] * 4,'CCC' : [True,False] * 2})
df.where(df_mask,-1000)

Unnamed: 0,AAA,BBB,CCC
0,0,-1000,2
1,3,-1000,-1000
2,6,-1000,8
3,9,-1000,-1000


In [70]:
df = pd.DataFrame(np.arange(12).reshape(4,3),columns=["AAA","BBB","CCC"])
df['logic'] = np.where(df['AAA'] > 5,'high','low')
df

Unnamed: 0,AAA,BBB,CCC,logic
0,0,1,2,low
1,3,4,5,low
2,6,7,8,high
3,9,10,11,high


In [76]:
df = pd.DataFrame(np.arange(12).reshape(4,3),columns=["AAA","BBB","CCC"])
df.loc[(df['BBB'] < 5) & (df['CCC'] >= 4), 'AAA'] = 0.1
df

Unnamed: 0,AAA,BBB,CCC
0,0.0,1,2
1,0.1,4,5
2,6.0,7,8
3,9.0,10,11


In [81]:
#依据条件进行筛选
df = pd.DataFrame(np.arange(12).reshape(4,3),columns=["AAA","BBB","CCC"])
df.loc[(df['BBB'] < 5) & (df['CCC'] >= 4)]

Unnamed: 0,AAA,BBB,CCC
1,3,4,5


In [77]:
df = pd.DataFrame(np.arange(12).reshape(4,3),columns=["AAA","BBB","CCC"])
df.loc[(df['BBB'] > 5) | (df['CCC'] >= 7), 'AAA'] = 0.1
df

Unnamed: 0,AAA,BBB,CCC
0,0.0,1,2
1,3.0,4,5
2,0.1,7,8
3,0.1,10,11


## 算数运算，数据对齐及填充

In [83]:
df1 = DataFrame(np.arange(20).reshape(4,5),columns=list("abcde"))
df2 = DataFrame(np.arange(12).reshape(3,4),columns=list("abcd"))
#相加时，没有重叠的位置会产生NA值
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [84]:
#利用add方法，传入df2以及一个fill_value参数
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


add sub div mul
>在列上进行广播

## 排序和排名 

### 对行或者索引进行排序
>sort_index(axis=0, ascending=True,inplace=False) 

In [3]:
frame = DataFrame(np.arange(8).reshape(2,4),index=["three","one"],
                  columns=list("dabc"))
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [4]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


### 按值进行排序
>sort_values(by, axis=0, ascending=True, inplace=False)   

In [5]:
frame.sort_values(by=["a","b"])

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


### 排名
>rank(axis=0,method=”average”,ascending=True)

>method : {'average', 'min', 'max', 'first', 'dense'}
>>average: 相等分组中为各个值平均分配排名       
>>min: 整个分组中最小排名       
>>max: 整个分组中最大排名        
>>first: 按值在原始数据中出现顺序分配排名        
>>dense: like 'min', but rank always increases by 1 between groups

In [6]:
obj=Series([-7,4,3,4,-2])
obj.rank()

0    1.0
1    4.5
2    3.0
3    4.5
4    2.0
dtype: float64

In [7]:
obj.rank(method="first")

0    1.0
1    4.0
2    3.0
3    5.0
4    2.0
dtype: float64

In [8]:
frame=DataFrame({"b":[4.3,7,-3,2],"a":[0,1,0,1],"c":[-2,5,8,-2.5]})
frame.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


## 汇总和计算统计描述 

>count()：非NA值的数量

>min max

>argmin argmax：获取最大值或最小值的索引位置

>idxmin idxmax：获取最大值或最小值的索引值

>sum(axis=0)：使用numeric_only=True跳过非数值

>mean(axis=0,skipna=True)

>var std

>cumsum cumin cummax cumprod

>nlargest：获取最大的数行    df.nlargest(5,”人员数量”)

>diff：计算一阶差分，对时间序列很有用

>describe()：
>>对数值型数据：count mean min std max和等分位数

>>对非数值型数据：count unique top freq

## 唯一值、值计数及成员资格
>获得唯一值：unique() 
>#会纳入NA

>值计数：value_counts()    
>#不会纳入NA
>#添加参数normalize=True可以得到计数占比

## 矢量化判断成员资格
>方法1：isin

>方法2：str.contains

In [11]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
df

Unnamed: 0,A,B
0,1,1
1,2,4
2,3,7


In [12]:
df.isin({'A': [1, 3], 'B': [4, 7, 12]})

Unnamed: 0,A,B
0,True,False
1,False,True
2,True,True


### 去除某一列的特殊值
>方法1：boolean减法^

>方法2：使用~，为翻转运算，将二进制的0和1互换） 

In [10]:
df[True^df["A"].isin([1])]  
#或者df[~df["A"].isin([1])

Unnamed: 0,A,B
1,2,4
2,3,7


## 处理缺失数据
>dropna(axis=0,how=”any”,thresh=None,inpalce=False)
>>#此处axis和其它地方不太一样,axis=0指按行处理

>>how : {'any', 'all'}

>>thresh：int，指定非NA数目

>fillna(value=None,method=None,axis=None,limit=None,inplace=False)
>>method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None

>isnull(),notnull()

In [15]:
data=DataFrame([[1,6.5,3],[1,np.nan,np.nan],[np.nan,np.nan,np.nan],
                [np.nan,6.5,3]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
data.dropna(how="all") 

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [16]:
data.dropna(axis=1,how="all") 

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [7]:
frame=DataFrame(np.random.randn(7,3))
frame.ix[:4,1]=np.nan
frame.ix[:2,2]=np.nan
frame.fillna({1:0.5,2:-1})

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2
0,0.14672,0.5,-1.0
1,-0.719433,0.5,-1.0
2,0.240827,0.5,-1.0
3,-1.062023,0.5,-0.071425
4,0.150435,0.5,0.942021
5,-0.966866,-0.078531,1.499292
6,0.676911,1.717732,-0.198894


## 其它操作

### 矢量化的字符串函数:Series.str 
>contains	

>count	

>endswith、startswith	

>get	获取各元素的第i个字符

>count

>len	

>lower、upper	

>match	根据指定的正则表达式对各个元素执行re.match

>repeat	重复值。例如s.str.repeat(3)相当于对各个字符串执行x * 3

>replace	用指定字符串替换找到的模式