In [1]:
import pandas as pd
import numpy as np

# 1. 创建DataFrame
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html#pandas.DataFrame

In [2]:
# Each record is a plain list holding one row, in [name, age] order.
data1, data2, data3 = ["小明", 20], ["小雨", 21], ["小花", 22]
# Column labels, listed in the same order as the values inside each row.
header = ["姓名", "年龄"]
# Stack the row lists into a two-column table.
df = pd.DataFrame(data=[data1, data2, data3], columns=header)
df.head()

Unnamed: 0,姓名,年龄
0,小明,20
1,小雨,21
2,小花,22


In [3]:
# Dict form: every key becomes a column label and its list supplies that column's values.
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(d)
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


# 2. 生成csv文件
https://pandas.pydata.org/docs/user_guide/io.html#io-store-in-csv

In [4]:
# One list per row, in [name, age] order.
data1 = ["小明", 20]
data2 = ["小雨", 21]
data3 = ["小花", 22]
header = ["姓名", "年龄"]
# Assemble the rows into a table before writing it out.
df = pd.DataFrame(data=[data1, data2, data3], columns=header)
# index=False keeps the row labels out of the file, so only the header line
# and the data rows are written.
df.to_csv(path_or_buf="./mydata.csv", index=False)

# 4. 读取csv文件
https://pandas.pydata.org/docs/user_guide/io.html#io-read-csv-table

In [5]:
# Parse the CSV file back into a table.
csv_datas = pd.read_csv(filepath_or_buffer='./mydata.csv')
# The printed type shows that read_csv hands back a DataFrame.
print(type(csv_datas))
csv_datas.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,姓名,年龄
0,小明,20
1,小雨,21
2,小花,22


# 5. 取单列或者多列的值
取单列（`df["列名"]`）得到的类型为 pandas.core.series.Series；取多列（`df[["列名", ...]]`，传入列名列表）得到的类型为 pandas.core.frame.DataFrame

In [6]:
# Select a single column.
# Rows are plain lists in [name, age] order.
data1, data2, data3 = ["小明", 20], ["小雨", 21], ["小花", 22]
header = ["姓名", "年龄"]
df = pd.DataFrame(data=[data1, data2, data3], columns=header)
# Indexing with one column label returns that column as a Series,
# which keeps the frame's row index.
names = df["姓名"]
print(type(names))
print(names)
print(names.index)

<class 'pandas.core.series.Series'>
0    小明
1    小雨
2    小花
Name: 姓名, dtype: object
RangeIndex(start=0, stop=3, step=1)


In [7]:
# Select several columns at once.
# Rows are plain lists in [name, age] order.
data1, data2, data3 = ["小明", 20], ["小雨", 21], ["小花", 22]
header = ["姓名", "年龄"]
df = pd.DataFrame(data=[data1, data2, data3], columns=header)
# Unlike single-column selection, the key here is a list of column labels,
# so the result is a DataFrame even when the list holds only one label.
names = df[["姓名"]]
print(type(names))
print(names)
print(names.index)

<class 'pandas.core.frame.DataFrame'>
   姓名
0  小明
1  小雨
2  小花
RangeIndex(start=0, stop=3, step=1)


# 6. pandas.core.series.Series
https://pandas.pydata.org/docs/user_guide/dsintro.html#series

In [8]:
# Build a Series from a plain Python list, giving it explicit labels and a name.
s = pd.Series(
    data=list(range(2, 11, 2)),
    index=list("abcde"),
    name="something",
)
print(type(s))
s

<class 'pandas.core.series.Series'>


a     2
b     4
c     6
d     8
e    10
Name: something, dtype: int64

In [9]:
# Build a Series from a NumPy array of five random floats.
# No seed is set in this cell, so the values change from run to run.
s = pd.Series(data=np.random.randn(5), index=list("abcde"))
s

a   -0.313166
b   -0.235279
c   -0.549104
d    2.431418
e    0.320254
dtype: float64

# 7. Series is ndarray-like
可以像使用ndarray一样使用series

https://pandas.pydata.org/docs/user_guide/dsintro.html#series-is-ndarray-like

In [10]:
# Read the first element by position. `s` is labelled with strings, so plain
# s[0] would rely on the deprecated integer-as-position fallback of
# Series.__getitem__; .iloc states the positional intent explicitly.
s.iloc[0]

-0.3131656763100293

# 8. get_dummies
https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

In [11]:
# One-hot encode a Series: each distinct value gets its own indicator column.
s = pd.Series(['a', 'b', 'c', 'a'])
pd.get_dummies(s)

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


In [12]:
# get_dummies also accepts a plain list and returns a DataFrame.
# The prefix is joined to each value with the default separator "_", which is
# why the resulting columns are named "type__a", "type__b", "type__c".
s = list('abca')
d = pd.get_dummies(data=s, prefix='type_')
print(type(d))
d.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,type__a,type__b,type__c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


# 9. concat
连接多个对象，对象可以是 Series 或 DataFrame

axis{0/’index’, 1/’columns’}, default 0

https://pandas.pydata.org/docs/reference/api/pandas.concat.html

In [13]:
# Stack the two Series end to end along the row axis; the result is still a
# Series, and each piece keeps its own labels, so the index repeats 0, 1.
s1 = pd.Series(['a', 'b'])
s2 = pd.Series(['c', 'd'])
d = pd.concat([s1, s2], axis=0)
print(type(d))
print(d)

<class 'pandas.core.series.Series'>
0    a
1    b
0    c
1    d
dtype: object


In [14]:
# Place the two Series side by side as columns (axis=1); the result is a
# DataFrame whose column labels are the names of the Series.
s1 = pd.Series(['a', 'b'], name="name")
s2 = pd.Series(['c', 'd'], name="age")
d = pd.concat([s1, s2], axis="columns")
print(type(d))
print(d)

<class 'pandas.core.frame.DataFrame'>
  name age
0    a   c
1    b   d


# 10. drop
删除行或者列：axis=0（默认）表示按行标签删除，axis=1表示按列标签删除

返回拷贝还是直接操作此对象，由inplace属性决定
inplace bool, default False
If False, return a copy. Otherwise, do operation inplace and return None.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html

In [15]:
# A 3x4 table holding the integers 0..11, with columns labelled A to D.
df = pd.DataFrame(data=np.arange(12).reshape(3, 4), columns=list('ABCD'))
print(df)
# axis=1 makes the labels refer to columns. drop returns a new frame without
# B and C; df itself is left unchanged.
df2 = df.drop(labels=['B', 'C'], axis=1)
print(df2)

   A  B   C   D
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
   A   D
0  0   3
1  4   7
2  8  11


# 11. loc属性
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html

可以按行或列过滤数据
## 获取一行数据
Single label. Note this returns the row as a Series.


In [16]:
# Build the example frame for the .loc cells that follow: rows are labelled
# by name instead of by position.
df = pd.DataFrame(
    data=[[1, 2], [4, 5], [7, 8]],
    index=['cobra', 'viper', 'sidewinder'],
    columns=['max_speed', 'shield'],
)
print(df.head())

            max_speed  shield
cobra               1       2
viper               4       5
sidewinder          7       8


In [17]:
# Select one row by its label; with a single label .loc returns the row as a
# Series whose index is the frame's column labels.
row = df.loc['viper']
print(row)
print(type(row))

max_speed    4
shield       5
Name: viper, dtype: int64
<class 'pandas.core.series.Series'>


## 获取某些行，作为DataFrame返回，类似过滤行
List of labels. Note using [[]] returns a DataFrame.



In [18]:
# A list of labels selects those rows and keeps the result a DataFrame.
df.loc[['viper', 'sidewinder']]

Unnamed: 0,max_speed,shield
viper,4,5
sidewinder,7,8


## 获取某些行的某列

In [19]:
# Label slice on the rows plus one column label. Both slice endpoints
# ('cobra' and 'viper') are included in the result.
df.loc['cobra':'viper', 'max_speed']

cobra    1
viper    4
Name: max_speed, dtype: int64

## 获取所有行的某列

In [20]:
# Every row of one column; this is equivalent to df['max_speed'] and returns a Series.
v=df.loc[:, 'max_speed']
print(type(v))
print(v)

<class 'pandas.core.series.Series'>
cobra         1
viper         4
sidewinder    7
Name: max_speed, dtype: int64


## 获取所有行的多个列

In [21]:
# Passing the column labels as a list keeps the result a DataFrame, even when
# the list names only one column.
v=df.loc[:, ['max_speed']]
print(type(v))
print(v)

<class 'pandas.core.frame.DataFrame'>
            max_speed
cobra               1
viper               4
sidewinder          7


# 12. values属性
将DataFrame中的数据变为numpy数组

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.values.html

In [22]:
# Two records described by three integer columns.
df = pd.DataFrame(
    data={
        'age': [3, 29],
        'height': [94, 170],
        'weight': [31, 115],
    }
)
print(df)

   age  height  weight
0    3      94      31
1   29     170     115


In [23]:
# .values exposes the frame's data as a 2-D array, one inner array per row.
print(df.values)

[[  3  94  31]
 [ 29 170 115]]


In [24]:
# Show the type of the object returned by .values.
print(type(df.values))

<class 'numpy.ndarray'>


# 13. to_datetime
将数据转换为datetime

https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html

In [25]:
# Sample frame: dates stored as plain 'YYYY/M/D' strings next to a count column.
d = {
    'dteday': ['2011/1/1', '2011/1/2', '2011/1/3', '2011/1/4'],
    'cnt': [3, 4, 5, 6],
}
df = pd.DataFrame(d)
print(df)

     dteday  cnt
0  2011/1/1    3
1  2011/1/2    4
2  2011/1/3    5
3  2011/1/4    6


In [26]:
# Pull out the date column as a Series; its elements are still plain strings
# (dtype object).
# A single .loc call replaces the chained df.loc[df.index]['dteday'], which
# first built a copy of the whole frame only to pick one column from it.
dte = df.loc[:, 'dteday']
print(dte)
print(type(dte))

0    2011/1/1
1    2011/1/2
2    2011/1/3
3    2011/1/4
Name: dteday, dtype: object
<class 'pandas.core.series.Series'>


In [27]:
# Parse the strings into a new Series whose element dtype is datetime64.
dates = pd.to_datetime(dte)
print(dates)
print(type(dates))

0   2011-01-01
1   2011-01-02
2   2011-01-03
3   2011-01-04
Name: dteday, dtype: datetime64[ns]
<class 'pandas.core.series.Series'>


# 13. Timestamp.strftime
https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.strftime.html?highlight=strftime#pandas.Timestamp.strftime



In [28]:
def ele(d):
    """Format one date-like value as '<abbreviated month> <day>', e.g. 'Jan 01'.

    Format codes (see
    https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior):
      %b - month as the locale's abbreviated name.
      %d - day of the month as a zero-padded decimal number.

    The type of the incoming element is printed before formatting, so the
    caller can see what kind of object it handed in.
    """
    element_type = type(d)
    print(element_type)
    formatted = d.strftime('%b %d')
    return formatted
# Run ele on every element of `dates`; the result is a Series of formatted strings.
d2 = dates.apply(ele)
print(d2)

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
0    Jan 01
1    Jan 02
2    Jan 03
3    Jan 04
Name: dteday, dtype: object


# 14. Series.apply
https://pandas.pydata.org/docs/reference/api/pandas.Series.apply.html

In [29]:
# Sample values labelled by city.
s = pd.Series(data=[20, 21, 12], index=['London', 'New York', 'Helsinki'])


def square(x):
    """Return x raised to the power of two."""
    return x ** 2


# apply calls the function once per element; a named function and an
# equivalent lambda produce the same Series.
s1 = s.apply(square)
s2 = s.apply(lambda value: value ** 2)
print(s)
print(s1)
print(s2)


London      20
New York    21
Helsinki    12
dtype: int64
London      400
New York    441
Helsinki    144
dtype: int64
London      400
New York    441
Helsinki    144
dtype: int64


# 15. DataFrame.columns

In [37]:
# Walk over every column label of the frame.
df = pd.DataFrame({'age':    [ 3,  29],
                   'height': [94, 170],
                   'weight': [31, 115]})
# Number of columns, then the type of the object that holds the labels.
print(df.columns.size)
print(type(df.columns))
# df.columns can be iterated directly, so no positional counter is needed.
for column_name in df.columns:
    print(column_name)

3
<class 'pandas.core.indexes.base.Index'>
age
height
weight


10