# 创建

## 简单创建

In [2]:
# 创建一个空的dataframe
import pandas as pd
pd.DataFrame(columns=['numbers'])

Unnamed: 0,numbers


In [1]:
import pandas as pd
import numpy as np

## 创建非空的DataFrame
classes = ["101", "202", "303"]
df = pd.DataFrame(
    {
        "class":[classes[x % len(classes)] for x in np.random.randint(0, len(classes), 5)],
        "math":np.random.randint(0, 100, 5),
        "physics":np.random.randint(0, 100, 5)  
    })
df

Unnamed: 0,class,math,physics
0,202,19,44
1,303,65,1
2,101,79,62
3,202,5,86
4,101,29,84


In [3]:
df.shape[0]

5

## 指定`index`

In [7]:
import numpy as np
import pandas as pd

# 从列表创建，指定行、列索引
df = pd.DataFrame([10, 20, 30, 40],
                  columns=['numbers'],
                  index=['a', 'b', 'c', 'd'])
print(df)
print(df.columns[0])

   numbers
a       10
b       20
c       30
d       40
numbers


## 从array创建

In [1]:
import numpy as np
import pandas as pd

dates = pd.date_range('20231201', periods=6)
pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

Unnamed: 0,A,B,C,D
2023-12-01,-0.155206,-0.946455,-0.249045,-0.172948
2023-12-02,-0.337347,-0.690909,-0.598659,0.632775
2023-12-03,0.731887,-0.803858,1.070908,-1.078868
2023-12-04,1.242421,1.281355,-0.759936,-1.4829
2023-12-05,-0.824093,0.110502,-0.00858,-0.163337
2023-12-06,-0.67007,-0.127726,1.324386,0.023815


## 从字典创建
使用字典创建时默认以key做为列索引，但有个要求，值不能是标量值（），比如下面这样会出错。这是因为字典的值作为列时需要根据它来生成DataFrame的index，如果你传入的是整数，那么无法生成index。

In [26]:
# 下面这样创建会出错
#pd.DataFrame(
#    {
#        'A':24492,
#        'B':24493
#    }
#)

# 指定index可以解决问题
pd.DataFrame(
    {
        'A':24492,
        'B':24493
    },
    index = [0]
)


Unnamed: 0,A,B
0,24492,24493


下面这样也没有问题：

In [28]:
pd.DataFrame(
    {
        'A':[24492],
        'B':24493
    }
)



Unnamed: 0,A,B
0,24492,24493


将key作为index

如果我要将字典的键作为index, 值作为column呢？那么传入`items()`这样就将key和value作为DataFrame的列，然后重设index就可以了。


In [31]:
my_dict = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}
#pd.DataFrame(my_dict)
pd.DataFrame(my_dict.items())
#df=pd.DataFrame(dict.items(), columns=['Case', 'First Fail Build'])
#df.set_index('Case', inplace=True)


Unnamed: 0,0,1
0,a,"[1, 2, 3]"
1,b,"[4, 5, 6]"
2,c,"[7, 8, 9]"


In [35]:
import pandas as pd
data = {'Name':['Lisha', 'Shelly', 'Greay', 'Leo', 'Marry'],
        'Age':[18, 21, 29, 18, 23]}
pd.DataFrame(data,
             columns=['Age', 'Name'],
             index=['one', 'two', 'three', 'four', 'five'])

Unnamed: 0,Age,Name
one,18,Lisha
two,21,Shelly
three,29,Greay
four,18,Leo
five,23,Marry


## 从列表创建

- 创建时可以用`index`指定行标签，`columns`来指定列标签，不指定默认为0。
- 当传入一个单维列表的，默认将该列表作为DataFrame中的第一列
- 当传入一个二维列表时(`np.random.standard_normal((9, 4)).round(6)`)，那么每行对应DataFrame中的一行。




In [34]:
import pandas as pd
students = ['Lisha', 'Shelly', 'Greay', 'Leo', 'Marry']
pd.DataFrame(students)

Unnamed: 0,0
0,Lisha
1,Shelly
2,Greay
3,Leo
4,Marry


- 嵌套列表创建

这里的嵌套列表指的是两层列表，因为它刚好是一个二维数组的结构。所以在创建时，每个外层列表元素都作为单独的一行。

参考：

- [Nested List to Pandas Dataframe with headers](https://stackoverflow.com/questions/32857544/nested-list-to-pandas-dataframe-with-headers)

In [37]:
l = [['a', 'b', 'c', 'd'],
     [1, 2, 3, 4],
     ['i', 'j', 'k', 'l']]
pd.DataFrame(l)

Unnamed: 0,0,1,2,3
0,a,b,c,d
1,1,2,3,4
2,i,j,k,l


- Series列表

In [14]:
s1 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
s2 = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])
s3 = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])

pd.DataFrame([s1, s2, s3])

Unnamed: 0,a,b,c,d,e
0,1,2,3,4,5
1,10,20,30,40,50
2,10,20,30,40,50


## 从元组创建

元组和列表的形式是类似的，只不过元组无法修改元素但是列表可以。

In [38]:
import pandas as pd

data = [('Facebook', 750, True),
        ('Alphabet', 1100, True),
        ('Amazon', 1700, True),
        ('Apple', 2100, False),
        ('Microsoft', 1750, False)]

pd.DataFrame(data, columns=['Name', 'M-cap', 'Internet Companies'])


Unnamed: 0,Name,M-cap,Internet Companies
0,Facebook,750,True
1,Alphabet,1100,True
2,Amazon,1700,True
3,Apple,2100,False
4,Microsoft,1750,False


# 添加

- 添加新列
- 添加新行


## 添加新列

- 使用`df[new_col] = list`添加
- 使用loc属性，`df.loc[:, new_col] = list`添加
- 使用insert方法，`df.insert(pos, new_col, list)`添加

In [2]:
# 添加新列
classes = ["101", "202", "303"]
df = pd.DataFrame(
    {
        "class":[classes[x % len(classes)] for x in np.random.randint(0, len(classes), 5)],
        "math":np.random.randint(0, 100, 5),
        "physics":np.random.randint(0, 100, 5)  
    })
df

Unnamed: 0,class,math,physics
0,101,49,65
1,101,29,12
2,202,45,34
3,101,43,5
4,303,46,78


In [3]:
df['chemical'] = np.random.randint(0, 100, 5)
df.loc[:, 'english'] = np.random.randint(0, 100, 5)
df.insert(1, 'geography', np.random.randint(0, 100, 5))

df

Unnamed: 0,class,geography,math,physics,chemical,english
0,101,41,49,65,55,7
1,101,0,29,12,61,54
2,202,47,45,34,1,38
3,101,41,43,5,20,63
4,303,65,46,78,30,20


In [7]:
new = df[df['class'] == '101']
new.index[-1]

np.int64(3)

In [11]:
df

Unnamed: 0,class,geography,math,physics,chemical,english
0,101,41,49,65,55,7
1,101,0,29,12,61,54
2,202,47,45,34,1,38
3,101,41,100,5,20,63
4,303,65,46,78,30,20


In [13]:
df.loc[new.index[-1], 'math']

np.int32(59)

## 添加新行

- `df.loc[i] = new_row`
- `df.append(new_dataframe)`已经在`1.4.0`中过时了，使用`df.concat(new_dataframe)`替代

参考：

- [Create a Pandas Dataframe by appending one row](https://stackoverflow.com/questions/10715965/create-a-pandas-dataframe-by-appending-one-row-at-a-time)

In [15]:
import numpy as np
import pandas as pd

dates = pd.date_range('20231201', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df


Unnamed: 0,A,B,C,D
2023-12-01,1.991024,-0.18245,1.268003,-1.093968
2023-12-02,-0.901188,1.491229,-1.648214,-2.19535
2023-12-03,-0.185531,-1.987802,-0.001021,-0.026237
2023-12-04,-0.796405,-1.102242,0.343598,0.245049
2023-12-05,-0.471208,-0.276307,-0.039126,-0.074961
2023-12-06,-1.585825,-0.175631,-0.734822,-0.178181


In [26]:
df.loc['20231207'] = np.random.rand(4)

new_df = pd.DataFrame(np.random.randn(2, 4), 
                      index=pd.date_range('20231208', periods=2),
                      columns=list('ABCD'))
pd.concat([df, new_df])


Unnamed: 0,A,B,C,D
2023-12-01 00:00:00,1.991024,-0.18245,1.268003,-1.093968
2023-12-02 00:00:00,-0.901188,1.491229,-1.648214,-2.19535
2023-12-03 00:00:00,-0.185531,-1.987802,-0.001021,-0.026237
2023-12-04 00:00:00,-0.796405,-1.102242,0.343598,0.245049
2023-12-05 00:00:00,-0.471208,-0.276307,-0.039126,-0.074961
2023-12-06 00:00:00,-1.585825,-0.175631,-0.734822,-0.178181
20231207,0.700368,0.897465,0.736365,0.922565
2023-12-08 00:00:00,-0.818686,0.473629,0.35702,-0.60067
2023-12-09 00:00:00,-0.146896,0.403821,0.415918,0.550938


# 查看数据

- `df.dtypes`
- `df.info()`
- `df.describe()`

In [6]:
print("--------df.info()--------")
print(df.info())
print("--------df.describe()--------")
print(df.describe(include='int32'))

--------df.info()--------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   class    15 non-null     object
 1   math     15 non-null     int32 
 2   physics  15 non-null     int32 
dtypes: int32(2), object(1)
memory usage: 372.0+ bytes
None
--------df.describe()--------
            math    physics
count  15.000000  15.000000
mean   43.133333  39.000000
std    31.475312  29.014775
min     5.000000   2.000000
25%    11.500000  22.000000
50%    45.000000  33.000000
75%    72.500000  50.500000
max    98.000000  99.000000


## 鸟瞰数据

- df.sample()
- df.head()
- df.tail()

In [9]:
df.sample(6)

Unnamed: 0,class,math,physics
1,101,98,99
13,202,38,85
3,202,54,3
8,101,45,44
6,202,47,33
9,202,14,45


## 取消科学计数法

In [13]:
import pandas as pd
import numpy as np

df = pd.DataFrame(
    {
        'A' : np.random.standard_normal(5),
        'B' : np.random.standard_normal(5),
        'C' : np.random.randint(0, 100, 5),
    }
)
#pd.set_option('display.float_format', lambda x: '%.2f' % x)
#pd.set_option('display.float_format', "{:.2f}".format)
pd.set_option('display.precision', 2)

print(df)

      A     B   C
0 -0.48 -0.23  14
1  1.13 -1.28  37
2  0.16 -1.59   4
3 -1.62 -0.23  47
4  0.43 -0.60  77
