# 删除数据

## 删除某列

- `df.drop(0, axis=1, inplace)`
- `df.drop(columns='col', inplace)`



In [4]:
import pandas as pd
import numpy as np

classes = ["101", "202", "303"]
df = pd.DataFrame(
    {
        "class":[classes[x % len(classes)] for x in np.random.randint(0, len(classes), 5)],
        "math":np.random.randint(0, 100, 5),
        "physics":np.random.randint(0, 100, 5)  
    })
df

Unnamed: 0,class,math,physics
0,202,98,23
1,202,8,15
2,202,98,81
3,202,54,73
4,303,32,99


In [6]:
df.drop(columns='physics', inplace=True)
df

Unnamed: 0,class,math
0,202,98
1,202,8
2,202,98
3,202,54
4,303,32


In [7]:
del df['math']
print(df)

  class
0   202
1   202
2   202
3   202
4   303


## 删除行

- `df.drop(0, axis=0, inplace)`
- `df.drop(0, inplace)`
- `df.drop(index='a', inplace)`

In [2]:
import pandas as pd

df = pd.DataFrame([10, 20, 30, 40],
                      columns=['numbers'],
                      index=['a', 'b', 'c', 'd'])
print(df)
df.drop('a', inplace=True)
print(df)
df.drop(index='c', inplace=True)
print(df)


   numbers
a       10
b       20
c       30
d       40
   numbers
b       20
c       30
d       40
   numbers
b       20
d       40


## 按条件删除行

两种方式：

- `df.drop(df[<some boolean condition>].index)`。
- 更高效的方式是`df = df[df.score > 50]`。


In [7]:
import pandas as pd

del_df = pd.DataFrame([10, 20, 30, 40],
                      columns=['numbers'],
                      index=['a', 'b', 'c', 'd'])
print(del_df)
del_df = del_df[del_df["numbers"] > 10]
print(del_df)

   numbers
a       10
b       20
c       30
d       40
   numbers
b       20
c       30
d       40


## 删除所有行

两种方式：`df = df.iloc[0:0]`和`df.drop(df.index, inplace=True)`。

参考： 

- [Drop all data in a pandas dataframe](https://stackoverflow.com/questions/39173992/drop-all-data-in-a-pandas-dataframe)

In [41]:
del_df = pd.DataFrame([10, 20, 30, 40],
                      columns=['numbers'],
                      index=['a', 'b', 'c', 'd'])
print(del_df)

del_df = del_df.iloc[0:0]
del_df.to_csv("del_df.csv")
print(del_df)

   numbers
a       10
b       20
c       30
d       40
Empty DataFrame
Columns: [numbers]
Index: []


# 缺失值处理

## “np.nan”和“np.NaN”

In [None]:
import numpy as np

print("np.nan = {}, type(np.nan) = {}".format(np.nan, type(np.nan)))
print("np.NaN = {}, type(np.NaN) = {}".format(np.NaN, type(np.NaN)))
print(np.nan == np.NaN)
print(np.nan is np.NaN)

np.nan = nan, type(np.nan) = <class 'float'>
np.NaN = nan, type(np.NaN) = <class 'float'>
False
True


## 使用`isnull()`

In [None]:
import pandas as pd
import numpy as np

df = pd.DataFrame([[1.2, 6.5, 3.0],
               [1., np.NaN, np.NaN],
               [np.NaN, np.NaN, np.NaN],
               [np.NaN, 6.2, 3.1]])
print(df.isnull())
print(pd.isnull(df.loc[0,0]))

       0      1      2
0  False  False  False
1  False   True   True
2   True   True   True
3   True  False  False
False


使用`notnull()`

 [What is the difference between NaN and None?](https://stackoverflow.com/questions/17534106/what-is-the-difference-between-nan-and-none)里提到`np.isnan(p)`对传入的参数有要求，如果是string类型那么会crash，使用`pd.isnull()`则要安全得多。

In [None]:
print(df.notnull())
print(pd.notnull(df.loc[2,0]))

import numpy as np
print(np.isnan('h'))

       0      1      2
0   True   True   True
1   True  False  False
2  False  False  False
3  False   True   True
False


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

## 填充NaN

不管是填充还是替换，都默认不在原有dataframe上生效，如果需要生效需要使用`inplace=True`参数。

In [None]:
import pandas as pd
import numpy as np

df = pd.DataFrame([[1.2, 6.5, 3.0],
               [1., np.NaN, np.NaN],
               [np.NaN, np.NaN, np.NaN],
               [np.NaN, 6.2, 3.1]])
print(df)

df.fillna(0)
print(df)

new_df = df.fillna(0)
print(new_df)

     0    1    2
0  1.2  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.2  3.1
     0    1    2
0  1.2  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.2  3.1
     0    1    2
0  1.2  6.5  3.0
1  1.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  6.2  3.1


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
df[1].replace(np.NaN, '0', inplace=True)
print(df)

     0    1    2
0  1.2  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.2  3.1


In [None]:
np.nan > 0.0
type(np.nan)

float

## 使用`ffill`

- Fill NA/NaN values by propagating the last valid observation to next valid.
- `forehand`，正手，从上往下

In [None]:
import pandas as pd
import numpy as np

df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, 2, 1],
                   [np.nan, 3, 3, 3],
                   [np.nan, 3, 4, 4],
                   [4, 5, 5, 5],
                   [np.nan, 3, 4, 4]],
                  columns=list("ABCD"))
print(df)

df['A'].fillna(method='ffill', inplace=True)
print(df)

df['A'].astype(float).mean()

     A  B    C  D
0  NaN  2  NaN  0
1  3.0  4  2.0  1
2  NaN  3  3.0  3
3  NaN  3  4.0  4
4  4.0  5  5.0  5
5  NaN  3  4.0  4
     A  B    C  D
0  NaN  2  NaN  0
1  3.0  4  2.0  1
2  3.0  3  3.0  3
3  3.0  3  4.0  4
4  4.0  5  5.0  5
5  4.0  3  4.0  4


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['A'].fillna(method='ffill', inplace=True)
  df['A'].fillna(method='ffill', inplace=True)


3.4

## pandas.DataFrame.bfill

- Fill NA/NaN values by using the next valid observation to fill the gap.
- `forehand`，反手，从下往上

## 使用`dropna()`

In [None]:
import pandas as pd 
import numpy as np

s = pd.Series([1, np.NaN, 3.2, np.NaN, 7])
print(s)
print(s.dropna())

df = pd.DataFrame([[1.2, 6.5, 3.0],
               [1., np.NaN, np.NaN],
               [np.NaN, np.NaN, np.NaN],
               [np.NaN, 6.2, 3.1]])
print(df)
print(df.dropna())
print(df.dropna(how='all'))
print(df.dropna(axis=1, how='all'))


print(df.fillna(0))
print(df.fillna({1:0.5, 2:1.0}))

0    1.0
1    NaN
2    3.2
3    NaN
4    7.0
dtype: float64
0    1.0
2    3.2
4    7.0
dtype: float64
     0    1    2
0  1.2  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.2  3.1
     0    1    2
0  1.2  6.5  3.0
     0    1    2
0  1.2  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.2  3.1
     0    1    2
0  1.2  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.2  3.1
     0    1    2
0  1.2  6.5  3.0
1  1.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  6.2  3.1
     0    1    2
0  1.2  6.5  3.0
1  1.0  0.5  1.0
2  NaN  0.5  1.0
3  NaN  6.2  3.1


## NaA的计算


In [None]:
import numpy as np

print(np.NaN + 1)
print(sum([np.NaN, 1.0, 2.0]))

nan
nan


In [None]:
import numpy as np

np.NaN > 0.0

False

# 重复值

In [None]:
import numpy as np
import pandas as pd

classes = ["A", "B", "C"]
score = pd.DataFrame({
    "class":[classes[x] for x in np.random.randint(0, len(classes), 6)],
    "liter":np.random.randint(0, 100, 6),
    "math":np.random.randint(0, 100, 6)
})
print(score)

  class  liter  math
0     B     13    48
1     A     61    75
2     B     81    25
3     B     68    28
4     C     95     5
5     A     53    34


In [None]:
score.drop_duplicates(subset=['class'], keep='first', inplace=True)
print(score)
score.loc[len(score)] = ["B", 22, 23]
print(score)
score.drop_duplicates(subset=['class'], keep='first', inplace=True)
print(score)

  class  liter  math
0     B     13    48
1     A     61    75
4     C     95     5
  class  liter  math
0     B     13    48
1     A     61    75
4     C     95     5
3     B     22    23
  class  liter  math
0     B     13    48
1     A     61    75
4     C     95     5
