In [1]:
import pandas as pd
import numpy as np

# Read a CSV file that contains null values

In [2]:
# Load the sample CSV; by default pandas parses empty fields as NaN.

file_name = "data_with_nan.csv"
df = pd.read_csv(file_name)
df

Unnamed: 0,col1,col2,col3
0,test1,13.0,
1,test2,44.0,aaa
2,test3,,
3,,,
4,test5,,bbb
5,,,


# Create a DataFrame that contains null values

In [3]:
# Build a DataFrame from row lists, using Python's None for every missing cell.

columns = ["col1", "col2", "col3"]
data = [
    ["test1", 13, None],
    ["test2", 44, "aaa"],
    ["test3", None, None],
    [None, None, None]
]

# col2 mixes ints with None, so it is displayed as floats (13.0, 44.0) with NaN.
df2 = pd.DataFrame(data=data, columns=columns)
df2

Unnamed: 0,col1,col2,col3
0,test1,13.0,
1,test2,44.0,aaa
2,test3,,
3,,,


In [4]:
# In the string column col3 the missing value comes back as Python's None.
# A single .loc lookup replaces the chained df2["col3"][0] access and avoids
# evaluating it twice.
col3_first = df2.loc[0, "col3"]
print(col3_first)
print(type(col3_first))

None
<class 'NoneType'>


In [5]:
# In the numeric column col2 the None was converted to NaN (a numpy.float64).
# A single .loc lookup replaces the chained df2["col2"][2] access and avoids
# evaluating it twice.
col2_third = df2.loc[2, "col2"]
print(col2_third)
print(type(col2_third))

nan
<class 'numpy.float64'>


In [6]:
# Series reductions skip NaN by default (skipna=True): mean is (13 + 44) / 2.

df2["col2"].mean()

28.5

In [7]:
# sum() also ignores the NaN entries: 13 + 44.
df2["col2"].sum()

57.0

In [8]:
# max() is taken over the non-NaN values only.
df2["col2"].max()

44.0

In [9]:
# min() is taken over the non-NaN values only.
df2["col2"].min()

13.0

In [10]:
# Rebuild the None-based frame (same rows as df2 above) so this cell runs on
# its own, then write it to CSV.
columns = ["col1", "col2", "col3"]
data = [
    ["test1", 13, None],
    ["test2", 44, "aaa"],
    ["test3", None, None],
    [None, None, None]
]

df2 = pd.DataFrame(data=data, columns=columns)

# index=False (a real bool rather than the falsy int 0) omits the row-index
# column; missing values are written as empty fields by default.
df2.to_csv("output_none.csv", index=False)

# Create NaN

In [11]:
# Create missing values explicitly with np.nan instead of None.

columns = ["col1", "col2", "col3"]
data = [
    ["test1", 13, np.nan],
    ["test2", 44, "aaa"],
    ["test3", np.nan, np.nan],
    [np.nan, np.nan, np.nan]
]

# Same layout as the None-based df2, but every missing cell is np.nan.
df3 = pd.DataFrame(data=data, columns=columns)
df3

Unnamed: 0,col1,col2,col3
0,test1,13.0,
1,test2,44.0,aaa
2,test3,,
3,,,


# Why use numpy.nan?

In [12]:
# Why numpy.nan

# With None in the array the reduction ends up doing a Python-level
# `int + None`, which raises TypeError. The error is the point of this cell,
# so catch and print it: the message is still shown, but "Run All" no longer
# stops here.
try:
    np.array([1, 2, None]).mean()
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [13]:
# With np.nan the array is numeric, so mean() runs, but the NaN propagates into the result.
np.array([1, 2, np.nan]).mean()

nan

In [14]:
# nanmean averages only the non-NaN entries: (1 + 2) / 2.
np.nanmean(np.array([1, 2, np.nan]))

1.5

In [15]:
# nanmean only skips real NaN values; None is not NaN, so the same
# `int + None` TypeError is raised. Catch and print it so the failure is
# still shown without stopping "Run All".
try:
    np.nanmean(
        np.array([1, 2, None])
    )
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [16]:
# Other NaN-aware reductions

# nanmax returns the largest non-NaN entry.
np.nanmax(np.array([1, 2, np.nan]))

2.0

In [17]:
# nanmin returns the smallest non-NaN entry.
np.nanmin(np.array([1, 2, np.nan]))

1.0

In [18]:
# nansum adds up the non-NaN entries: 1 + 2.
np.nansum(np.array([1, 2, np.nan]))

3.0

# Check if values are null

In [19]:
# Show df3 again as the reference for the null checks that follow.
df3

Unnamed: 0,col1,col2,col3
0,test1,13.0,
1,test2,44.0,aaa
2,test3,,
3,,,


In [20]:
# isnull() returns a boolean Series that is True where a value is missing;
# in col1 only row 3 is missing.

df3["col1"].isnull()

0    False
1    False
2    False
3     True
Name: col1, dtype: bool

In [21]:
# In col2 rows 2 and 3 are missing.
df3["col2"].isnull()

0    False
1    False
2     True
3     True
Name: col2, dtype: bool

In [22]:
# isna() is the same check under another name (isnull is an alias of isna).
df3["col1"].isna()

0    False
1    False
2    False
3     True
Name: col1, dtype: bool

In [23]:
# Same result as df3["col2"].isnull().
df3["col2"].isna()

0    False
1    False
2     True
3     True
Name: col2, dtype: bool

In [24]:
# notna() is the inverse check: True where a value is present.

df3["col2"].notna()

0     True
1     True
2    False
3    False
Name: col2, dtype: bool

In [25]:
# Use the notna() mask to keep only the rows where col2 has a value (rows 0 and 1).

df3[df3["col2"].notna()]

Unnamed: 0,col1,col2,col3
0,test1,13.0,
1,test2,44.0,aaa


In [26]:
# The filter returned a new frame; df3 itself still has all four rows.
df3

Unnamed: 0,col1,col2,col3
0,test1,13.0,
1,test2,44.0,aaa
2,test3,,
3,,,


In [27]:
# dropna() with default arguments drops every row that contains at least one NaN;
# only row 1 is complete. It returns a new frame and leaves df3 unchanged.

df3.dropna()

Unnamed: 0,col1,col2,col3
1,test2,44.0,aaa


In [28]:
# axis=1 drops columns instead of rows; every column has at least one NaN,
# so no columns remain and only the index is left.
df3.dropna(axis=1)

0
1
2
3


In [29]:
# how="all" drops a row only when every value in it is NaN, which removes just row 3.
df3.dropna(how="all")

Unnamed: 0,col1,col2,col3
0,test1,13.0,
1,test2,44.0,aaa
2,test3,,


In [30]:
# No column is entirely NaN, so with axis=1 and how="all" nothing is dropped.
df3.dropna(axis=1, how="all")

Unnamed: 0,col1,col2,col3
0,test1,13.0,
1,test2,44.0,aaa
2,test3,,
3,,,


In [31]:
# fillna(0) replaces every NaN in the frame with 0, including those in the
# string columns col1 and col3, which then mix strings and numbers.

df3.fillna(0)

Unnamed: 0,col1,col2,col3
0,test1,13.0,0
1,test2,44.0,aaa
2,test3,0.0,0
3,0,0.0,0


In [32]:
# Fill a single numeric column with 0. Bracket access matches the rest of the
# notebook and, unlike attribute access, cannot collide with DataFrame attributes.
df3["col2"].fillna(0)

0    13.0
1    44.0
2     0.0
3     0.0
Name: col2, dtype: float64

In [33]:
# Fill the string column with a placeholder that matches its type. Bracket
# access matches the rest of the notebook and, unlike attribute access, cannot
# collide with DataFrame attributes.
df3["col1"].fillna("Unknown")

0      test1
1      test2
2      test3
3    Unknown
Name: col1, dtype: object