In [2]:
import pandas as pd

In [10]:
df = pd.read_csv('../height_weight.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
sex       200 non-null object
weight    200 non-null int64
height    200 non-null int64
repwt     183 non-null float64
repht     183 non-null float64
dtypes: float64(2), int64(2), object(1)
memory usage: 7.9+ KB


In [9]:
df.drop(columns=['repwt'])

Unnamed: 0,sex,weight,height,repht
0,M,77,182,180.0
1,F,58,161,159.0
2,F,53,161,158.0
3,M,68,177,175.0
4,F,59,157,155.0
...,...,...,...,...
195,M,74,175,175.0
196,M,83,180,180.0
197,M,81,175,
198,M,90,181,178.0


In [11]:
df

Unnamed: 0,sex,weight,height,repwt,repht
0,M,77,182,77.0,180.0
1,F,58,161,51.0,159.0
2,F,53,161,54.0,158.0
3,M,68,177,70.0,175.0
4,F,59,157,59.0,155.0
...,...,...,...,...,...
195,M,74,175,71.0,175.0
196,M,83,180,80.0,180.0
197,M,81,175,,
198,M,90,181,91.0,178.0


In [6]:
# execute only if you are sure
df.drop(columns=['repwt'],inplace=True)

In [7]:
df

Unnamed: 0,sex,weight,height,repht
0,M,77,182,180.0
1,F,58,161,159.0
2,F,53,161,158.0
3,M,68,177,175.0
4,F,59,157,155.0
...,...,...,...,...
195,M,74,175,175.0
196,M,83,180,180.0
197,M,81,175,
198,M,90,181,178.0


# forward fill from the last known not null value

In [12]:
# forward fill from the last known not null value
df['repwt'].fillna(method='ffill').head(20)

0      77.0
1      51.0
2      54.0
3      70.0
4      59.0
5      76.0
6      77.0
7      73.0
8      71.0
9      64.0
10     75.0
11     56.0
12     52.0
13     64.0
14     57.0
15     66.0
16    101.0
17     62.0
18     75.0
19     61.0
Name: repwt, dtype: float64

In [13]:
df['repwt'].fillna(method='bfill',inplace=True) # inplace=True if you want to persist dataframe

In [14]:
df['repwt'].head(20)

0      77.0
1      51.0
2      54.0
3      70.0
4      59.0
5      76.0
6      77.0
7      73.0
8      71.0
9      64.0
10     75.0
11     56.0
12     52.0
13     64.0
14     57.0
15     66.0
16    101.0
17     62.0
18     75.0
19     61.0
Name: repwt, dtype: float64

In [16]:
# Continue this step only if time permits, as we try to apply interpolate. First see how the series looks like
df = pd.read_csv('../height_weight.csv')
df[df['repwt'].notnull()]['repwt'].head(20)

0      77.0
1      51.0
2      54.0
3      70.0
4      59.0
5      76.0
6      77.0
7      73.0
8      71.0
9      64.0
10     75.0
11     56.0
12     52.0
13     64.0
14     57.0
15     66.0
16    101.0
17     62.0
18     75.0
19     61.0
Name: repwt, dtype: float64

# Apply linear interpolate method

In [18]:
# Apply linear interpolate method
# Various methods like linear, polynomial - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html
df['repwt'].interpolate(method='linear').head(20)

0      77.0
1      51.0
2      54.0
3      70.0
4      59.0
5      76.0
6      77.0
7      73.0
8      71.0
9      64.0
10     75.0
11     56.0
12     52.0
13     64.0
14     57.0
15     66.0
16    101.0
17     62.0
18     75.0
19     61.0
Name: repwt, dtype: float64

In [49]:
# Get rows if there is at least one null value in the row, and count # of such rows by year it belongs to
# This is a typical use case scenario where we want to know if there was an outage for OLTP transactions between a time
df.loc[df.isnull().any(axis=1),'YYYYMMDD'].value_counts()

20160821    20
20160809    12
20160822    11
Name: YYYYMMDD, dtype: int64

In [19]:
df

Unnamed: 0,sex,weight,height,repwt,repht
0,M,77,182,77.0,180.0
1,F,58,161,51.0,159.0
2,F,53,161,54.0,158.0
3,M,68,177,70.0,175.0
4,F,59,157,59.0,155.0
...,...,...,...,...,...
195,M,74,175,71.0,175.0
196,M,83,180,80.0,180.0
197,M,81,175,,
198,M,90,181,91.0,178.0


In [20]:
# dropna will drop all rows that have at least one null value.
nulls_dropped = df.dropna()

In [21]:
# Observe now that all columns have equal row #, however Index still goes from 0 to 8783 e.g. nulls_dropped[5300:5310]
nulls_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181 entries, 0 to 199
Data columns (total 5 columns):
sex       181 non-null object
weight    181 non-null int64
height    181 non-null int64
repwt     181 non-null float64
repht     181 non-null float64
dtypes: float64(2), int64(2), object(1)
memory usage: 8.5+ KB


# Drop rows based on threshold

In [23]:
# Instead of dropping rows that contains at least one null value, use Thresh=n, which determines the threshold under which rows should be dropped
# For e.g if thresh=2, drop only rows that have less than 7 measurements (i.e. less than 7 non-null values)
drop_thresh = df.dropna(thresh=2)
drop_thresh[drop_thresh.isnull().any(axis=1)]

Unnamed: 0,sex,weight,height,repwt,repht
46,M,73,180,,
47,F,49,161,,
54,M,64,177,,
56,F,66,170,65.0,
75,F,50,171,,
99,F,55,155,,154.0
124,M,67,179,,
125,F,52,169,56.0,
126,F,47,153,,154.0
137,F,62,167,,


In [24]:
# Generally we don't drop all rows, a better way to to handle how to fill the rows. Lets identify the rows to fill first
rows_to_fill = df.isnull().any(axis=1)
df[rows_to_fill]

Unnamed: 0,sex,weight,height,repwt,repht
46,M,73,180,,
47,F,49,161,,
54,M,64,177,,
56,F,66,170,65.0,
75,F,50,171,,
99,F,55,155,,154.0
124,M,67,179,,
125,F,52,169,56.0,
126,F,47,153,,154.0
137,F,62,167,,


# Fill all null values with mean of the columns

In [25]:
# Fill all null values with mean of the columns
nulls_filled = df.fillna(df.mean())

In [26]:
# Get all rows that were filled and observe that columns RAIN, SNOW , THUNDER etc. are 0-1 values. I don't want that.
# Would rather prefer 0 or 1
nulls_filled[rows_to_fill]

Unnamed: 0,sex,weight,height,repwt,repht
46,M,73,180,65.622951,168.497268
47,F,49,161,65.622951,168.497268
54,M,64,177,65.622951,168.497268
56,F,66,170,65.0,168.497268
75,F,50,171,65.622951,168.497268
99,F,55,155,65.622951,154.0
124,M,67,179,65.622951,168.497268
125,F,52,169,56.0,168.497268
126,F,47,153,65.622951,154.0
137,F,62,167,65.622951,168.497268


# Sometimes mode is better than mean

In [27]:
# Mode is a better value to fill in this case. Since there might be multiple values returned for mode, we get the first value
nulls_filled = df.fillna(df.mode().iloc[0])

In [28]:
# Verify again that now columns ICE are filled with mode, which is more appropriate
nulls_filled[rows_to_fill]

Unnamed: 0,sex,weight,height,repwt,repht
46,M,73,180,55.0,165.0
47,F,49,161,55.0,165.0
54,M,64,177,55.0,165.0
56,F,66,170,65.0,165.0
75,F,50,171,55.0,165.0
99,F,55,155,55.0,154.0
124,M,67,179,55.0,165.0
125,F,52,169,56.0,165.0
126,F,47,153,55.0,154.0
137,F,62,167,55.0,165.0


In [29]:
# At this point, I feel good about the result, so want to save it in original dataframe
df.fillna(df.mode().iloc[0],inplace=True)

In [30]:
df

Unnamed: 0,sex,weight,height,repwt,repht
0,M,77,182,77.0,180.0
1,F,58,161,51.0,159.0
2,F,53,161,54.0,158.0
3,M,68,177,70.0,175.0
4,F,59,157,59.0,155.0
...,...,...,...,...,...
195,M,74,175,71.0,175.0
196,M,83,180,80.0,180.0
197,M,81,175,55.0,165.0
198,M,90,181,91.0,178.0


In [31]:
# Now verify if there are any null values in the entire dataframe
df.isnull().any()

sex       False
weight    False
height    False
repwt     False
repht     False
dtype: bool