In [88]:
import pandas as pd

In [89]:
# load data
df = pd.read_csv("incomplete_test.csv")

In [90]:
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
2,Central,187500,house,2005.0,
3,South,256000,house,,5.0
4,Lapland,156000,house,2011.0,5.0
5,South,176000,apartment,1997.0,
6,Central,367400,house,,5.0
7,Central,166000,apartment,1981.0,3.0
8,South,249000,apartment,2004.0,4.0


In [None]:
# we have two missing values in two different columns
df.isna().sum()

area         0
price        0
category     0
year         2
condition    2
dtype: int64

### Option 1: Just remove all rows with any missing values

This is mostly used when we have so much data, it doesn't hurt to just remove all incomplete rows. With smaller datasets, be careful with this one (we often remove too much data).

In [92]:
# quickest way to solve this => just remove any row
# having any missing value (NaN => Not A Number)
df = df.dropna()

In [93]:
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
4,Lapland,156000,house,2011.0,5.0
7,Central,166000,apartment,1981.0,3.0
8,South,249000,apartment,2004.0,4.0


In [94]:
# notice, after we dropped incomplete rows
# the index numbering is now broken (missing all the removed row indeces)
# this is not compulsorty, buf if you want to re-calculate a new index, try this:
df = df.reset_index()
df = df.drop("index", axis=1)

In [None]:
# we lost 4 rows of data due to removing all NaNs
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
2,Lapland,156000,house,2011.0,5.0
3,Central,166000,apartment,1981.0,3.0
4,South,249000,apartment,2004.0,4.0


### Option 2: instead of removing every row with missing value, REPLACE missing values

Replacing missing values with some other values is called IMPUTATION.

In [96]:
# re-load data so we have the missing values again
df = pd.read_csv("incomplete_test.csv")
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
2,Central,187500,house,2005.0,
3,South,256000,house,,5.0
4,Lapland,156000,house,2011.0,5.0
5,South,176000,apartment,1997.0,
6,Central,367400,house,,5.0
7,Central,166000,apartment,1981.0,3.0
8,South,249000,apartment,2004.0,4.0


In [97]:
# based on the data itself, you can use some default value
# for missing values, for example default condition could be 3
df.fillna({"condition": 3}, inplace=True)

# inplace=True means we don't reassign the df like usually
# for example:
# instead of df = df.dropna(), we could just have df.dropna(inplace=True)

In [98]:
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
2,Central,187500,house,2005.0,3.0
3,South,256000,house,,5.0
4,Lapland,156000,house,2011.0,5.0
5,South,176000,apartment,1997.0,3.0
6,Central,367400,house,,5.0
7,Central,166000,apartment,1981.0,3.0
8,South,249000,apartment,2004.0,4.0


In [99]:
# let's use the average year as the year
df.fillna({"year": df['year'].mean()}, inplace=True)

# modify year so it's an integer without decimal
df['year'] = df['year'].astype(int)

In [100]:
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972,2.0
1,Lapland,89000,apartment,1984,3.0
2,Central,187500,house,2005,3.0
3,South,256000,house,1993,5.0
4,Lapland,156000,house,2011,5.0
5,South,176000,apartment,1997,3.0
6,Central,367400,house,1993,5.0
7,Central,166000,apartment,1981,3.0
8,South,249000,apartment,2004,4.0


### Option 3: Use category -based imputation (groupby())

In [101]:
area_means = df.groupby("area").mean(numeric_only=True)
area_means

Unnamed: 0_level_0,price,year,condition
area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Central,240300.0,1993.0,3.666667
Lapland,123000.0,1989.0,3.333333
South,227000.0,1998.0,4.0


In [None]:
# with this information, we could make imputation
# where we use the area information in order to fill in
# the missing values => better representation of the original
# distribution and statistics