# Pandas Tricks for Imputing Missing Data
## Dealing with nulls
##### By Matthew Gerardino
---

In [1]:
#imports
import pandas as pd 

In [2]:
# load the semicolon-separated wine reviews file and preview the first rows
df = pd.read_csv(
    "../Data/winemag-data_first150k.csv",
    delimiter=';',
)
df.head(5)

Unnamed: 0,country,designation,points,price,province,region_1,region_2,variety,winery,last_year_points
0,US,Martha's Vineyard,96.0,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,94
1,Spain,Carodorum Selección Especial Reserva,96.0,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,92
2,US,Special Selected Late Harvest,96.0,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,100
3,US,Reserve,96.0,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,94
4,France,La Brûlade,95.0,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude,94


In [3]:
# show the dtype of every column in df (one entry per column)
df.dtypes

country              object
designation          object
points              float64
price               float64
province             object
region_1             object
region_2             object
variety              object
winery               object
last_year_points      int64
dtype: object

In [4]:
# count of missing values per column
df.isna().sum()

country                 2
designation         43826
points                  5
price               13396
province                7
region_1            23845
region_2            85659
variety                 5
winery                  5
last_year_points        0
dtype: int64

In [5]:
# names of the columns that contain at least one missing value
df.columns[df.isna().any()].tolist()

['country',
 'designation',
 'points',
 'price',
 'province',
 'region_1',
 'region_2',
 'variety',
 'winery']

In [6]:
# collect the names of columns with missing values using an explicit loop
col_nul = []
for name, has_null in df.isnull().any().items():
    if has_null:
        col_nul.append(name)
print(col_nul)

['country', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'variety', 'winery']


---
#### The _price_ column contains 13396 missing values. We can replace these missing values using the ```.fillna()``` method. For example, let’s fill in the missing values with the mean price:

In [7]:
# Replace missing prices with the mean price (mean() skips NaN by default).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['price']: that chained in-place call acts on
# a temporary selection, raises a FutureWarning on pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
df['price'] = df['price'].fillna(df['price'].mean())

In [8]:
# re-count missing values per column; price should now show 0
df.isna().sum()

country                 2
designation         43826
points                  5
price                   0
province                7
region_1            23845
region_2            85659
variety                 5
winery                  5
last_year_points        0
dtype: int64

In [9]:
# Impute every column that still has nulls with that column's mode (its most
# frequent value). Only columns that actually contain missing values are
# visited, and each filled column is assigned back to df instead of using a
# chained fillna(..., inplace=True), which does not update df under pandas
# Copy-on-Write.
cols_with_nulls = df.columns[df.isnull().any()]

for col in cols_with_nulls:
    col_modes = df[col].mode()  # mode() ignores NaN by default
    if col_modes.empty:
        # the column is entirely null, so there is no mode to fill with;
        # leave it as-is so the null count below still reports it
        continue
    df[col] = df[col].fillna(col_modes.iloc[0])

In [10]:
# final check: every column should report 0 missing values
df.isna().sum()

country             0
designation         0
points              0
price               0
province            0
region_1            0
region_2            0
variety             0
winery              0
last_year_points    0
dtype: int64