# Reasons for Missing Values
- Inaccurate measurements
- Data does not exist (e.g. unanswered questionnaires)

# Problems with Missing Values

# Solutions for missing values
- Removing rows
- Removing columns
- Imputation (replacing with known values)
- Do nothing (often useful for descriptive analytics)

In [1]:
import pandas as pd

In [2]:
# Load the UFO reports CSV from its short URL into a DataFrame.
ufo = pd.read_csv(filepath_or_buffer='http://bit.ly/uforeports')

In [3]:
# Preview the first rows of the UFO reports.
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [4]:
# Preview the last rows of the UFO reports.
ufo.tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00
18238,Eagle River,,,WI,12/31/2000 23:45
18239,Eagle River,RED,LIGHT,WI,12/31/2000 23:45
18240,Ybor,,OVAL,FL,12/31/2000 23:59


In [None]:
# Show the type of the .loc accessor. type must be called with parentheses;
# subscripting it with [] does not return the object's type.
type(ufo.loc)

In [49]:
# Boolean frame that is True wherever a value is missing (first rows only).
ufo.isnull().head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,False,True,False,False,False
1,False,True,False,False,False
2,False,True,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False


In [7]:
# Boolean frame that is True wherever a value is present (first rows only).
ufo.notnull().head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,True,False,True,True,True
1,True,False,True,True,True
2,True,False,True,True,True
3,True,False,True,True,True
4,True,False,True,True,True


In [50]:
# Count the missing values in each column: sum() uses axis=0 by default,
# which adds down the rows and yields one total per column.
ufo.isnull().sum(axis=0)

City                  25
Colors Reported    15359
Shape Reported         0
State                  0
Time                   0
dtype: int64

In [51]:
# Summing a boolean Series counts its True values (True adds 1, False adds 0).
pd.Series([True, False, True]).sum()

2

In [39]:
# isnull() is also a Series method: ufo.City.isnull() returns a boolean Series
# that is used here as a mask to select the rows whose City is missing.
ufo[ufo.City.isnull()].head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
21,,,Various,LA,8/15/1943 0:00
22,,,LIGHT,LA,8/15/1943 0:00
204,,,DISK,CA,7/15/1952 12:30
241,,BLUE,DISK,MT,7/4/1953 14:00
613,,,DISK,NV,7/1/1960 12:00


In [48]:
# Show the City column with missing values replaced by 'Grad'.
# The result is only displayed, not assigned, so ufo keeps its missing values.
ufo.City.fillna('Grad')

0                      Ithaca
1                 Willingboro
2                     Holyoke
3                     Abilene
4        New York Worlds Fair
5                 Valley City
6                 Crater Lake
7                        Alma
8                     Eklutna
9                     Hubbard
10                    Fontana
11                   Waterloo
12                     Belton
13                     Keokuk
14                  Ludington
15                Forest Home
16                Los Angeles
17                  Hapeville
18                     Oneida
19                 Bering Sea
20                   Nebraska
21                       Grad
22                       Grad
23                  Owensboro
24                 Wilderness
25                  San Diego
26                 Wilderness
27                     Clovis
28                 Los Alamos
29               Ft. Duschene
                 ...         
18211                 Holyoke
18212                  Carson
18213     

# What to do with Missing values
- Drop rows
- Drop columns
- Impute

In [52]:
# Drop every row that has a missing value in at least one column.
# The result is not assigned, so ufo is unchanged; dropna() also accepts an
# inplace parameter.
ufo.dropna(how='any').shape

(2877, 5)

In [53]:
# Drop only the rows in which every column is missing.
ufo.dropna(how='all').shape

(18241, 5)

In [31]:
# Look only at City and Shape Reported: drop a row if either one is missing.
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape

(18216, 5)

In [32]:
# how='any' drops a row if either listed attribute is null, so this repeats the
# cell above; use how='all' to drop only rows where both attributes are null.
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape

(18216, 5)

In [54]:
# Frequency of each reported shape; value_counts() leaves out missing values
# by default.
ufo['Shape Reported'].value_counts()

LIGHT        2803
Various      2644
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
CRESCENT        2
ROUND           2
FLARE           1
HEXAGON         1
DOME            1
PYRAMID         1
Name: Shape Reported, dtype: int64

In [26]:
# dropna=False keeps missing values in the tally, so NaN gets its own row.
ufo['Shape Reported'].value_counts(dropna=False)

LIGHT        2803
NaN          2644
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
CRESCENT        2
ROUND           2
FLARE           1
DOME            1
HEXAGON         1
PYRAMID         1
Name: Shape Reported, dtype: int64

In [27]:
# Impute missing shapes with the label 'Various'.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form operates on an intermediate object,
# is deprecated in pandas 2.x, and does not update ufo under copy-on-write.
ufo['Shape Reported'] = ufo['Shape Reported'].fillna(value='Various')

In [28]:
# Count the shapes again, still including NaN if any are left.
ufo['Shape Reported'].value_counts(dropna=False)

LIGHT        2803
Various      2644
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
CRESCENT        2
ROUND           2
FLARE           1
HEXAGON         1
DOME            1
PYRAMID         1
Name: Shape Reported, dtype: int64

# Replace with NaN

In [55]:
# One record per player; -9999 and -8888 are placeholder values in 'Assists'
# that the cells below replace.
ld = [
    {
        'Player': 'Milos Teodosic',
        'Points': 6,
        'Assists': 10,
    },
    {
        'Player': 'Bogdan Bogdanovic',
        'Points': 15,
        'Assists': -9999,
    },
    {
        'Player': 'Nikola Jokic',
        'Points': 22,
        'Assists': -8888,
    },
]

In [56]:
# Build a DataFrame from the list of player records (one row per dict).
players = pd.DataFrame(data=ld)

In [57]:
# Display the players table.
players.head()

Unnamed: 0,Assists,Player,Points
0,10,Milos Teodosic,6
1,-9999,Bogdan Bogdanovic,15
2,-8888,Nikola Jokic,22


In [58]:
import numpy as np
# Replace every -9999 with NaN; the result is displayed and players is left
# unchanged. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
players.replace(-9999, np.nan)

Unnamed: 0,Assists,Player,Points
0,10.0,Milos Teodosic,6
1,,Bogdan Bogdanovic,15
2,-8888.0,Nikola Jokic,22


In [47]:
# Replace every -9999 and -8888 with 0 in a single call.
players.replace([-9999, -8888], 0)

Unnamed: 0,Assists,Player,Points
0,10,Milos Teodosic,6
1,0,Bogdan Bogdanovic,15
2,0,Nikola Jokic,22


In [None]:
# Use a dictionary to replace different values in different columns:
# -9999 in 'Points' and -8888 in 'Assists' are both replaced with NaN.
# The frame is `players`, whose columns match the dictionary keys; the dict
# entries are comma-separated and np.nan replaces the removed np.NaN alias.
new_df = players.replace({
        'Points': -9999,
        'Assists': -8888,
    }, np.nan)

In [None]:
# Replacement in general: with a single dict argument, each key is a value to
# look for and the mapped entry is what replaces it.
# NOTE(review): `df` is not defined in the visible cells — point this at the
# DataFrame being cleaned before running.
new_df = df.replace({
        'att1': np.nan,
        'att2': -234345,
        'event': '0',
    })

# Advanced (Fancy) imputation
- https://pypi.python.org/pypi/fancyimpute