# Handling missing data

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('datasets/diabetes.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
pregnancies    768 non-null int64
glucose        768 non-null int64
diastolic      768 non-null int64
triceps        768 non-null int64
insulin        768 non-null int64
bmi            768 non-null float64
dpf            768 non-null float64
age            768 non-null int64
diabetes       768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


From info(), it doesn't look like missing values presented. But missing values can come in other forms such as zero, -1, ?, etc. Looking at head() we can see some features with zeroes.

In [4]:
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


insulin and triceps can't be zero, so this sure is missing data.

## Dropping missing data
- Replacing with np.nan
- One way is to drop missing data

In [10]:
df.insulin.replace(0, np.nan, inplace=True)
df.triceps.replace(0, np.nan, inplace=True)
df.bmi.replace(0, np.nan, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
pregnancies    768 non-null int64
glucose        768 non-null int64
diastolic      768 non-null int64
triceps        541 non-null float64
insulin        394 non-null float64
bmi            757 non-null float64
dpf            768 non-null float64
age            768 non-null int64
diabetes       768 non-null int64
dtypes: float64(4), int64(5)
memory usage: 54.1 KB


In this example, dropping NAs causes sigificant loss in data. This is unacceptable.

In [12]:
df.dropna().shape

(393, 9)

## Imputing missing data
- Making an educated guess about the missing values
- Example: Using the mean of non-missing entries
- Imputer = Transformer

In [37]:
X = df.drop('diabetes', axis=1)
y = df['diabetes']
X.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age
0,6,148,72,35.0,,33.6,0.627,50
1,1,85,66,29.0,,26.6,0.351,31
2,8,183,64,,,23.3,0.672,32
3,1,89,66,23.0,94.0,28.1,0.167,21
4,0,137,40,35.0,168.0,43.1,2.288,33


In [16]:
from sklearn.preprocessing import Imputer

# Instantiate imputer. Note that axis=0 is imputing along columns
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [30]:
imp.transform(X)

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

## Imputing within a pipeline
- In a pipeple, each step except the last must be a **transformer**
- The last must be an **estimator**, such as, classifier or regressor

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
logreg = LogisticRegression()

#### Construct a pipeline with designated steps

In [45]:
steps = [('imputation', imp),
         ('logistic_regression', logreg)]

pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y,
            test_size=0.3, random_state=42)

#### Using pipeline: fit, predict, score

In [46]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

pipeline.score(X_test, y_test)

0.7619047619047619

# Exercise: US Voting Dataset

In [74]:
colnames = ['party', 'infants', 'water', 'budget', 'physician', 'salvador',
       'religious', 'satellite', 'aid', 'missile', 'immigration', 'synfuels',
       'education', 'superfund', 'crime', 'duty_free_exports', 'eaa_rsa']

lookup = dict(y=1, n=0)
converters = { c : lambda x : lookup.get(x, x) for c in colnames[1:]}

In [76]:
df = pd.read_csv('datasets/house-votes-84.csv',names=colnames, converters=converters)
df.head()

Unnamed: 0,party,infants,water,budget,physician,salvador,religious,satellite,aid,missile,immigration,synfuels,education,superfund,crime,duty_free_exports,eaa_rsa
0,republican,0,1,0,1,1,1,0,0,0,1,?,1,1,1,0,1
1,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,?
2,democrat,?,1,1,?,1,1,0,0,0,0,1,0,1,1,0,0
3,democrat,0,1,1,0,?,1,0,0,0,0,1,0,1,0,0,1
4,democrat,1,1,1,0,1,1,0,0,0,0,1,?,1,1,1,1


### Convert ? to NaN
? : missing value

In [82]:
df[df == '?'] = np.nan
print(df.isnull().sum())

party                  0
infants               12
water                 48
budget                11
physician             11
salvador              15
religious             11
satellite             14
aid                   15
missile               22
immigration            7
synfuels              21
education             31
superfund             25
crime                 17
duty_free_exports     28
eaa_rsa              104
dtype: int64


In [83]:
print('Shape of Original DataFrame: {}'.format(df.shape))
print('Shape of DataFrame after dropping all rows with missing values: {}'.format(df.dropna().shape))

Shape of Original DataFrame: (435, 17)
Shape of DataFrame after dropping all rows with missing values: (232, 17)


## Imputing missing data in a ML pipeline
- Using Support Vector Machine (SVM) : Support Vector Classification (SVC)
- Impute with 'most_frequent' strategy

In [94]:
from sklearn.preprocessing import Imputer

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Instantiate the Imputation transformer
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

# Instantiate the SVC classifier
clf = SVC()

# Setup the pipeline with the required steps
steps = [('imputation', imp),
         ('SVM', clf)]

X = df.drop('party', axis=1)
y = df['party']

### Pipeline: split, fit, predict, classification_report

In [96]:
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

# fitting
pipeline.fit(X_train, y_train)
# predicting
y_pred = pipeline.predict(X_test)

print('Confusion matrix:\n{}'.format(confusion_matrix(y_test, y_pred)))
print('Classification report:\n{}'.format(classification_report(y_test, y_pred)))

Confusion matrix:
[[82  3]
 [ 1 45]]
Classification report:
             precision    recall  f1-score   support

   democrat       0.99      0.96      0.98        85
 republican       0.94      0.98      0.96        46

avg / total       0.97      0.97      0.97       131

