# Step 1: As in P1.5

In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
# Load the spreadsheet into a DataFrame. The path is relative, so it is
# resolved against the directory the notebook is run from.
df = pd.read_excel('data/mixed_data.xls')

The file is available on Nestor.

We start our exploratory analysis by inspecting the size of the data file.

In [3]:
# (number of rows, number of columns) of the loaded frame
df.shape

(30000, 26)

So this file has 30000 records and 26 fields (columns). Note that this is one more column than before. 

# Step 2: Check Data Integrity

Check the data headings. 

In [4]:
# Column labels, so the headings can be checked by eye
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'DOB',
       'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1',
       'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
       'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [5]:
# Preview the first rows (head() shows five by default)
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,DOB,PAY_1,PAY_2,PAY_3,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,b15e78fe-a223,500000,1,1,1,42,1900-01-01,-2,-1,0,...,6420,49240,6772,31305,43864,6453,49470,3723,103686,0
1,689ae71c-3f4b,30000,1,2,1,36,1900-01-01,1,-1,-1,...,1170,780,0,780,0,1170,0,0,0,0
2,3dc1d96e-35c2,50000,1,1,1,0,1981-10-21,0,0,0,...,31240,20343,12349,2017,3000,2000,3007,1003,1500,0
3,f786754a-eb0b,50000,1,3,1,0,1982-10-22,Not available,-1,-1,...,1961,1261,2681,13196,2866,1961,1261,2681,1261,0
4,e6a3c5a6-7647,200000,2,1,2,36,1900-01-01,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,1


The action we propose to take is this. 
When a non-zero age value is present, the data record can keep it. However, when it is not present, we need to estimate the age from the date of birth. Let's try to do this. 
First, let's try to understand how many records without age we have.

In [6]:
# Tally how often each AGE value occurs (most frequent first), then show the
# five most common values.
id_counts = df.loc[:, 'AGE'].value_counts()
id_counts.head(5)

0     15217
29      780
27      734
30      714
28      709
Name: AGE, dtype: int64

Just over half of our records (15217 out of 30000) are without a valid entry for the age. 

In [7]:
# Keep only the records whose AGE is the placeholder value 0.
noage = df.loc[df['AGE'].eq(0)]

In [8]:
# (rows, columns) of the zero-age subset
noage.shape

(15217, 26)

That's good! We have isolated the records with invalid age values in the 'noage' dataframe. 

Some homework now: try to extract the age from the Date of Birth: this will allow our data set to be further used and we will have solved the 'alignment' problem in our data. 

In [9]:
# source: https://stackoverflow.com/questions/26788854/pandas-get-the-age-from-a-date-example-date-of-birth
def calculate_age(born: datetime.date, today=None):
    """Return the age in completed years of someone born on ``born``.

    Parameters
    ----------
    born : datetime.date
        Date of birth. Any object exposing ``year``, ``month`` and ``day``
        works (e.g. ``pandas.Timestamp``). Missing values (``NaT``/``None``/
        ``NaN``) are accepted.
    today : datetime.date, optional
        Reference date the age is measured at. Defaults to the current date;
        pass a fixed date to make the result reproducible.

    Returns
    -------
    int or float
        Whole years elapsed between ``born`` and ``today``; ``numpy.nan`` when
        ``born`` is missing.
    """
    # A missing date of birth cannot yield an age; propagate it as NaN.
    if pd.isna(born):
        return np.nan

    if today is None:
        today = datetime.date.today()

    # Subtracting the years alone overstates the age by one for anybody whose
    # birthday is still to come in the reference year, so take that year off.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending

# Build a new frame (leaving `noage` untouched) in which the placeholder AGE
# is replaced by the age derived from each record's date of birth.
aged = noage.assign(AGE=noage["DOB"].apply(calculate_age))
aged

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,DOB,PAY_1,PAY_2,PAY_3,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
2,3dc1d96e-35c2,50000,1,1,1,42.0,1981-10-21,0,0,0,...,31240,20343,12349,2017,3000,2000,3007,1003,1500,0
3,f786754a-eb0b,50000,1,3,1,41.0,1982-10-22,Not available,-1,-1,...,1961,1261,2681,13196,2866,1961,1261,2681,1261,0
8,481cbaf6-493a,50000,1,2,2,36.0,1987-10-27,1,2,2,...,42805,39207,50155,0,21220,13,1510,15018,21,0
9,6ac2e9cb-d215,200000,2,2,1,48.0,1975-10-15,1,-2,-2,...,0,0,0,0,0,0,0,0,0,0
11,97db78db-8e8b,230000,2,1,1,63.0,1960-09-30,Not available,-2,-2,...,1443,1443,1443,1443,1443,1443,1443,1443,1443,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29992,1063c73e-65ec,30000,2,3,1,46.0,1977-10-17,2,0,0,...,30146,26371,27567,1452,2000,1500,933,2000,3000,1
29994,1ec4289c-4f52,50000,1,1,2,29.0,1994-11-03,2,2,0,...,29506,3672,8346,0,2354,1410,1500,4900,700,1
29995,370d8323-eded,170000,2,2,2,41.0,1982-10-22,-1,-1,-1,...,190,4786,2714,15643,1108,191,4786,2714,7714,0
29998,aa8ddf26-379c,80000,2,1,1,49.0,1974-10-14,-1,-1,-1,...,18351,14249,10637,17253,35369,18351,14249,10637,12331,0
