# Data Cleaning

It seems quite a number of 'intended' challenges have been put into the dataset file. 😂  
Pandas `read_csv` doesn't work on it out of the box.

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv

## Error using pandas.read_csv

In [85]:
filename = 'data/dataset.csv'

# The raw file is malformed, so pandas is expected to fail on it. The failure
# is the point of this cell: catch it and display the message instead of
# letting the exception stop a "Restart & Run All".
try:
    src = pd.read_csv(filename)
except pd.errors.ParserError as err:
    print(f"{type(err).__name__}: {err}")

ParserError: Error tokenizing data. C error: Expected 27 fields in line 34792, saw 32


## Use csv.reader

In [86]:
filename = 'data/dataset.csv'

# Opening with newline='' leaves line-ending handling to the csv module, so
# newline characters embedded in quoted address fields stay inside the field.
with open(filename, 'r', newline='') as file:
    reader = csv.reader(file, quotechar='"', delimiter=',', doublequote=True)
    rows = list(reader)

# First field of every record, for a quick check of the ids. csv.reader yields
# an empty list for a blank line, so guard against indexing into it.
firsts = [row[0] if row else '' for row in rows]
# Number of parsed fields per record, used to spot malformed rows.
ncols = [len(row) for row in rows]

src = pd.DataFrame({'first': firsts, 'row': rows, 'ncol': ncols})
print(src.shape)
src.head()

(316827, 3)


Unnamed: 0,first,row,ncol
0,id,"[id, loan_amnt, term, int_rate, installment, g...",27
1,1,"[1, 10000, 60 months, 18.55, 256.94, E, E2, S...",27
2,2,"[2, 12250, 36 months, 11.55, 404.25, B, B3, c...",27
3,3,"[3, 4200, 36 months, 17.77, 151.36, D, D1, MV...",27
4,4,"[4, 6000, 36 months, 15.22, 208.64, C, C3, Co...",27


In [87]:
# records whose parsed field count is not the 27 fields of the header row
src.loc[src['ncol'].ne(27)]

Unnamed: 0,first,row,ncol
271,356,"[356, 20000, 36 months, 14.33, 686.77, C, C1,...",9
34791,97478,"[97478, 21500, 60 months, 18.25, 548.89, D, D...",32
78154,West Katiefort,"[West Katiefort, SD 93700""]",2


The row just before 78154 (row 78153) also seems to have a problem in its address: fields that look like they belong to the next record have been merged into it.

In [88]:
# parsed fields of the record at position 78153 (the one right before the 2-field fragment)
src['row'].iloc[78153]

['54313',
 '9000',
 ' 36 months',
 '7.49',
 '279.92',
 'A',
 'A4',
 'friendship center golf cars',
 '4 years',
 'RENT',
 '40000.0',
 'Not Verified',
 'Fully Paid',
 'debt_consolidation',
 'personal loan',
 '22.41',
 'Apr-2000',
 '15',
 '0',
 '13414',
 '21.9',
 '29',
 'f',
 'INDIVIDUAL',
 '',
 '0.0',
 '49842 Deanna Street Apt. 820\r\nNorth Deborah, Mn,Debt consolidation,37.1,Sep-2004,24,0,5293,8.4,53,w,INDIVIDUAL,4.0,0.0,49778 Nancy Lake Apt. 783']

## Exclude problematic records

In [89]:
# The first parsed record (position 0) is the header: its fields are the column names.
col_names = src['row'].iloc[0]
# Positions to drop: the header itself (0), the records with a wrong field
# count found above (271, 34791, 78154), and 78153, whose address field
# swallowed part of the following record.
exclude_rec_no = [0, 271, 34791, 78153, 78154]
# Build the cleaned frame from `src` (the frame created by the csv.reader cell);
# the previous reference to `temp` only worked with leftover kernel state.
df = pd.DataFrame(src.loc[~src.index.isin(exclude_rec_no), 'row'].tolist(), columns=col_names)
df.shape

(316822, 27)

In [90]:
# preview the first five cleaned records
df.head(n=5)

Unnamed: 0,id,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,1,10000,60 months,18.55,256.94,E,E2,Sr. Director OF Marketing & Business Dev,5 years,MORTGAGE,...,16,0,151198,92.4,31,f,INDIVIDUAL,3.0,0.0,"257 Smith Circles Apt. 186\r\nWilsonland, NH 1..."
1,2,12250,36 months,11.55,404.25,B,B3,clearwater towing,2 years,OWN,...,8,0,16125,79.4,16,f,INDIVIDUAL,0.0,0.0,"632 Villanueva View\r\nLawrencechester, NM 00813"
2,3,4200,36 months,17.77,151.36,D,D1,MV Transportation,4 years,RENT,...,6,0,9525,73.8,7,f,INDIVIDUAL,0.0,0.0,"612 Gillespie Island\r\nDavisside, KY 93700"
3,4,6000,36 months,15.22,208.64,C,C3,County of Santa Clara,10+ years,RENT,...,9,0,12832,82.8,16,w,INDIVIDUAL,0.0,0.0,"781 Jessica Trail\r\nRyanton, FL 05113"
4,5,16200,36 months,10.16,523.95,B,B1,Memorial Hermann,< 1 year,MORTGAGE,...,8,0,27084,71.7,24,f,INDIVIDUAL,4.0,0.0,"261 Ashley Knolls\r\nNew Victor, CO 70466"


In [92]:
# persist the cleaned records; index=False keeps the positional index out of the file
df.to_csv(path_or_buf='data/00_dataset_cleansed.csv', index=False)

In [93]:
# Round-trip check: the cleansed file should now load with plain pandas.read_csv.
# Note that this rebinds `src` to the re-read, typed frame.
src = pd.read_csv(filepath_or_buffer='data/00_dataset_cleansed.csv')
print(src.shape)
src.info()

(316822, 27)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316822 entries, 0 to 316821
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    316822 non-null  int64  
 1   loan_amnt             316822 non-null  int64  
 2   term                  316822 non-null  object 
 3   int_rate              316822 non-null  float64
 4   installment           316822 non-null  float64
 5   grade                 316822 non-null  object 
 6   sub_grade             316822 non-null  object 
 7   emp_title             298748 non-null  object 
 8   emp_length            302489 non-null  object 
 9   home_ownership        316822 non-null  object 
 10  annual_inc            316822 non-null  float64
 11  verification_status   316822 non-null  object 
 12  loan_status           316822 non-null  object 
 13  purpose               316822 non-null  object 
 14  title                 315430 non-null  