data source:<BR>
https://www.lendingclub.com/info/download-data.action

In [1]:
import time
import pandas
# Record which pandas release produced the outputs shown in this notebook.
print('Pandas {}'.format(pandas.__version__))

Pandas 0.23.4


In [2]:
# Read the LendingClub export and report how long the load took.
start_time = time.time()
loans_2007 = pandas.read_csv(
    'LoanStats3a.csv',
    skiprows=1,        # skip the first line of the file before parsing
    low_memory=False,
)
elapsed_seconds = round(time.time() - start_time, 2)
print('elapsed:', elapsed_seconds, 'seconds')
# Last expression of the cell: (rows, columns) of the loaded frame.
loans_2007.shape

elapsed: 5.17 seconds


(42538, 145)

In [3]:
# Raise the display limits to 999 rows/columns so wide frames are not elided.
pandas.options.display.max_rows = 999
pandas.options.display.max_columns = 999

In [4]:
# Show the dtype pandas inferred for each column of the loaded frame.
loans_2007.dtypes

id                                             object
member_id                                     float64
loan_amnt                                     float64
funded_amnt                                   float64
funded_amnt_inv                               float64
term                                           object
int_rate                                       object
installment                                   float64
grade                                          object
sub_grade                                      object
emp_title                                      object
emp_length                                     object
home_ownership                                 object
annual_inc                                    float64
verification_status                            object
issue_d                                        object
loan_status                                    object
pymnt_plan                                     object
url                         

# grade column
Suppose we were doing analysis that involved the "grade" column.

Let's look at the contents.

In [5]:
# Preview the first ten values of the "grade" column.
loans_2007['grade'].head(10)

0    B
1    C
2    C
3    C
4    B
5    A
6    C
7    E
8    F
9    B
Name: grade, dtype: object

The "grade" is a letter. 

How many letters are used?

In [6]:
# Count the distinct values that appear in the "grade" column.
loans_2007['grade'].nunique()

7

Since there are not many letters, how many entries are there per grade?

In [7]:
# Tally how many rows carry each grade.
loans_2007['grade'].value_counts()

B    12389
A    10183
C     8740
D     6016
E     3394
F     1301
G      512
Name: grade, dtype: int64

Because it takes only seven distinct values, this variable is better treated as a categorical variable than as a free-form string.

https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html

We can convert the column type using `astype`:

In [8]:
# Replace the string column with an (unordered) categorical built from its values.
loans_2007['grade'] = pandas.Categorical(loans_2007['grade'])

In [9]:
# Confirm the column's dtype after the conversion.
loans_2007['grade'].dtype

CategoricalDtype(categories=['A', 'B', 'C', 'D', 'E', 'F', 'G'], ordered=False)

# home_ownership column

In [10]:
# Preview the first ten values of the "home_ownership" column.
loans_2007['home_ownership'].head(10)

0    RENT
1    RENT
2    RENT
3    RENT
4    RENT
5    RENT
6    RENT
7    RENT
8     OWN
9    RENT
Name: home_ownership, dtype: object

In [11]:
# Count the distinct values that appear in the "home_ownership" column.
loans_2007['home_ownership'].nunique()

5

In [12]:
# Tally how many rows fall under each home-ownership label.
loans_2007['home_ownership'].value_counts()

RENT        20181
MORTGAGE    18959
OWN          3251
OTHER         136
NONE            8
Name: home_ownership, dtype: int64

In [13]:
# Replace the string column with an (unordered) categorical built from its values.
loans_2007['home_ownership'] = pandas.Categorical(loans_2007['home_ownership'])

# revol_util column

In [14]:
# Preview the first ten values of the "revol_util" column.
loans_2007['revol_util'].head(10)

0    83.7%
1     9.4%
2    98.5%
3      21%
4    53.9%
5    28.3%
6    85.6%
7    87.5%
8    32.6%
9    36.5%
Name: revol_util, dtype: object

In [15]:
# Caveat: a direct cast to float fails because the values are strings that
# end in '%'. The ValueError is the point of this cell, so it is caught and
# displayed rather than left uncaught -- an uncaught exception would stop a
# "Restart Kernel -> Run All" at this cell.
try:
    loans_2007['revol_util as percentage'] = loans_2007['revol_util'].astype('float64')
except ValueError as cast_error:
    print('ValueError:', cast_error)

ValueError: could not convert string to float: '63.5%'

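The cast fails because each value is a string ending in `%`. Remove that character first with `Series.str.replace`, then convert to float:

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.replace.html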

In [16]:
# Strip the '%' character, then cast what remains to float.
# regex=False states explicitly that '%' is matched as a literal string; the
# default value of `regex` is not the same in every pandas release.
loans_2007['revol_util as percentage'] = (
    loans_2007['revol_util']
    .str.replace('%', '', regex=False)
    .astype('float64')
)

In [17]:
# Check the first five converted values.
loans_2007['revol_util as percentage'].head(5)

0    83.7
1     9.4
2    98.5
3    21.0
4    53.9
Name: revol_util as percentage, dtype: float64

# How to avoid manually processing all the columns?


In [18]:
# Series mapping each column name (the index) to that column's dtype.
series_of_type_per_column = loans_2007.dtypes

In [19]:
# Display the column-name -> dtype series built in the previous cell.
series_of_type_per_column

id                                              object
member_id                                      float64
loan_amnt                                      float64
funded_amnt                                    float64
funded_amnt_inv                                float64
term                                            object
int_rate                                        object
installment                                    float64
grade                                         category
sub_grade                                       object
emp_title                                       object
emp_length                                      object
home_ownership                                category
annual_inc                                     float64
verification_status                             object
issue_d                                         object
loan_status                                     object
pymnt_plan                                      object
url       

We can loop through all the elements in the series

In [20]:
# Report the number of distinct values in every column whose dtype is still
# 'object'; columns with few distinct values are candidates for 'category'.
# Series.items() is used instead of iteritems(), which is deprecated and was
# removed in pandas 2.0.
for col, col_type in series_of_type_per_column.items():
    if col_type == 'object':
        print(col,'has',loans_2007[col].nunique(),'unique entries')

id has 3 unique entries
term has 2 unique entries
int_rate has 394 unique entries
sub_grade has 35 unique entries
emp_title has 30658 unique entries
emp_length has 11 unique entries
verification_status has 3 unique entries
issue_d has 55 unique entries
loan_status has 4 unique entries
pymnt_plan has 1 unique entries
desc has 28963 unique entries
purpose has 14 unique entries
title has 21264 unique entries
zip_code has 837 unique entries
addr_state has 50 unique entries
earliest_cr_line has 530 unique entries
revol_util has 1119 unique entries
initial_list_status has 1 unique entries
last_pymnt_d has 112 unique entries
next_pymnt_d has 98 unique entries
last_credit_pull_d has 138 unique entries
application_type has 1 unique entries
hardship_flag has 1 unique entries
disbursement_method has 1 unique entries
debt_settlement_flag has 2 unique entries
debt_settlement_flag_date has 63 unique entries
settlement_status has 3 unique entries
settlement_date has 61 unique entries


We can provide more information in that loop 

In [21]:
# Number of leading rows to print for each 'object' column.
first_n_entries = 5
# For every column whose dtype is still 'object', print its distinct-value
# count followed by its first few rows.
# Series.items() is used instead of iteritems(), which is deprecated and was
# removed in pandas 2.0.
for col, col_type in loans_2007.dtypes.items():
    if col_type == 'object':
        print('\n',col,'has',loans_2007[col].nunique(),
                             'unique entries; first ',first_n_entries,'are')
        print(loans_2007[col][0:first_n_entries])


 id has 3 unique entries; first  5 are
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: id, dtype: object

 term has 2 unique entries; first  5 are
0     36 months
1     60 months
2     36 months
3     36 months
4     60 months
Name: term, dtype: object

 int_rate has 394 unique entries; first  5 are
0     10.65%
1     15.27%
2     15.96%
3     13.49%
4     12.69%
Name: int_rate, dtype: object

 sub_grade has 35 unique entries; first  5 are
0    B2
1    C4
2    C5
3    C1
4    B5
Name: sub_grade, dtype: object

 emp_title has 30658 unique entries; first  5 are
0                         NaN
1                       Ryder
2                         NaN
3         AIR RESOURCES BOARD
4    University Medical Group
Name: emp_title, dtype: object

 emp_length has 11 unique entries; first  5 are
0    10+ years
1     < 1 year
2    10+ years
3    10+ years
4       1 year
Name: emp_length, dtype: object

 verification_status has 3 unique entries; first  5 are
0           Verified
1    Source Ver