In [43]:
import pandas as pd
import numpy as np
import random

In [44]:
# generate 10 unique random claim numbers as integers in the range 1-999
# (they are zero-padded to 7-character strings in a later cell)

claim_numbers = random.sample(range(1, 1000), 10)

In [45]:
# at this point the claim numbers are still a list of plain integers
print(claim_numbers)

[279, 81, 731, 759, 362, 755, 991, 175, 478, 291]


In [46]:
# zero-pad every claim number to a fixed width of 7 characters;
# str.zfill never truncates, unlike slicing a padded string with [-7:]
claim_numbers = [str(number).zfill(7) for number in claim_numbers]

In [47]:
# inspect the result: every claim number is now a 7-character zero-padded string
claim_numbers

['0000279',
 '0000081',
 '0000731',
 '0000759',
 '0000362',
 '0000755',
 '0000991',
 '0000175',
 '0000478',
 '0000291']

In [54]:
# Python strings are immutable, so an item assignment such as
#     claim_numbers[i][0] = '3'
# does not work. Build a new string for each entry instead: a literal '3'
# followed by everything after the first character. The slice assignment
# updates the existing list object in place.
claim_numbers[:] = ['3' + padded[1:] for padded in claim_numbers]

In [56]:
# the first character of every claim number is now '3';
# the remaining six characters are unchanged
claim_numbers

['3000279',
 '3000081',
 '3000731',
 '3000759',
 '3000362',
 '3000755',
 '3000991',
 '3000175',
 '3000478',
 '3000291']

In [61]:
# Alternatively, when the target format is known up front, sample directly
# from the 7-digit numbers that start with 3 and convert to strings in one step.
# range() excludes its stop value, so the stop is 4000000 to keep 3999999
# eligible.
claim_numbers_2 = [str(number)
                   for number in random.sample(range(3000000, 4000000), 10)]
claim_numbers_2

['3937749',
 '3049894',
 '3482021',
 '3601649',
 '3725435',
 '3052299',
 '3995130',
 '3345617',
 '3425295',
 '3067101']

In [77]:
# build a small sample frame: the claim numbers form the index and the single
# 'status' column holds a random 0 or 1 per claim
# (randint's upper bound of 2 is exclusive)
df = pd.DataFrame({'status': np.random.randint(0, 2, size=10)},
                  index=claim_numbers_2)

In [78]:
# review our sample: one row per claim number (index) with its 0/1 status
df

Unnamed: 0,status
3937749,1
3049894,1
3482021,0
3601649,1
3725435,0
3052299,0
3995130,0
3345617,1
3425295,0
3067101,1


In [75]:
# https://medium.com/@evelynli_30748/map-apply-applymap-with-the-lambda-function-5e83028be759
# Series.map returns a new Series with the codes translated to labels;
# the result is only displayed here, so df itself is not modified
df['status'].map({0: 'REJECT', 1: 'ACCEPT'})

3937749    REJECT
3049894    ACCEPT
3482021    ACCEPT
3601649    ACCEPT
3725435    REJECT
3052299    REJECT
3995130    REJECT
3345617    REJECT
3425295    REJECT
3067101    ACCEPT
Name: status, dtype: object

In [81]:
# https://stackoverflow.com/a/12152759
# Translate the numeric status codes into labels and assign the result back.
# Calling replace(..., inplace=True) on df['status'] is chained assignment: it
# acts on an intermediate Series, raises a FutureWarning in pandas 2.x and
# leaves df unchanged once Copy-on-Write is enabled.
df['status'] = df['status'].replace({0: 'REJECT', 1: 'ACCEPT'})

In [82]:
# this worked really nicely: the status column now shows the
# ACCEPT / REJECT labels instead of 1 / 0
df

Unnamed: 0,status
3937749,ACCEPT
3049894,ACCEPT
3482021,REJECT
3601649,ACCEPT
3725435,REJECT
3052299,REJECT
3995130,REJECT
3345617,ACCEPT
3425295,REJECT
3067101,ACCEPT


In [84]:
# draw 1,000,000 unique 10-digit claim numbers and keep them as strings
claim_numbers_n = list(map(str, random.sample(range(1000000000, 4000000000), 1000000)))

In [85]:
# rebuild df at scale: one random 0/1 status per claim number (1M rows),
# indexed by the claim-number strings
df = pd.DataFrame({'status': np.random.randint(0, 2, size=1000000)},
                  index=claim_numbers_n)

In [88]:
# confirm the size: 1M rows (one per claim number) x 1 column (status)
df.shape

(1000000, 1)

In [89]:
# this is a great way to provide a final table
# after analysis is fully completed at the end.
%timeit df['status'].replace({0: 'REJECT', 1: 'ACCEPT'}, inplace=True)

40.3 ms ± 631 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [90]:
# preview the first rows to confirm the status labels were applied
df.head()

Unnamed: 0,status
3261376278,REJECT
1632058789,ACCEPT
1439631111,REJECT
2401611405,REJECT
1697848697,ACCEPT


In [95]:
# P.313: fill values that vary by group
states = [
    'Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho',
]

# one region label per state, in the same order as `states`:
# four 'East' entries followed by four 'West' entries
group_key = [region for region in ('East', 'West') for _ in range(4)]

print(group_key)

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']


In [94]:
# one standard-normal random value per state, labelled by state name
data = pd.Series(np.random.randn(len(states)), index=states)
data

Ohio         -0.700636
New York      1.446773
Vermont      -1.735312
Florida      -1.284564
Oregon        1.178967
Nevada        0.762814
California    0.940077
Idaho         1.188863
dtype: float64

In [96]:
# blank out three of the states to simulate missing observations
data.loc[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -0.700636
New York      1.446773
Vermont            NaN
Florida      -1.284564
Oregon        1.178967
Nevada             NaN
California    0.940077
Idaho              NaN
dtype: float64

In [97]:
# group_key lines up with data position by position, so each value is
# "slotted" into its region; mean() then averages per region, skipping NaN
data.groupby(group_key).mean()

East   -0.179476
West    1.059522
dtype: float64

In [98]:
# a lambda that takes one group (a Series) and returns a copy in which the
# missing values are replaced by that group's own mean
fill_mean = lambda group: group.fillna(group.mean())
type(fill_mean)

function

In [99]:
# fill each region's missing values with that region's mean.
# group_keys=False keeps the original state index: from pandas 2.0 the default
# (group_keys=True) would prepend the East/West label as an extra index level.
data.groupby(group_key, group_keys=False).apply(fill_mean)

Ohio         -0.700636
New York      1.446773
Vermont      -0.179476
Florida      -1.284564
Oregon        1.178967
Nevada        1.059522
California    0.940077
Idaho         1.059522
dtype: float64

In [106]:
# alternatively, fill the missing values with a predefined constant per group
fill_values = {'East': 0.5, 'West': -1}
print(type(fill_values))

# inside groupby.apply each sub-Series carries its group label in .name,
# which is used here to look up that group's fill value
fill_func = lambda g: g.fillna(fill_values[g.name])

# group_keys=False keeps the original state index: from pandas 2.0 the default
# (group_keys=True) would prepend the East/West label as an extra index level
data.groupby(group_key, group_keys=False).apply(fill_func)

<class 'dict'>


Ohio         -0.700636
New York      1.446773
Vermont       0.500000
Florida      -1.284564
Oregon        1.178967
Nevada       -1.000000
California    0.940077
Idaho        -1.000000
dtype: float64

In [194]:
# Healthcare Example

# Please create a sample of member numbers with a line of business (LOB) of
# MMM or BBB. The member numbers are drawn from 0-4999 and left-padded with
# zeros to 7 characters in total. The MMM or BBB will be assigned at random,
# then you will also generate a random age of 65-100. You will make some of
# them np.nan. Finally you will fill them with means by group of MMM or BBB LOB.

member_numbers = random.sample(range(0, 5000), 10)

# create a lambda function here using what we learned yesterday.
# str.zfill pads to exactly 7 characters for every input; prefixing a fixed
# '00000' and slicing [-7:] yields only 6 characters for single-digit numbers.
func = lambda n: str(n).zfill(7)

# advanced use of list comprehension: format every sampled number
member_numbers = [func(number) for number in member_numbers]

# put them into a pandas dataframe
data = pd.DataFrame(member_numbers,
                    columns=['member_number'])

# assign a random code of '0' or '1' per member (stored as strings);
# the codes are relabelled to BBB/MMM in a later cell
data['line_of_business'] = np.random.randint(0, 2, data.shape[0]).astype(str)

In [195]:
# both columns hold Python strings, so pandas reports them as object dtype
print(data.dtypes)
data

member_number       object
line_of_business    object
dtype: object


Unnamed: 0,member_number,line_of_business
0,1829,0
1,2517,0
2,206,1
3,1031,0
4,1703,1
5,2601,0
6,1336,0
7,4935,0
8,3181,0
9,3855,1


In [196]:
# The codes were cast to str when the column was created, so the mapping keys
# must be the strings '0' and '1' (not the integers 0 and 1).

# https://stackoverflow.com/a/52065957

# Assign the result back instead of using replace(..., inplace=True) on
# data['line_of_business']: that form is chained assignment, raises a
# FutureWarning in pandas 2.x and leaves `data` unchanged once Copy-on-Write
# is enabled.
data['line_of_business'] = data['line_of_business'].replace({'0': 'BBB', '1': 'MMM'})

In [197]:
# the line_of_business codes now read BBB / MMM
data

Unnamed: 0,member_number,line_of_business
0,1829,BBB
1,2517,BBB
2,206,MMM
3,1031,BBB
4,1703,MMM
5,2601,BBB
6,1336,BBB
7,4935,BBB
8,3181,BBB
9,3855,MMM


In [198]:
# set some values to be np.nan, just like the book
# https://stackoverflow.com/a/17071908
#
# general pattern for selecting rows whose column value is in a list:
#   df.loc[df['column_name'].isin(some_values)]

# Pick the target members from the data itself: the member numbers are drawn
# at random, so hard-coded IDs only match the single run they were copied from.
# Positions 3 and 8 are the rows that were blanked in the original run.
target_members = data['member_number'].iloc[[3, 8]].tolist()

# access by .loc[row_index or boolean_array, 'column name']
data.loc[data['member_number'].isin(target_members), 'line_of_business'] = np.nan

In [199]:
# the two target members now have a missing (NaN) line_of_business
data

Unnamed: 0,member_number,line_of_business
0,1829,BBB
1,2517,BBB
2,206,MMM
3,1031,
4,1703,MMM
5,2601,BBB
6,1336,BBB
7,4935,BBB
8,3181,
9,3855,MMM


In [205]:
# df['column_name'].isin([list_of_values])
# reuse target_members (defined when the values were set to NaN) instead of
# repeating the literal IDs, so the check always inspects the same rows
data.loc[data['member_number'].isin(target_members), 'line_of_business']

3    NaN
8    NaN
Name: line_of_business, dtype: object