# **Customer Demographic Cleaning**

In [1]:
# Import libraries
import pandas as pd
import datetime as DT

In [None]:
# Load the 'CustomerDemographic' sheet of the raw workbook into a data frame.
# header=1 takes the column names from the second row of the sheet.
cust_demo = pd.read_excel(
    'KPMG_raw_data.xlsx',
    sheet_name='CustomerDemographic',
    header=1,
)

In [4]:
# Display the raw frame as loaded (pandas abbreviates long frames when rendering)
cust_demo

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3996,Rosalia,Halgarth,Female,8,1975-08-09,VP Product Management,Health,Mass Customer,N,-100,No,19.0
3996,3997,Blanch,Nisuis,Female,87,2001-07-13,Statistician II,Manufacturing,High Net Worth,N,â¦testâ§,Yes,1.0
3997,3998,Sarene,Woolley,U,60,NaT,Assistant Manager,IT,High Net Worth,N,,No,
3998,3999,Patrizius,,Male,11,1973-10-24,,Manufacturing,Affluent Customer,N,Â¡â¢Â£Â¢âÂ§Â¶â¢ÂªÂºââ,Yes,10.0


In [5]:
# Summarise the frame: index range, column names, non-null counts and dtypes
cust_demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          4000 non-null   int64         
 1   first_name                           4000 non-null   object        
 2   last_name                            3875 non-null   object        
 3   gender                               4000 non-null   object        
 4   past_3_years_bike_related_purchases  4000 non-null   int64         
 5   DOB                                  3913 non-null   datetime64[ns]
 6   job_title                            3494 non-null   object        
 7   job_industry_category                3344 non-null   object        
 8   wealth_segment                       4000 non-null   object        
 9   deceased_indicator                   4000 non-null   object        
 10  default     

In [6]:
# Count the rows that are exact duplicates of an earlier row (every column equal)
cust_demo.duplicated().sum()

0

In [7]:
# Count the missing values (NaN / NaT) in each column
cust_demo.isnull().sum()

customer_id                              0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     87
job_title                              506
job_industry_category                  656
wealth_segment                           0
deceased_indicator                       0
default                                302
owns_car                                 0
tenure                                  87
dtype: int64

There are missing values in the columns 'last_name', 'DOB', 'job_title',
'job_industry_category', 'default' and 'tenure'.

The default column contains invalid data, so it can be dropped from the data frame.


In [8]:
# Tally how often each distinct value occurs in the default column
cust_demo['default'].value_counts()

100                                       113
1                                         112
-1                                        111
-100                                       99
Ù¡Ù¢Ù£                                     53
                                         ... 
testâ testâ«                               31
/dev/null; touch /tmp/blns.fail ; echo     30
âªâªtestâª                                 29
ì¸ëë°í ë¥´                                 27
,ãã»:*:ã»ãâ( â» Ï â» )ãã»:*:ã»ãâ           25
Name: default, Length: 90, dtype: int64

In [9]:
# Drop the default column
cust_demo = cust_demo.drop(columns = ['default'])

# Drop missing values in DOB and job_industry_category columns
cust_demo = cust_demo.dropna(subset = ['DOB', 'job_industry_category'])

# Replace missing values in the last_name column, no need to drop because first_name enough to identify
cust_demo['last_name'] = cust_demo['last_name'].fillna('-')

# Replace missing values in the job_title column
cust_demo['job_title'] = cust_demo['job_title'].fillna('Other')

In [10]:
# Re-check the per-column missing-value counts after the cleaning step
cust_demo.isnull().sum()

customer_id                            0
first_name                             0
last_name                              0
gender                                 0
past_3_years_bike_related_purchases    0
DOB                                    0
job_title                              0
job_industry_category                  0
wealth_segment                         0
deceased_indicator                     0
owns_car                               0
tenure                                 0
dtype: int64

In [11]:
# Count the distinct values in each column
cust_demo.nunique()

customer_id                            3257
first_name                             2632
last_name                              3063
gender                                    5
past_3_years_bike_related_purchases     100
DOB                                    2927
job_title                               196
job_industry_category                     9
wealth_segment                            3
deceased_indicator                        2
owns_car                                  2
tenure                                   22
dtype: int64

In [12]:
# Tally how often each distinct value occurs in the gender column
cust_demo['gender'].value_counts()

Female    1688
Male      1566
F            1
U            1
Femal        1
Name: gender, dtype: int64

The gender column has 5 distinct values: 'Female', 'Male', 'F', 'Femal' and 'U' (which means 'Unidentified'). These need to be mapped onto a consistent set of labels.

In [13]:
# Replace values 'F': Female'.'M : Male' 'Femal : Female''U : Unidentified'
cust_demo['gender'] = cust_demo['gender'].replace('F','Female').replace('M','Male').replace('Femal','Female').replace('U','Unidentified')
cust_demo['gender'].value_counts()

Female          1690
Male            1566
Unidentified       1
Name: gender, dtype: int64

In [14]:
 # Inspect the DOB column before deriving an age from it
 cust_demo['DOB']

0      1953-10-12
1      1980-12-16
2      1954-01-20
3      1961-10-03
5      1966-09-16
          ...    
3993   1989-04-07
3994   1975-12-12
3995   1975-08-09
3996   2001-07-13
3998   1973-10-24
Name: DOB, Length: 3257, dtype: datetime64[ns]

Make the data easier to interpret by deriving an Age column from the DOB column.

In [15]:
# Insert the Age column after the DOB column
cust_demo.insert(loc = cust_demo.columns.get_loc('DOB')+1, column='Age', value = '')
today = pd.Timestamp.today()
cust_demo['Age'] = (today - cust_demo['DOB']).astype('timedelta64[Y]')
cust_demo['Age'].sort_values(ascending = False)

33      179.0
719      91.0
1091     87.0
3409     82.0
2412     79.0
        ...  
3615     21.0
3111     21.0
18       21.0
3172     21.0
65       20.0
Name: Age, Length: 3257, dtype: float64

There is an inaccurate value in the Age column (179 years old), so it needs to be filtered out.

In [16]:
# Fillter out inaccurate value in the Age column
cust_demo = cust_demo[cust_demo['Age'] < 100]
cust_demo['Age'].sort_values(ascending = False)

719     91.0
1091    87.0
3409    82.0
657     79.0
2412    79.0
        ... 
2295    21.0
1354    21.0
3172    21.0
3748    21.0
65      20.0
Name: Age, Length: 3256, dtype: float64

Group ages in ranges in order to use them in data analysis.

In [17]:
# Group age in 10 years range
Age_group = pd.cut(cust_demo['Age'], bins=[1, 20, 30, 40, 50, 60, 70, 80, 100],
                    labels=['1-20', '20-30', '30-40', '40-50', '50-60','60-70','70-80','80+'])

In [18]:
# Insert the Age_group column after the Age column
# Position of the new column: immediately to the right of Age
age_group_pos = cust_demo.columns.get_loc('Age') + 1
cust_demo.insert(loc=age_group_pos, column='Age_group', value=Age_group)

In [19]:
# Spot-check Age against Age_group; .loc slices by index label, so :5 includes label 5
cust_demo.loc[:5,['Age','Age_group']]

Unnamed: 0,Age,Age_group
0,69.0,60-70
1,42.0,40-50
2,69.0,60-70
3,61.0,60-70
5,56.0,50-60


In [20]:
# Tally how often each distinct value occurs in the job_industry_category column
cust_demo['job_industry_category'].value_counts()

Manufacturing         796
Financial Services    767
Health                596
Retail                358
Property              267
IT                    151
Entertainment         136
Argiculture           113
Telecommunications     72
Name: job_industry_category, dtype: int64

In [21]:
# Tally how often each distinct value occurs in the wealth_segment column
cust_demo['wealth_segment'].value_counts()

Mass Customer        1636
High Net Worth        826
Affluent Customer     794
Name: wealth_segment, dtype: int64

In [22]:
# Tally how often each distinct value occurs in the deceased_indicator column
cust_demo['deceased_indicator'].value_counts()

N    3254
Y       2
Name: deceased_indicator, dtype: int64

In [23]:
# Fillter out 'Y' value in the deceased_indicator column
cust_demo = cust_demo[cust_demo['deceased_indicator'] == 'N']
cust_demo['deceased_indicator'].value_counts()

N    3254
Name: deceased_indicator, dtype: int64

In [24]:
# Tally how often each distinct value occurs in the owns_car column
cust_demo['owns_car'].value_counts()

Yes    1657
No     1597
Name: owns_car, dtype: int64

In [25]:
# Display the cleaned frame (pandas abbreviates long frames when rendering)
cust_demo

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,Age,Age_group,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure
0,1,Laraine,Medendorp,Female,93,1953-10-12,69.0,60-70,Executive Secretary,Health,Mass Customer,N,Yes,11.0
1,2,Eli,Bockman,Male,81,1980-12-16,42.0,40-50,Administrative Officer,Financial Services,Mass Customer,N,Yes,16.0
2,3,Arlin,Dearle,Male,61,1954-01-20,69.0,60-70,Recruiting Manager,Property,Mass Customer,N,Yes,15.0
3,4,Talbot,-,Male,33,1961-10-03,61.0,60-70,Other,IT,Mass Customer,N,No,7.0
5,6,Curr,Duckhouse,Male,35,1966-09-16,56.0,50-60,Other,Retail,High Net Worth,N,Yes,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3993,3994,Stephie,Byars,Female,5,1989-04-07,33.0,30-40,Structural Analysis Engineer,Manufacturing,Affluent Customer,N,No,12.0
3994,3995,Rusty,Iapico,Male,93,1975-12-12,47.0,40-50,Staff Scientist,Manufacturing,Mass Customer,N,Yes,14.0
3995,3996,Rosalia,Halgarth,Female,8,1975-08-09,47.0,40-50,VP Product Management,Health,Mass Customer,N,No,19.0
3996,3997,Blanch,Nisuis,Female,87,2001-07-13,21.0,20-30,Statistician II,Manufacturing,High Net Worth,N,Yes,1.0


In [26]:
# Summarise the cleaned frame: column names, non-null counts and dtypes
cust_demo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3254 entries, 0 to 3998
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          3254 non-null   int64         
 1   first_name                           3254 non-null   object        
 2   last_name                            3254 non-null   object        
 3   gender                               3254 non-null   object        
 4   past_3_years_bike_related_purchases  3254 non-null   int64         
 5   DOB                                  3254 non-null   datetime64[ns]
 6   Age                                  3254 non-null   float64       
 7   Age_group                            3254 non-null   category      
 8   job_title                            3254 non-null   object        
 9   job_industry_category                3254 non-null   object        
 10  wealth_segme

In [27]:
# Final check of the per-column missing-value counts before exporting
cust_demo.isnull().sum()

customer_id                            0
first_name                             0
last_name                              0
gender                                 0
past_3_years_bike_related_purchases    0
DOB                                    0
Age                                    0
Age_group                              0
job_title                              0
job_industry_category                  0
wealth_segment                         0
deceased_indicator                     0
owns_car                               0
tenure                                 0
dtype: int64

In [28]:
# Export the cleaned data frame to a CSV file; index=False leaves the row index out of the file
cust_demo.to_csv('Cust_Demo_Cleaned.csv',index = False)

# **Summary Customer Demographic table:**

**Accuracy:** 
1.   DOB: inaccurate
2.   Age: missing

**Completeness**

1.   DOB: blanks
2.   Job title: blanks
3.   Job industry category: blanks
4.   Last name: blanks
5.   Tenure: blanks

**Consistency**
1.   Gender: inconsistency

**Relevancy**

1.   Default: invalid data








