#### Lets clean the sheet 2 which is NewCustomerList
importing the required packages.

In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): with no category or message given, this silences every warning
# for the rest of the session, including pandas deprecation / FutureWarning
# messages. Consider narrowing the filter to the specific warning intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the worksheet that the heading identifies as NewCustomerList.
# sheet_name=2 is a zero-based position (the third worksheet in the workbook),
# and header=1 takes the column names from the second spreadsheet row.
df2 = pd.read_excel(
    io='KPMG_VI_New_raw_data_update_final.xlsx',
    sheet_name=2,
    header=1,
)

In [3]:
# Preview the first five imported rows to sanity-check the load.
df2.head(n=5)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,...,QLD,Australia,6,0.56,0.7,0.875,0.74375,1,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,...,NSW,Australia,11,0.89,0.89,1.1125,0.945625,1,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,...,VIC,Australia,5,1.01,1.01,1.01,1.01,1,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,...,QLD,Australia,1,0.87,1.0875,1.0875,1.0875,4,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,...,NSW,Australia,9,0.52,0.52,0.65,0.65,4,4,1.703125


We can see there are some **Unnamed columns** with data points, but we don't know what these data points are about (it's always good to inform or report to clients about these random/missing data before dropping them), so lets drop these columns first.

In [4]:
# List the column labels; this shows the exact names of the 'Unnamed: N' columns.
df2.columns

Index(['first_name', 'last_name', 'gender',
       'past_3_years_bike_related_purchases', 'DOB', 'job_title',
       'job_industry_category', 'wealth_segment', 'deceased_indicator',
       'owns_car', 'tenure', 'address', 'postcode', 'state', 'country',
       'property_valuation', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19', 'Unnamed: 20', 'Rank', 'Value'],
      dtype='object')

In [5]:
# Discard the five header-less columns ('Unnamed: 16' through 'Unnamed: 20'),
# whose values have no documented meaning.
df2 = df2.drop(columns=[f'Unnamed: {idx}' for idx in range(16, 21)])

In [7]:
# Confirm the unnamed columns are gone by previewing the first five rows again.
df2.head(n=5)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703125


In [8]:
# Summarise each column's dtype and non-null count to spot incomplete columns.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   first_name                           1000 non-null   object        
 1   last_name                            971 non-null    object        
 2   gender                               1000 non-null   object        
 3   past_3_years_bike_related_purchases  1000 non-null   int64         
 4   DOB                                  983 non-null    datetime64[ns]
 5   job_title                            894 non-null    object        
 6   job_industry_category                835 non-null    object        
 7   wealth_segment                       1000 non-null   object        
 8   deceased_indicator                   1000 non-null   object        
 9   owns_car                             1000 non-null   object        
 10  tenure       

In [9]:
# Count the missing values in every column.
df2.isna().sum()

first_name                               0
last_name                               29
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     17
job_title                              106
job_industry_category                  165
wealth_segment                           0
deceased_indicator                       0
owns_car                                 0
tenure                                   0
address                                  0
postcode                                 0
state                                    0
country                                  0
property_valuation                       0
Rank                                     0
Value                                    0
dtype: int64

Here we only have **1000** records so removing too many rows can significantly reduce the amount of data available for analysis.
we can use the **`mode() imputation`** method which can be used for categorical columns. 

In [17]:
# Mode imputation for DOB: fill the missing birth dates with the most frequent date.
# NOTE(review): every imputed customer receives the same birth date, so any age
# derived from DOB is synthetic for those rows.
mode_DOB = df2['DOB'].mode().iloc[0]
# Assign the filled column back rather than calling fillna(..., inplace=True) on
# the selected column: that form acts on an intermediate object and no longer
# updates df2 once pandas Copy-on-Write is active.
df2['DOB'] = df2['DOB'].fillna(mode_DOB)

In [19]:
# Verify that no missing DOB values remain.
df2['DOB'].isna().sum()

0

In [46]:
# Frequency of each job title.
# NOTE(review): this cell's execution count (46) is later than the imputation
# cell's (29), so the saved output appears to already include the mode-filled
# rows. Run the notebook top to bottom to see the raw distribution here.
df2['job_title'].value_counts()

Associate Professor         121
Environmental Tech           14
Software Consultant          14
Chief Design Engineer        13
Assistant Manager            12
                           ... 
Accountant II                 1
Programmer IV                 1
Administrative Officer        1
Accounting Assistant III      1
Web Developer I               1
Name: job_title, Length: 184, dtype: int64

In [27]:
# Most frequent job title, used as the fill value for the missing entries.
mode_job_titles = df2['job_title'].mode().iloc[0]

In [29]:
# Fill the missing job titles with the modal title held in mode_job_titles.
# The result is assigned back instead of using fillna(..., inplace=True) on the
# selected column, which stops updating df2 under pandas Copy-on-Write.
df2['job_title'] = df2['job_title'].fillna(mode_job_titles)

In [30]:
# Verify that no missing job titles remain.
df2['job_title'].isna().sum()

0

In [47]:
# Frequency of each industry category.
# NOTE(review): this cell's execution count (47) is later than the imputation
# cell's (41), so the saved output appears to already include the mode-filled
# rows. Run the notebook top to bottom to see the raw distribution here.
df2['job_industry_category'].value_counts()

Financial Services    368
Manufacturing         199
Health                152
Retail                 78
Property               64
IT                     51
Entertainment          37
Argiculture            26
Telecommunications     25
Name: job_industry_category, dtype: int64

In [39]:
# Most frequent industry category, used as the fill value for the missing entries.
mode_indus_ch = df2['job_industry_category'].mode().iloc[0]

In [41]:
# Fill the missing industry categories with the modal value held in mode_indus_ch.
# The result is assigned back instead of using fillna(..., inplace=True) on the
# selected column, which stops updating df2 under pandas Copy-on-Write.
df2['job_industry_category'] = df2['job_industry_category'].fillna(mode_indus_ch)

In [50]:
# Verify that no missing industry categories remain.
df2['job_industry_category'].isna().sum()

0

In [52]:
# Review the dtype of every column before recoding the categorical flags.
df2.dtypes

first_name                                     object
last_name                                      object
gender                                         object
past_3_years_bike_related_purchases             int64
DOB                                    datetime64[ns]
job_title                                      object
job_industry_category                          object
wealth_segment                                 object
deceased_indicator                             object
owns_car                                       object
tenure                                          int64
address                                        object
postcode                                        int64
state                                          object
country                                        object
property_valuation                              int64
Rank                                            int64
Value                                         float64
dtype: object

In [53]:
# Inspect the gender labels first; the 'U' code is relabelled as 'Unidentified'
# in the next cell.
df2['gender'].value_counts()

Female    513
Male      470
U          17
Name: gender, dtype: int64

In [58]:
# Relabel the 'U' gender code as 'Unidentified'.
# The result is assigned back instead of using replace(..., inplace=True) on the
# selected column, which stops updating df2 under pandas Copy-on-Write.
df2['gender'] = df2['gender'].replace('U', 'Unidentified')

In [59]:
# Verify that 'U' has been relabelled as 'Unidentified'.
df2['gender'].value_counts()

Female          513
Male            470
Unidentified     17
Name: gender, dtype: int64

In [64]:
# Recode 'deceased_indicator' from the 'N' / 'Y' flags to boolean values.
# The result is assigned back instead of using replace(..., inplace=True) on the
# selected column, which stops updating df2 under pandas Copy-on-Write.
df2['deceased_indicator'] = df2['deceased_indicator'].replace({'N': False, 'Y': True})

In [65]:
# Verify the recoding: only boolean values should be listed.
df2['deceased_indicator'].value_counts()

False    1000
Name: deceased_indicator, dtype: int64

In [93]:
# Preview the last five rows of the cleaned frame.
df2.tail(n=5)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value
995,Ferdinand,Romanetti,Male,60,1959-10-07,Paralegal,Financial Services,Affluent Customer,False,No,9,2 Sloan Way,2200,NSW,Australia,7,996,0.374
996,Burk,Wortley,Male,22,2001-10-17,Senior Sales Associate,Health,Mass Customer,False,No,6,04 Union Crossing,2196,NSW,Australia,10,997,0.357
997,Melloney,Temby,Female,17,1954-10-05,Budget/Accounting Analyst IV,Financial Services,Affluent Customer,False,Yes,15,33475 Fair Oaks Junction,4702,QLD,Australia,2,997,0.357
998,Dickie,Cubbini,Male,30,1952-12-17,Financial Advisor,Financial Services,Mass Customer,False,Yes,19,57666 Victoria Way,4215,QLD,Australia,2,997,0.357
999,Sylas,Duffill,Male,56,1955-10-02,Staff Accountant IV,Property,Mass Customer,False,Yes,14,21875 Grover Drive,2010,NSW,Australia,9,1000,0.34


In [94]:
# Persist the cleaned customer list as CSV; index=False leaves out the row index.
# NOTE(review): no cell shown here fills 'last_name', so its missing values are
# presumably written as empty fields. Confirm this is intended.
df2.to_csv(path_or_buf='NewCustomerList.csv', index=False)