In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date

import warnings
warnings.filterwarnings('ignore')

## 1. PREPARE DATA

### 1.1. Read CustomerDemographic Table

In [2]:
df_dem = pd.read_excel('./data/KPMG_VI_New_raw_data_update_final.xlsx', sheet_name=3, skiprows=[0], usecols='A:M', engine='openpyxl')
df_dem.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0


In [3]:
df_dem.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          4000 non-null   int64         
 1   first_name                           4000 non-null   object        
 2   last_name                            3875 non-null   object        
 3   gender                               4000 non-null   object        
 4   past_3_years_bike_related_purchases  4000 non-null   int64         
 5   DOB                                  3913 non-null   datetime64[ns]
 6   job_title                            3494 non-null   object        
 7   job_industry_category                3344 non-null   object        
 8   wealth_segment                       4000 non-null   object        
 9   deceased_indicator                   4000 non-null   object        
 10  default     

In [4]:
print('The CustomerDemographic Table has', df_dem.shape[0], 'rows and', df_dem.shape[1], 'columns.')

The CustomerDemographic Table has 4000 rows and 13 columns.


### 1.2. Read CustomerAddress Table

In [5]:
df_add = pd.read_excel('./data/KPMG_VI_New_raw_data_update_final.xlsx', sheet_name=4, skiprows=[0], usecols='A:F', engine='openpyxl')
df_add.head()

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
0,1,060 Morning Avenue,2016,New South Wales,Australia,10
1,2,6 Meadow Vale Court,2153,New South Wales,Australia,10
2,4,0 Holy Cross Court,4211,QLD,Australia,9
3,5,17979 Del Mar Point,2448,New South Wales,Australia,4
4,6,9 Oakridge Court,3216,VIC,Australia,9


In [6]:
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         3999 non-null   int64 
 1   address             3999 non-null   object
 2   postcode            3999 non-null   int64 
 3   state               3999 non-null   object
 4   country             3999 non-null   object
 5   property_valuation  3999 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 187.6+ KB


In [7]:
print('The CustomerAddress Table has', df_add.shape[0], 'rows and', df_add.shape[1], 'columns.')

The CustomerAddress Table has 3999 rows and 6 columns.


### 1.3. Read Transactions Table

In [8]:
df_trans = pd.read_excel('./data/KPMG_VI_New_raw_data_update_final.xlsx', sheet_name=1, skiprows=[0], usecols='A:M', engine='openpyxl')
df_trans.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0


In [9]:
df_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  float64       
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

In [10]:
print('The Transactions Table has', df_trans.shape[0], 'rows and', df_trans.shape[1], 'columns.')

The Transactions Table has 20000 rows and 13 columns.


### 1.4. Read NewCustomerList Table

In [11]:
df_newlist = pd.read_excel('./data/KPMG_VI_New_raw_data_update_final.xlsx', sheet_name=2, skiprows=[0], usecols='A:W', engine='openpyxl')
df_newlist.head()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,...,QLD,Australia,6,0.49,0.6125,0.765625,0.650781,1,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,...,NSW,Australia,11,0.53,0.53,0.6625,0.563125,1,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,...,VIC,Australia,5,0.49,0.49,0.49,0.49,1,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,...,QLD,Australia,1,0.69,0.8625,0.8625,0.8625,4,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,...,NSW,Australia,9,0.46,0.46,0.575,0.575,4,4,1.703125


In [12]:
df_newlist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   first_name                           1000 non-null   object        
 1   last_name                            971 non-null    object        
 2   gender                               1000 non-null   object        
 3   past_3_years_bike_related_purchases  1000 non-null   int64         
 4   DOB                                  983 non-null    datetime64[ns]
 5   job_title                            894 non-null    object        
 6   job_industry_category                835 non-null    object        
 7   wealth_segment                       1000 non-null   object        
 8   deceased_indicator                   1000 non-null   object        
 9   owns_car                             1000 non-null   object        
 10  tenure       

In [13]:
print('The NewCustomerList Table has', df_newlist.shape[0], 'rows and', df_newlist.shape[1], 'columns.')

The NewCustomerList Table has 1000 rows and 23 columns.


## 2. PROCESS DATA

### 2.1. Check data in 3 tables

**CustomerDemographic**

In [14]:
# Check the data for duplicates
print('Shape before dropping duplicates', df_dem.shape)
df_dem = df_dem.drop_duplicates()
print('Shape after dropping duplicates', df_dem.shape)

Shape before dropping duplicates (4000, 13)
Shape after dropping duplicates (4000, 13)


- CustomerDemographic Table has no duplicates

In [15]:
# Check the data for missing values
print(np.sum(df_dem.isnull()), '\n')

customer_id                              0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     87
job_title                              506
job_industry_category                  656
wealth_segment                           0
deceased_indicator                       0
default                                302
owns_car                                 0
tenure                                  87
dtype: int64 



 - There are lots of missing data records in CustomerDemographic Table
 - The column named 'default' needs to be removed since it has no meaning

In [16]:
df_dem = df_dem.drop(['default'], axis=1)
df_dem.head(3)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,Yes,11.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,Yes,16.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,Yes,15.0


**CustomerAddress**

In [17]:
# Check the data for duplicates
print('Shape before dropping duplicates', df_add.shape)
df_add = df_add.drop_duplicates()
print('Shape after dropping duplicates', df_add.shape)

Shape before dropping duplicates (3999, 6)
Shape after dropping duplicates (3999, 6)


- CustomerAddress Table has no duplicates

In [18]:
# Check the data for missing values
print(np.sum(df_add.isnull()), '\n')

customer_id           0
address               0
postcode              0
state                 0
country               0
property_valuation    0
dtype: int64 



 - There are no missing data records in CustomerAddress Table

**Transactions**

In [19]:
# Check the data for duplicates
print('Shape before dropping duplicates', df_trans.shape)
df_trans = df_trans.drop_duplicates()
print('Shape after dropping duplicates', df_trans.shape)

Shape before dropping duplicates (20000, 13)
Shape after dropping duplicates (20000, 13)


- Transactions Table has no duplicates

In [20]:
# Check the data for missing values
print(np.sum(df_trans.isnull()), '\n')

transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               360
order_status                 0
brand                      197
product_line               197
product_class              197
product_size               197
list_price                   0
standard_cost              197
product_first_sold_date    197
dtype: int64 



 - There are lots of missing data records in Transactions Table

### 2.2. Merge 3 tables into 1 table

In [21]:
df_dem_add = pd.merge(df_dem, df_add, how='outer', on=['customer_id'])

In [22]:
df_dem.shape

(4000, 12)

In [23]:
df_add.shape

(3999, 6)

In [24]:
df_dem_add.shape

(4003, 17)

In [25]:
df_all = pd.merge(df_trans, df_dem_add, how='outer', on=['customer_id'])

In [26]:
df_trans.shape

(20000, 13)

In [27]:
df_all.shape

(20510, 29)

In [28]:
df_all.head(3)

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,...,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation
0,1.0,2.0,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,...,Financial Services,Mass Customer,N,Yes,10.0,984 Hoepker Court,3064.0,VIC,Australia,6.0
1,11065.0,1.0,2950,2017-10-16,0.0,Approved,Giant Bicycles,Standard,medium,medium,...,Financial Services,Mass Customer,N,Yes,10.0,984 Hoepker Court,3064.0,VIC,Australia,6.0
2,18923.0,62.0,2950,2017-04-26,0.0,Approved,Solex,Standard,medium,medium,...,Financial Services,Mass Customer,N,Yes,10.0,984 Hoepker Court,3064.0,VIC,Australia,6.0


In [29]:
df_all.columns

Index(['transaction_id', 'product_id', 'customer_id', 'transaction_date',
       'online_order', 'order_status', 'brand', 'product_line',
       'product_class', 'product_size', 'list_price', 'standard_cost',
       'product_first_sold_date', 'first_name', 'last_name', 'gender',
       'past_3_years_bike_related_purchases', 'DOB', 'job_title',
       'job_industry_category', 'wealth_segment', 'deceased_indicator',
       'owns_car', 'tenure', 'address', 'postcode', 'state', 'country',
       'property_valuation'],
      dtype='object')

In [30]:
# Check the data for missing values
print(np.sum(df_all.isnull()), '\n')

transaction_id                          510
product_id                              510
customer_id                               0
transaction_date                        510
online_order                            870
order_status                            510
brand                                   707
product_line                            707
product_class                           707
product_size                            707
list_price                              510
standard_cost                           707
product_first_sold_date                 707
first_name                                6
last_name                               661
gender                                    6
past_3_years_bike_related_purchases       6
DOB                                     463
job_title                              2483
job_industry_category                  3330
wealth_segment                            6
deceased_indicator                        6
owns_car                        

- There are **510 customers without having any transactions** => Need to be removed 510 records where 'transaction_id' is empty

In [31]:
df_all = df_all.loc[~df_all['transaction_id'].isnull()]

In [32]:
df_all.tail(3)

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,...,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation
19997,18462.0,80.0,2789,2017-06-20,0.0,Approved,OHM Cycles,Touring,low,medium,...,Financial Services,Affluent Customer,N,Yes,7.0,724 West Park,2112.0,NSW,Australia,11.0
19998,17981.0,69.0,3446,2017-12-26,1.0,Approved,Giant Bicycles,Road,medium,medium,...,Manufacturing,Mass Customer,N,No,14.0,8 Becker Drive,4868.0,QLD,Australia,4.0
19999,18165.0,86.0,3446,2017-12-03,0.0,Approved,OHM Cycles,Standard,medium,medium,...,Manufacturing,Mass Customer,N,No,14.0,8 Becker Drive,4868.0,QLD,Australia,4.0


In [33]:
print('The table has', df_all.shape[0], 'rows and', df_all.shape[1], 'columns.')

The table has 20000 rows and 29 columns.


**Create 'age' column and 'profit' column**

In [None]:
df_all['DOB'].sort_values(ascending=True)

In [34]:
# Create 'age' column
now = pd.Timestamp('now')

df_all['DOB'] = pd.to_datetime(df_all['DOB'])
df_all['age'] = now.year - df_all['DOB'].dt.year - (now.dayofyear < df_all['DOB'].dt.dayofyear)

In [35]:
# Create 'profit' column
df_all['profit'] = df_all['list_price'] - df_all['standard_cost']

In [36]:
df_all = df_all.fillna(0)
df_all.head(3)

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,...,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,age,profit
0,1.0,2.0,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,...,N,Yes,10.0,984 Hoepker Court,3064.0,VIC,Australia,6.0,68.0,17.87
1,11065.0,1.0,2950,2017-10-16,0.0,Approved,Giant Bicycles,Standard,medium,medium,...,N,Yes,10.0,984 Hoepker Court,3064.0,VIC,Australia,6.0,68.0,448.68
2,18923.0,62.0,2950,2017-04-26,0.0,Approved,Solex,Standard,medium,medium,...,N,Yes,10.0,984 Hoepker Court,3064.0,VIC,Australia,6.0,68.0,179.44


In [37]:
df_all.dtypes

transaction_id                                float64
product_id                                    float64
customer_id                                     int64
transaction_date                       datetime64[ns]
online_order                                  float64
order_status                                   object
brand                                          object
product_line                                   object
product_class                                  object
product_size                                   object
list_price                                    float64
standard_cost                                 float64
product_first_sold_date                       float64
first_name                                     object
last_name                                      object
gender                                         object
past_3_years_bike_related_purchases           float64
DOB                                            object
job_title                   

### 2.4. Convert

In [38]:
# Convert 'float' type into 'int' type
df_all['transaction_id'] = df_all['transaction_id'].apply(np.int64)
df_all['product_id'] = df_all['product_id'].apply(np.int64)
df_all['property_valuation'] = df_all['property_valuation'].apply(np.int64)
df_all['past_3_years_bike_related_purchases'] = df_all['past_3_years_bike_related_purchases'].apply(np.int64)
df_all['property_valuation'] = df_all['property_valuation'].apply(np.int64)
df_all['age'] = df_all['age'].apply(np.int64)
df_all['tenure'] = df_all['tenure'].apply(np.int64)

In [42]:
df_all.drop(['transaction_id', 'product_id', 'customer_id'], axis=1).describe().applymap(lambda x: f'{x:0.2f}')

Unnamed: 0,online_order,list_price,standard_cost,product_first_sold_date,past_3_years_bike_related_purchases,tenure,postcode,property_valuation,age,profit
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,0.49,1107.83,550.57,37823.51,48.77,10.44,2982.84,7.5,44.46,546.51
std,0.5,582.83,407.67,4734.75,28.6,5.83,858.97,2.84,14.45,493.99
min,0.0,12.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,575.27,215.03,35560.0,24.0,6.0,2200.0,6.0,35.0,133.78
50%,0.0,1163.89,464.72,38206.0,48.0,10.0,2767.0,8.0,45.0,437.46
75%,1.0,1635.3,795.1,40672.0,73.0,15.0,3752.0,10.0,55.0,827.16
max,1.0,2091.47,1759.85,42710.0,99.0,22.0,4883.0,12.0,179.0,1702.55


- There are outliers at the 'age' column with min age is 0 and max age is 179 => needs to be removed since they are wrong infomation

In [44]:
df_all.sort_values(['age'])

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,...,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,age,profit
7710,1445,32,1244,2017-11-14,0.0,Approved,Giant Bicycles,Standard,medium,medium,...,N,Yes,0,3466 Truax Terrace,2127.0,NSW,Australia,8,0,431.33
7627,18276,7,2469,2017-09-09,1.0,Approved,Trek Bicycles,Road,low,medium,...,N,Yes,0,49 Talmadge Trail,2036.0,NSW,Australia,11,0,745.94
7626,17668,8,2469,2017-06-19,1.0,Approved,Solex,Road,medium,small,...,N,Yes,0,49 Talmadge Trail,2036.0,NSW,Australia,11,0,187.39
7624,12802,33,2469,2017-06-06,1.0,Approved,Giant Bicycles,Standard,medium,small,...,N,Yes,0,49 Talmadge Trail,2036.0,NSW,Australia,11,0,144.26
7623,10859,45,2469,2017-08-06,1.0,Approved,Solex,Standard,medium,medium,...,N,Yes,0,49 Talmadge Trail,2036.0,NSW,Australia,11,0,356.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5907,12083,13,34,2017-07-23,0.0,Approved,Solex,Standard,medium,medium,...,N,No,20,833 Luster Way,4005.0,QLD,Australia,8,179,574.62
5908,16935,0,34,2017-02-14,0.0,Approved,0,0,0,0,...,N,No,20,833 Luster Way,4005.0,QLD,Australia,8,179,0.00
5909,17808,96,34,2017-04-10,1.0,Approved,WeareA2B,Road,low,small,...,N,No,20,833 Luster Way,4005.0,QLD,Australia,8,179,129.01
5910,19291,65,34,2017-09-19,0.0,Approved,WeareA2B,Standard,medium,medium,...,N,No,20,833 Luster Way,4005.0,QLD,Australia,8,179,1028.76


In [43]:
df_all.drop(['transaction_id', 'product_id', 'customer_id'], axis=1).corr()

Unnamed: 0,online_order,list_price,standard_cost,product_first_sold_date,past_3_years_bike_related_purchases,tenure,postcode,property_valuation,age,profit
online_order,1.0,-0.001491,0.006535,0.006441,0.003365,0.008264,0.001224,-0.000185,-0.007693,-0.00623
list_price,-0.001491,1.0,0.544153,0.038001,0.00713,-0.008476,0.012627,-0.009905,-0.006321,0.719563
standard_cost,0.006535,0.544153,1.0,0.125946,-0.007101,-0.012669,0.0115,7.3e-05,0.00333,-0.153864
product_first_sold_date,0.006441,0.038001,0.125946,1.0,0.010218,0.008932,0.016546,-0.001472,0.011883,0.114673
past_3_years_bike_related_purchases,0.003365,0.00713,-0.007101,0.010218,1.0,-0.009356,-0.018122,0.01135,-0.015272,0.016697
tenure,0.008264,-0.008476,-0.012669,0.008932,-0.009356,1.0,0.011496,-0.018016,0.499967,0.00092
postcode,0.001224,0.012627,0.0115,0.016546,-0.018122,0.011496,1.0,-0.495346,-0.009721,0.009429
property_valuation,-0.000185,-0.009905,7.3e-05,-0.001472,0.01135,-0.018016,-0.495346,1.0,0.012282,-0.011828
age,-0.007693,-0.006321,0.00333,0.011883,-0.015272,0.499967,-0.009721,0.012282,1.0,-0.009513
profit,-0.00623,0.719563,-0.153864,0.114673,0.016697,0.00092,0.009429,-0.011828,-0.009513,1.0


In [None]:
# Numerical columns
number_vars = [f for f in df_all.columns if df_all.dtypes[f] != 'object']
list_nums = ', '.join(number_vars)
list_nums

In [None]:
# Categorical columns
object_vars = [f for f in df_all.columns if df_all.dtypes[f] == 'object']
list_objs = ', '.join(object_vars)
list_objs

In [None]:
# Components in categorical columns
i = 1
for ob in object_vars:
 print(i, '/', ob, '\t',len(df_all[ob].unique()),':', df_all[ob].unique())
 i = i+1

**Categorical variables** include: 'order_status', 'brand', 'product_line', 'product_class', 'product_size', 'gender', 'job_industry_category', 'wealth_segment', 'deceased_indicator', 'owns_car', 'state', 'country'

## 3. ANALYZE

_Analyzing variables based on NewCustomerList table_

### 3.1. 'order_status'

In [None]:
df_all_status = df_all.groupby(['order_status']).order_status.count()
df_all_status

In [None]:
plt.figure(figsize=(6,4))
df_all_status.plot.bar(color=['coral', 'sienna'], title='Approved Vs. Cancelled')
plt.xlabel('Order Status')
plt.ylabel('Count');

- Number of approved status is much greater than cancelled status

### 3.2. 'brand'

In [None]:
df_all_brand = df_all.groupby(['brand']).brand.count().sort_values()
df_all_brand

In [None]:
plt.figure(figsize=(6,4))
df_all_brand.plot.bar(color=['coral', 'sienna'], title='Comparasion of Brands')
plt.xlabel('Brands')
plt.ylabel('Count');

- Solex has highest number of customers (4253)
- Norco Bicycles has smallest number of customers (2910)

### 3.3. 'gender'

In [None]:
df_all['gender'].unique()

**Transform 'M' to 'Male'; 'Femal' and 'F' to 'Female'**

In [None]:
df_all['gender'] = df_all['gender'].replace('M', 'Male')
df_all['gender'] = df_all['gender'].replace('F', 'Female')
df_all['gender'] = df_all['gender'].replace('Femal', 'Female')

In [None]:
df_all_gen = df_all.groupby(['gender']).gender.count().sort_values()
df_all_gen

In [None]:
plt.figure(figsize=(6,4))
df_all_gen.plot.bar(color=['coral', 'sienna'], title='Comparison of Gender')
plt.xlabel('Gender')
plt.ylabel('Count');

- Gender records where ‘U’ have been replaced based on the distribution from the training dataset
- Male and Female have similar number of customers

### 3.4. 'job_industry_category'

In [None]:
df_all_jobcate = df_all.groupby(['job_industry_category']).job_industry_category.count().sort_values()
df_all_jobcate

In [None]:
plt.figure(figsize=(6,4))
df_all_jobcate.plot.bar(color=['coral', 'sienna'], title='Comparasion of Job Industry Categories')
plt.xlabel('Categories')
plt.ylabel('Count');

- There are lots of customers are working at Financial Services and Manufacturing.

### 3.5. 'wealth_segment'

In [None]:
df_all_wealthseg = df_all.groupby(['wealth_segment']).wealth_segment.count().sort_values()
df_all_wealthseg

In [None]:
plt.figure(figsize=(6,4))
df_all_wealthseg.plot.bar(color=['coral', 'sienna'], title='Comparasion of Wealth Segments')
plt.xlabel('Segments')
plt.ylabel('Count');

- Mass Customer is the highest

### 3.6. 'deceased_indicator'

In [None]:
df_all_decind = df_all.groupby(['deceased_indicator']).deceased_indicator.count().sort_values()
df_all_decind

In [None]:
plt.figure(figsize=(6,4))
df_all_decind.plot.bar(color=['coral', 'sienna'], title='Deceased Indicator')
plt.xlabel('Yes/No')
plt.ylabel('Count');

### 3.7. 'owns_car'

In [None]:
df_all_own = df_all.groupby(['owns_car']).owns_car.count().sort_values()
df_all_own

In [None]:
plt.figure(figsize=(6,4))
df_all_own.plot.bar(color=['coral', 'sienna'], title='Customer Owns Car')
plt.xlabel('Yes/No')
plt.ylabel('Count');

### 3.8. 'state'

In [None]:
df_all_state = df_all.groupby(['state']).state.count().sort_values()
df_all_state

In [None]:
plt.figure(figsize=(6,4))
df_all_state.plot.bar(color=['coral', 'sienna'], title='Comparasion of States')
plt.xlabel('States')
plt.ylabel('Count');

### 3.10. 'postcode'

### 3.11. 'DOB'

### 3.12. 'tenure'

### 3.13. 'past_3_years_bike_related_purchases'

### 3.14. 'property_valuation'