In [138]:
# Look into Raw data
import pandas as pd 

# Open the workbook only long enough to read its sheet names; the context
# manager closes the underlying file handle when the block exits.
with pd.ExcelFile('../data/Raw_data.xlsx') as file:
    sheet_names = file.sheet_names

# Last expression of the cell: shows the list of sheet names
sheet_names

['Transactions', 'NewCustomerList', 'CustomerDemographic', 'CustomerAddress']

In [139]:
# Load the 'CustomerDemographic' sheet of the raw workbook and preview two rows
cust_demo_data = pd.read_excel(
    '../data/Raw_data.xlsx',
    sheet_name='CustomerDemographic',
)
cust_demo_data.head(n=2)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure
0,1,Laraine,Medendorp,F,93,1953-10-12 00:00:00,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0
1,2,Eli,Bockman,Male,81,1980-12-16 00:00:00,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0


In [140]:
# Number of missing entries in each column
cust_demo_data.isna().sum()

customer_id                              0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     87
job_title                              506
job_industry_category                  656
wealth_segment                           0
deceased_indicator                       0
default                                302
owns_car                                 0
tenure                                  87
dtype: int64

In [141]:
# Share of missing entries in each column, expressed in percent
cust_demo_data.isna().mean().mul(100)

customer_id                             0.000
first_name                              0.000
last_name                               3.125
gender                                  0.000
past_3_years_bike_related_purchases     0.000
DOB                                     2.175
job_title                              12.650
job_industry_category                  16.400
wealth_segment                          0.000
deceased_indicator                      0.000
default                                 7.550
owns_car                                0.000
tenure                                  2.175
dtype: float64

In [142]:
# Handle the columns that contain missing values

# 1. last_name: substitute the placeholder string 'missing' for absent values
cust_demo_data['last_name'] = cust_demo_data['last_name'].fillna(value='missing')

# Sanity check: 'last_name' must no longer contain any nulls
assert cust_demo_data['last_name'].notna().all()


In [143]:
# 2. DOB
# Discard every row whose DOB is missing
cust_demo_data = cust_demo_data.dropna(subset=['DOB'])

# Sanity check: 'DOB' must no longer contain any nulls
assert cust_demo_data['DOB'].notna().all()

In [144]:
# 3. job_title (job_industry_category is filled the same way in the next cell)
# Absent job titles get the placeholder 'Unknown category'
cust_demo_data['job_title'] = cust_demo_data['job_title'].fillna(value='Unknown category')

# Sanity check: 'job_title' must no longer contain any nulls
assert cust_demo_data['job_title'].notna().all()

In [145]:
# Absent industry categories get the placeholder 'Unknown category'
cust_demo_data['job_industry_category'] = (
    cust_demo_data['job_industry_category'].fillna(value='Unknown category')
)

# Sanity check: 'job_industry_category' must no longer contain any nulls
assert cust_demo_data['job_industry_category'].notna().all()

In [146]:
# 4. default
# This column is not needed for the analysis, so remove it entirely
cust_demo_data = cust_demo_data.drop('default', axis='columns')

In [147]:
# Final check: no column may contain a missing value any more
assert not cust_demo_data.isna().any().any()

In [148]:
# Renaming cleaned dataset
# NOTE: this binds a second name to the SAME DataFrame object (no copy is made),
# so any later change made through `cust_demo_data` is also visible through
# `cust_demo_data_cleaned`, and vice versa.

cust_demo_data_cleaned =  cust_demo_data

# Find age
- First get reference date from transaction data
- Find age using DOB and reference date

In [149]:
# Read the cleaned transaction data
transaction_data = pd.read_csv('transaction_data_cleaned.csv')

# Reference date = most recent transaction date.
# Series.max() is vectorised and skips NaT, whereas the builtin max() iterates
# element by element and its result depends on element order when NaT is present.
ref_date = pd.to_datetime(transaction_data['transaction_date']).max()
ref_date = ref_date.date()  # keep only the calendar date (a datetime.date)
print(ref_date)

2017-12-30


In [150]:
# Inspect column dtypes and non-null counts before calculating age
cust_demo_data_cleaned.info()  

<class 'pandas.core.frame.DataFrame'>
Index: 3913 entries, 0 to 3999
Data columns (total 12 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   customer_id                          3913 non-null   int64  
 1   first_name                           3913 non-null   object 
 2   last_name                            3913 non-null   object 
 3   gender                               3913 non-null   object 
 4   past_3_years_bike_related_purchases  3913 non-null   int64  
 5   DOB                                  3913 non-null   object 
 6   job_title                            3913 non-null   object 
 7   job_industry_category                3913 non-null   object 
 8   wealth_segment                       3913 non-null   object 
 9   deceased_indicator                   3913 non-null   object 
 10  owns_car                             3913 non-null   object 
 11  tenure                             

- DOB is stored with the object dtype; convert it to a datetime dtype.

In [151]:
# Parse the DOB column into datetime values (assigned back onto the same frame)
cust_demo_data_cleaned['DOB'] = cust_demo_data_cleaned['DOB'].pipe(pd.to_datetime)

In [152]:
# Display the DOB column to check its dtype after the conversion
cust_demo_data_cleaned['DOB']

0      1953-10-12
1      1980-12-16
2      1954-01-20
3      1961-10-03
4      1977-05-13
          ...    
3994   1975-12-12
3995   1975-08-09
3996   2001-07-13
3998   1973-10-24
3999   1991-11-05
Name: DOB, Length: 3913, dtype: datetime64[ns]

In [153]:
# Convert reference date to datetime
# `ref_date` was reduced to a plain date with `.date()` when it was computed;
# pd.to_datetime turns it back into a pandas datetime value so it can be
# subtracted from the DOB column in the age calculation.
ref_date = pd.to_datetime(ref_date)

In [159]:
# Find age
# Age in completed years at ref_date, computed from calendar fields rather than
# days/365.25: the division truncates to one year too few on an exact birthday
# (e.g. DOB 2000-12-30 vs 2017-12-30 gives 6209/365.25 = 16.999 -> 16).
dob = cust_demo_data_cleaned['DOB']
birthday_reached = (
    (dob.dt.month < ref_date.month)
    | ((dob.dt.month == ref_date.month) & (dob.dt.day <= ref_date.day))
)

# Assign on `cust_demo_data_cleaned`, the frame that is exported, instead of
# relying on `cust_demo_data` being the same object.
cust_demo_data_cleaned['age'] = (
    ref_date.year - dob.dt.year - (~birthday_reached).astype(int)
).astype(int)


In [160]:
# Inspect the frame that is exported below, to confirm the new 'age' column
# and the datetime dtype of 'DOB'
cust_demo_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3913 entries, 0 to 3999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          3913 non-null   int64         
 1   first_name                           3913 non-null   object        
 2   last_name                            3913 non-null   object        
 3   gender                               3913 non-null   object        
 4   past_3_years_bike_related_purchases  3913 non-null   int64         
 5   DOB                                  3913 non-null   datetime64[ns]
 6   job_title                            3913 non-null   object        
 7   job_industry_category                3913 non-null   object        
 8   wealth_segment                       3913 non-null   object        
 9   deceased_indicator                   3913 non-null   object        
 10  owns_car         

In [161]:
# Write the cleaned customer demographic data to CSV, omitting the row index
cust_demo_data_cleaned.to_csv(
    path_or_buf='customer_demography_cleaned.csv',
    index=False,
)