# 1. Load data

In [99]:
# import csv file

import pandas as pd 

file = pd.ExcelFile('../data/Raw_data.xlsx')
file.sheet_names

['Transactions', 'NewCustomerList', 'CustomerDemographic', 'CustomerAddress']

In [100]:
newcust_data = pd.read_excel('../data/Raw_data.xlsx', sheet_name='NewCustomerList')

In [101]:
newcust_data.head(2)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,...,QLD,Australia,6,0.81,1.0125,1.265625,1.075781,1,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,...,NSW,Australia,11,0.75,0.75,0.9375,0.796875,1,1,1.71875


In [102]:
newcust_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            971 non-null    object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   int64  
 4   DOB                                  983 non-null    object 
 5   job_title                            894 non-null    object 
 6   job_industry_category                835 non-null    object 
 7   wealth_segment                       1000 non-null   object 
 8   deceased_indicator                   1000 non-null   object 
 9   owns_car                             1000 non-null   object 
 10  tenure                               1000 non-null   int64  
 11  address                        

# 2. Find missing values

In [103]:
newcust_data.isnull().sum()

first_name                               0
last_name                               29
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     17
job_title                              106
job_industry_category                  165
wealth_segment                           0
deceased_indicator                       0
owns_car                                 0
tenure                                   0
address                                  0
postcode                                 0
state                                    0
country                                  0
property_valuation                       0
Unnamed: 16                              0
Unnamed: 17                              0
Unnamed: 18                              0
Unnamed: 19                              0
Unnamed: 20                              0
Rank                                     0
Value                                    0
dtype: int6

In [104]:
newcust_data.isnull().mean()*100

first_name                              0.0
last_name                               2.9
gender                                  0.0
past_3_years_bike_related_purchases     0.0
DOB                                     1.7
job_title                              10.6
job_industry_category                  16.5
wealth_segment                          0.0
deceased_indicator                      0.0
owns_car                                0.0
tenure                                  0.0
address                                 0.0
postcode                                0.0
state                                   0.0
country                                 0.0
property_valuation                      0.0
Unnamed: 16                             0.0
Unnamed: 17                             0.0
Unnamed: 18                             0.0
Unnamed: 19                             0.0
Unnamed: 20                             0.0
Rank                                    0.0
Value                           

## (a) 'last_name' column
- 2.9% missing
- Replace it with a string - 'yy'

In [105]:
# Replace missing last_name with 'yy' 
newcust_data['last_name'].fillna('yy', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  newcust_data['last_name'].fillna('yy', inplace=True)


In [106]:
newcust_data['last_name'].isnull().sum()

0

## (b) 'DOB' column
- 1.7% missing values
- Less than 5% - we will drop these rows

In [107]:
newcust_data = newcust_data.dropna(subset=['DOB'])


In [108]:
newcust_data['DOB'].isnull().sum() 

0

- We will convert DOB to datetime format as it is object format.


In [109]:
newcust_data['DOB'] = pd.to_datetime(newcust_data['DOB'])

- But there is data inconsistency in DOB column
- Some cells have time also with date
- We will correct it.

In [110]:
newcust_data['DOB'] 

0     1957-07-12
1     1970-03-22
2     1974-08-28
3     1979-01-28
4     1965-09-21
         ...    
995   1959-10-07
996   2001-10-17
997   1954-10-05
998   1952-12-17
999   1955-10-02
Name: DOB, Length: 983, dtype: datetime64[ns]

## (c) 'job_title' column
- 10.6% missing values - very large
- Replace it with - 'not known'


In [111]:
newcust_data['job_title'].fillna('no known', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  newcust_data['job_title'].fillna('no known', inplace=True)


In [112]:
newcust_data['job_title'].isnull().sum()  # No null in job title now

0

## (d) 'job_industry_category' column
- 16.5% missing
- Replace with 'not known' 

In [113]:
newcust_data['job_industry_category'].fillna('not known', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  newcust_data['job_industry_category'].fillna('not known', inplace=True)


In [114]:
# Check again for any missing value

newcust_data.isnull().sum()  # Finally no value missing

first_name                             0
last_name                              0
gender                                 0
past_3_years_bike_related_purchases    0
DOB                                    0
job_title                              0
job_industry_category                  0
wealth_segment                         0
deceased_indicator                     0
owns_car                               0
tenure                                 0
address                                0
postcode                               0
state                                  0
country                                0
property_valuation                     0
Unnamed: 16                            0
Unnamed: 17                            0
Unnamed: 18                            0
Unnamed: 19                            0
Unnamed: 20                            0
Rank                                   0
Value                                  0
dtype: int64

In [115]:
# Check all columns
newcust_data.columns

# Remove columns Unnamed: 16 to 20
newcust_cleaned = newcust_data.drop(columns=['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20'])


# 3. Analysis on New Customer data

## 3.1 Gender counts of new customer

In [116]:
gender_count = newcust_cleaned.groupby('gender').agg({'gender':'count'}).rename({'gender':'gender_count'},axis=1).reset_index()
print(gender_count)

   gender  gender_count
0  Female           513
1    Male           470


## 3.2 Age of New_customers
- Get reference date from transaction_data
- wrt reference date find age of all new customers.

In [117]:
# Get reference date

transaction_data = pd.read_csv('transaction_data_cleaned.csv')
ref_date = max(transaction_data['transaction_date'])
type(ref_date)
ref_date = pd.to_datetime(ref_date)

In [118]:
# Calculate age of each new customer

newcust_cleaned['newcust_age'] = ((ref_date - newcust_cleaned['DOB']).dt.days/365.25)
newcust_cleaned['newcust_age'] = (newcust_cleaned['newcust_age']).astype(int)

# Convert days to years
# newcust_cleaned['age'] = newcust_cleaned['age']/365.25

In [119]:
newcust_cleaned.head(5)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value,newcust_age
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.71875,60
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.71875,47
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.71875,43
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703125,38
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703125,52


# 4. Export

In [120]:
newcust_cleaned.to_csv('new_customer_cleaned.csv', index= False)