# Exploratory Data Analysis of the Customer Dataset
This EDA is performed after the Data Quality Analysis and the data-cleaning process have been carried out on the dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import math
from datetime import datetime, date
# Apply matplotlib's built-in 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [2]:
# Read the cleaned Transactions and Customer Demographics CSV files into DataFrames.
# Both paths are relative to the notebook's working directory.

trans = pd.read_csv(filepath_or_buffer='Transactions_cleaned.csv')
cust_demo = pd.read_csv(filepath_or_buffer='CustomerDemographic_cleaned.csv')

In [3]:
# Preview the first five rows of the Transactions dataset

trans.head(n=5)

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date,profit
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0,17.87
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0,1702.55
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0,1544.61
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0,817.36
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0,1055.82


In [5]:
# Report the size of the Transactions dataset: row count first, then column count

print(
    f'Total records in Transactions Dataset: {len(trans)}',
    f'Total features in the Transactions dataset: {len(trans.columns)}',
    sep='\n',
)

Total records in Transactions Dataset: 19803
Total features in the Transactions dataset: 14


In [6]:
# Preview the first five rows of the Customer Demographics dataset

cust_demo.head(n=5)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,Age
0,1,Laraine,Medendorp,Female,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,Yes,11.0,70
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,Yes,16.0,43
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,Yes,15.0,69
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,No,7.0,62
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,Yes,8.0,46


In [7]:
# Report the size of the Customer Demographics dataset: row count first, then column count

print(
    f'Total records in Customer Demographics Dataset: {len(cust_demo)}',
    f'Total features in the Customer Demographics dataset: {len(cust_demo.columns)}',
    sep='\n',
)

Total records in Customer Demographics Dataset: 3912
Total features in the Customer Demographics dataset: 13


In [8]:
# Inner-join Transactions with Customer Demographics on the shared 'customer_id' key.
# Only transactions whose customer also appears in the demographics data are kept.

merged_trans_cust = trans.merge(cust_demo, on='customer_id', how='inner')

In [9]:
# Preview the first five rows of the merged dataset

merged_trans_cust.head(n=5)

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,...,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,Age
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,...,Male,19,1955-01-11,Software Engineer I,Financial Services,Mass Customer,N,Yes,10.0,68
1,11065,1,2950,2017-10-16,0.0,Approved,Giant Bicycles,Standard,medium,medium,...,Male,19,1955-01-11,Software Engineer I,Financial Services,Mass Customer,N,Yes,10.0,68
2,18923,62,2950,2017-04-26,0.0,Approved,Solex,Standard,medium,medium,...,Male,19,1955-01-11,Software Engineer I,Financial Services,Mass Customer,N,Yes,10.0,68
3,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,...,Female,89,1979-02-04,Clinical Specialist,Health,Mass Customer,N,Yes,10.0,44
4,6862,4,3120,2017-10-05,0.0,Approved,Giant Bicycles,Standard,high,medium,...,Female,89,1979-02-04,Clinical Specialist,Health,Mass Customer,N,Yes,10.0,44


In [10]:
# Report the size of the merged dataset: row count first, then column count

print(
    f'Total records in the merged dataset: {len(merged_trans_cust)}',
    f'Total Number of features in the merged dataset: {len(merged_trans_cust.columns)}',
    sep='\n',
)

Total records in the merged dataset: 19354
Total Number of features in the merged dataset: 26


In [11]:
# Print a summary of the merged dataset: column names, non-null counts and dtypes.
merged_trans_cust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19354 entries, 0 to 19353
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   transaction_id                       19354 non-null  int64  
 1   product_id                           19354 non-null  int64  
 2   customer_id                          19354 non-null  int64  
 3   transaction_date                     19354 non-null  object 
 4   online_order                         19354 non-null  float64
 5   order_status                         19354 non-null  object 
 6   brand                                19354 non-null  object 
 7   product_line                         19354 non-null  object 
 8   product_class                        19354 non-null  object 
 9   product_size                         19354 non-null  object 
 10  list_price                           19354 non-null  float64
 11  standard_cost               

`The columns 'transaction_date' and 'product_first_sold_date' are not in date-time format ('transaction_date', for example, is stored as object). Hence the data type of these columns should be changed to date-time format.`

In [12]:
# Convert the date-like columns of the merged dataset to datetime64.

# 'transaction_date' and 'DOB' hold ISO-style date strings (e.g. '2017-02-25'),
# which pd.to_datetime parses directly.
merged_trans_cust['transaction_date'] = pd.to_datetime(merged_trans_cust['transaction_date'])
merged_trans_cust['DOB'] = pd.to_datetime(merged_trans_cust['DOB'])

# 'product_first_sold_date' holds plain numbers such as 41245.0. Calling
# pd.to_datetime on numbers without `unit` treats them as nanoseconds since
# 1970-01-01, so every value would collapse to a timestamp on 1970-01-01.
# NOTE(review): the values look like spreadsheet serial day numbers (assumption -
# confirm against the source workbook). Under that assumption they are day counts
# from 1899-12-30, e.g. 41245 -> 2012-12-02.
first_sold = merged_trans_cust['product_first_sold_date']
if pd.api.types.is_numeric_dtype(first_sold):
    first_sold = pd.to_datetime(first_sold, unit='D', origin='1899-12-30')
else:
    # Column is not numeric (e.g. the cell was re-run and it is already datetime64):
    # fall back to the plain conversion, which leaves datetime values unchanged.
    first_sold = pd.to_datetime(first_sold)
merged_trans_cust['product_first_sold_date'] = first_sold