# Import libraries

In [1]:
import pandas as pd
import numpy as np
import re

# Import CSV Data - Oddle_Assignment Customers Data.csv

In [2]:
# Load the customers CSV into a pandas DataFrame
cust_df = pd.read_csv("Oddle_Assignment Customers Data.csv")

In [3]:
# Preview the first rows of the imported dataframe
cust_df.head()

Unnamed: 0.1,Unnamed: 0,created_on,index,email,gender,age
0,1,2017-12-06 08:21:19,1,JAODT1736K@hotmail.com,F,35-44
1,2,2018-02-05 03:48:40,2,EMLPI8026E@hotmail.com,F,25-34
2,3,2018-02-06 07:56:41,3,WVQCK7883L@gmail.com,F,35-44
3,4,2018-01-01 00:41:11,4,MGWAZ5408T@hotmail.com,M,25-34
4,5,2018-01-01 12:31:54,5,FCAUD7784F@hotmail.com,F,25-34


In [4]:
## Drop the leftover 'Unnamed: 0' column that came in with the CSV.
## errors='ignore' keeps this cell safe to re-run after the column is already gone.
cust_df = cust_df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Re-check the dataframe after dropping 'Unnamed: 0'
cust_df.head()

Unnamed: 0,created_on,index,email,gender,age
0,2017-12-06 08:21:19,1,JAODT1736K@hotmail.com,F,35-44
1,2018-02-05 03:48:40,2,EMLPI8026E@hotmail.com,F,25-34
2,2018-02-06 07:56:41,3,WVQCK7883L@gmail.com,F,35-44
3,2018-01-01 00:41:11,4,MGWAZ5408T@hotmail.com,M,25-34
4,2018-01-01 12:31:54,5,FCAUD7784F@hotmail.com,F,25-34


# Dataset Overview

## Check Dimensions of the Dataset.

In [6]:
# Check the dimensions (rows, columns) of the dataset
cust_df.shape

# 50440 observations (rows)
# 5 features (columns, including the 'index' column)

(50440, 5)

## Check for Missing Values

In [7]:
# Count missing values per column (isna performs the same check as isnull)
cust_df.isna().sum()

## Every feature reports zero missing values

created_on    0
index         0
email         0
gender        0
age           0
dtype: int64

## Check for Datatypes (changed accordingly for work to be done in python)

In [8]:
# Check the data type of each column
cust_df.dtypes

## From the data dictionary provided in the Assignment PDF:
# index == customer index no. (unique)
# created_on == date-time
# email == string object
# gender == string, categorical
# age == string, categorical (bins)

created_on    object
index          int64
email         object
gender        object
age           object
dtype: object

## Check for Duplicates, Validity etc

### Index

In [9]:
# Count rows whose 'index' value repeats that of an earlier row
cust_df.duplicated(subset='index').sum()

### Zero repeats. Good.

0

### created_on (Date time)

In [10]:
# Count repeated values in 'created_on'
cust_df['created_on'].duplicated().sum()

# 57 rows repeat an earlier created date and time

### These are ignored: nothing is wrong with users signing up at the same moment.
### Uniqueness matters for the customer IDs, not for the timestamps.

57

In [11]:
## Optional: uncomment the line below to inspect the rows with a repeated created_on
## timestamp (it shows the non-null count per column for those rows)
##cust_df[cust_df['created_on'].duplicated()].count()

In [12]:
# Adjust datatypes for the dataframe:
## created_on: parse the string (object) column into datetime values
cust_df['created_on'] = pd.to_datetime(cust_df['created_on'])

In [13]:
# Latest created_on timestamp in the dataset
cust_df['created_on'].max()

Timestamp('2018-10-19 06:56:15')

In [14]:
# Earliest created_on timestamp in the dataset
cust_df['created_on'].min()

Timestamp('2014-03-04 18:00:32')

### email

In [15]:
# Count repeated values in 'email'
cust_df['email'].duplicated().sum()

### None. Good - every email address appears only once.

0

In [16]:
## Email checker function - check that an address has the shape
## <local part>@<domain>.<tld>, where the ending is 2-3 letters or, for a
## numeric host (optionally wrapped in [ ]), 1-3 digits.
## NOTE: the length rule is "more than 7 characters", so the 7-character
## address "a@a.com" is rejected by default.

# Compiled once. Raw string so "\[" and "\." are regex escapes, not string escapes;
# the dot before the ending is escaped so it must be a literal ".".
_EMAIL_PATTERN = re.compile(
    r".+@\[?[a-zA-Z0-9.-]+\.(?:[a-zA-Z]{2,3}|[0-9]{1,3})\]?"
)


def is_valid_email(email, min_length=8):
    """Return True when ``email`` looks like a syntactically valid address.

    Parameters
    ----------
    email : object
        Value to check. Anything that is not a ``str`` (e.g. ``None`` or NaN)
        is reported as invalid instead of raising.
    min_length : int, default 8
        Minimum number of characters required. The default keeps the original
        "longer than 7 characters" rule.

    Returns
    -------
    bool
        True if the whole string matches the email pattern and is long enough,
        otherwise False.
    """
    if not isinstance(email, str) or len(email) < min_length:
        return False
    # fullmatch (rather than match + "$") also rejects a trailing newline.
    return _EMAIL_PATTERN.fullmatch(email) is not None

In [17]:
## Sanity check: "a@a.com" is exactly 7 characters, so the "more than 7 chars" rule rejects it (False)
is_valid_email("a@a.com")

False

In [18]:
# Flag every email as valid/invalid; the checker is passed directly, no lambda wrapper needed
cust_df['email_valid'] = cust_df['email'].apply(is_valid_email)

In [19]:
# Tally valid vs. invalid emails
cust_df['email_valid'].value_counts()

True     50134
False      306
Name: email_valid, dtype: int64

In [20]:
## Optional: uncomment the line below to list the rows flagged as invalid
#cust_df[cust_df['email_valid'] == False]

## IGNORE email address issues. Assumption: all emails are VALID.

### gender

In [21]:
# Check the category counts for 'gender'
cust_df['gender'].value_counts()

F          35169
M          10223
Unknown     5048
Name: gender, dtype: int64

In [22]:
cust_df['gender'].count()
## 35169 + 10223 + 5048 = 50440 (matches the total row count)

## Too little data to impute
## Imputation method to try: KNN
## Possible reasons for 'Unknown':
## 1: Not compulsory to fill in the field or select from dropdown.
## 2: Can also be seen as 'unspecified'
## 3: In this day and age M/F might not be enough to describe oneself.

## Will IGNORE this for now.

50440

### age

In [23]:
## Check the counts per AGE BIN
cust_df['age'].value_counts()

## Too little data to impute;
## Imputation method to try: KNN
## Other method:
## - Exclude the Unknowns.
## - Get the percentage of each known age group, then distribute the Unknowns by those percentages

## Possible reasons for 'Unknown':
## 1: Not compulsory to fill in the field or select from dropdown.
## 2: Can also be seen as 'unspecified'

25-34      15606
Unknown    12081
35-44       9488
18-24       6548
45-54       4101
55-64       1571
65+         1045
Name: age, dtype: int64

In [24]:
# Re-check data types after the datetime conversion and the added email_valid column
cust_df.dtypes

created_on     datetime64[ns]
index                   int64
email                  object
gender                 object
age                    object
email_valid              bool
dtype: object

# Summary of Customer dataset

- The dataset consists of **50440 observations (rows) and 5 features (columns, including index)**
- No missing values or nulls or NaNs in all columns
- This dataset spans from **2014-03-04** to **2018-10-19**

<b>From the data dictionary provided in the Assignment PDF</b>
- index == customer index no. (unique)
- created_on  ==  Date Time
- email == string obj
- gender == string categorical
- age == string categorical (bins)

<b>Data validity:</b>
- indexes are unique. No duplicates
- created_on has 57 duplicates of the created date and time
    - However this is to be ignored. Nothing wrong with users signing up simultaneously.
    - We would be more concerned with unique IDs
- email has no duplicates.
    - A potential concern was for invalid email addresses
    - A simple email check was done (checks for "@", "." and more than 7 chars)
    - 306 emails were flagged as invalid
    - However, for this case study, since all emails are encrypted, we will assume that they are all valid
- gender has 3 categories [ M / F / Unknown ]
    - Unknown possibilities:
        - 1: Not compulsory to fill in the field or select from dropdown.
        - 2: Can also be seen as 'unspecified'
        - 3: In this day and age M/F might not be enough to describe oneself.
    - Might relook at imputation later.
- age has 7 bins [ 18-24 / 25-34 / 35-44 / 45-54 / 55-64 / 65+ / Unknown ]
    - Unknown possibilities:
        - 1: Not compulsory to fill in the field or select from dropdown.
        - 2: Can also be seen as 'unspecified'
    - Might relook at imputation later.

# Please see Tableau Dashboard for Customers Data for Viz

https://public.tableau.com/profile/ziig.yee#!/vizhome/Oddle_Customer_Overview/CustDataOverview

- Please click the full-screen button at the bottom left of the viz
<img src="fullscreen.png" alt="Full-screen button at the bottom left of the viz">