## Import Modules

In [1]:
# Set paths
import os
from imp import reload

# Data manipulation
import pandas as pd
import numpy as np
from scipy import stats

# Geolocation
import geonamescache

# Custom package for data preprocessing
import mytools as mt 

# Set notebook options
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Pretty display of multiple functions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

### Create Project Directories and Sub-Directories

In [3]:
mt.create_directories(filepaths.project_parent_directory)

Directory  deliverables  already exists
Directory  deprecated  already exists
Directory  dictionary  already exists
Directory  visualizations  already exists
Subdirectory  data/raw  already exists
Subdirectory  data/interim  already exists
Subdirectory  data/external  already exists
Subdirectory  data/processed  already exists


## Load Data

In [4]:
df_customer = pd.read_csv(filepaths.raw_customer_data, sep=',')

df_cities = pd.read_csv(filepaths.raw_city_data, sep=',')

df_us_regions = pd.read_csv(filepaths.raw_us_regions_data, sep=',')

df_transactions = pd.read_csv(filepaths.raw_transactions_data, sep=',')

df_products = pd.read_csv(filepaths.raw_products_data, sep=',')

## Data Audit + Cleaning
### Customer Data

In [5]:
df_customer.columns = [c.lower() for c in df_customer.columns]
df_customer.duplicated(subset=['customer_id']).sum()
df_customer.shape
mt.check_unique_no(df_customer, ['customer_id'])
mt.missing_data_table(df_customer)
df_customer.head()

0

(5647, 4)

Data has 5647 unique customer_id
Missing data distribution:

  Variable  Count  Proportion
    gender      2        0.00
 city_code      2        0.00


Unnamed: 0,customer_id,dob,gender,city_code
0,268408,02-01-1970,M,4.0
1,269696,07-01-1970,F,8.0
2,268159,08-01-1970,F,8.0
3,270181,10-01-1970,F,2.0
4,268073,11-01-1970,M,1.0


 **Replace the missing values in 'gender' and 'city_code' features**

In [6]:
df_customer['gender'] = df_customer['gender'].replace({np.nan:df_customer['gender'].mode()[0]})
df_customer['city_code'] = df_customer['city_code'].replace({np.nan:df_customer['city_code'].mode()[0]})

### Cities Data

In [7]:
df_cities.columns = [c.lower() for c in df_cities.columns]
df_cities.duplicated(subset=['city']).sum()
df_cities.shape
mt.check_unique_no(df_cities, ['city_code'])
mt.missing_data_table(df_cities)
df_cities.head()

0

(10, 2)

Data has 10 unique city_code
There is no missing data


Unnamed: 0,city_code,city
0,1,Arlington
1,2,Miami
2,3,Chicago
3,4,Pittsburgh
4,5,Dayton


### USA Regions Data

In [8]:
df_us_regions.columns = [c.lower() for c in df_us_regions.columns]
df_us_regions.duplicated(subset=['state']).sum()
df_us_regions.shape
mt.missing_data_table(df_us_regions)
df_us_regions.head()

0

(51, 4)

There is no missing data


Unnamed: 0,state,state code,region,division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


### Products Data

In [9]:
df_products.rename(columns={'prod_sub_cat_code':'prod_subcat_code'}, inplace=True)
df_products['counter'] = 1
mt.missing_data_table(df_products)
df_products.head()

There is no missing data


Unnamed: 0,prod_cat_code,prod_cat,prod_subcat_code,prod_subcat,counter
0,1,Clothing,4,Mens,1
1,1,Clothing,1,Women,1
2,1,Clothing,3,Kids,1
3,2,Footwear,1,Mens,1
4,2,Footwear,3,Women,1


In [10]:
prod_cat_dict = dict(zip(df_products['prod_cat_code'],df_products['prod_cat']))
prod_subcat_dict = dict(zip(df_products['prod_subcat_code'],df_products['prod_subcat']))

### Transactions Data

In [11]:
df_transactions.columns = [c.lower() for c in df_transactions.columns]
df_transactions.rename(columns={'transaction_id':'trans_id', 'cust_id':'customer_id', 'tran_date':'trans_date'}, inplace=True)
df_transactions.duplicated(subset=['trans_id']).sum()
df_transactions.shape
mt.check_unique_no(df_transactions, ['customer_id', 'trans_id'])
mt.missing_data_table(df_transactions)
df_transactions.head()
df_transactions.describe()

2175

(23053, 10)

Data has 5506 unique customer_id
Data has 20878 unique trans_id
There is no missing data


Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type
0,80712190438,270351,28-02-2014,1,1,-5,-772,405.3,-4265.3,e-Shop
1,29258453508,270384,27-02-2014,5,3,-5,-1497,785.92,-8270.92,e-Shop
2,51750724947,273420,24-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop
3,93274880719,271509,24-02-2014,11,6,-3,-1363,429.35,-4518.35,e-Shop
4,51750724947,273420,23-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop


Unnamed: 0,trans_id,customer_id,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt
count,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0
mean,50073480358.45,271021.75,6.15,3.76,2.43,636.37,248.67,2107.31
std,28981936062.0,2431.69,3.73,1.68,2.27,622.36,187.18,2507.56
min,3268991.0,266783.0,1.0,1.0,-5.0,-1499.0,7.35,-8270.92
25%,24938639453.0,268935.0,3.0,2.0,1.0,312.0,98.28,762.45
50%,50093131361.0,270980.0,5.0,4.0,3.0,710.0,199.08,1754.74
75%,75329995679.0,273114.0,10.0,5.0,4.0,1109.0,365.71,3569.15
max,99987549630.0,275265.0,12.0,6.0,5.0,1500.0,787.5,8287.5


In [12]:
df_transactions['trans_id'].value_counts(dropna=False).head()

426787191      4
4170892941     4
32263938079    4
3130889793     3
42255136382    3
Name: trans_id, dtype: int64

In [13]:
df_transactions[df_transactions['trans_id'] == 4170892941]

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type
16207,4170892941,266852,24-12-2011,8,3,-1,-412,43.26,-455.26,MBR
16267,4170892941,266852,21-12-2011,8,3,-1,-412,43.26,-455.26,MBR
16269,4170892941,266852,21-12-2011,8,3,-1,-412,43.26,-455.26,MBR
16288,4170892941,266852,20-12-2011,8,3,1,412,43.26,455.26,MBR


In [14]:
df_transactions[df_transactions['customer_id'] == 266852]

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type
8514,37550766365,266852,1/1/2013,3,1,3,1125,354.38,3729.38,e-Shop
15309,6136914509,266852,8/2/2012,3,2,1,1211,127.16,1338.15,e-Shop
15676,99967775192,266852,20-01-2012,2,6,2,539,113.19,1191.19,Flagship store
16207,4170892941,266852,24-12-2011,8,3,-1,-412,43.26,-455.26,MBR
16267,4170892941,266852,21-12-2011,8,3,-1,-412,43.26,-455.26,MBR
16269,4170892941,266852,21-12-2011,8,3,-1,-412,43.26,-455.26,MBR
16288,4170892941,266852,20-12-2011,8,3,1,412,43.26,455.26,MBR


**Merge products data with transactions data**

In [15]:
df_transactions = pd.merge(df_transactions, df_products, on=['prod_cat_code', 'prod_subcat_code'], how='left')
df_transactions['product'] = df_transactions['prod_cat'] + '_' + df_transactions['prod_subcat']
df_transactions.head()

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,product
0,80712190438,270351,28-02-2014,1,1,-5,-772,405.3,-4265.3,e-Shop,Clothing,Women,1,Clothing_Women
1,29258453508,270384,27-02-2014,5,3,-5,-1497,785.92,-8270.92,e-Shop,Electronics,Computers,1,Electronics_Computers
2,51750724947,273420,24-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop,Books,DIY,1,Books_DIY
3,93274880719,271509,24-02-2014,11,6,-3,-1363,429.35,-4518.35,e-Shop,Home and kitchen,Bath,1,Home and kitchen_Bath
4,51750724947,273420,23-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop,Books,DIY,1,Books_DIY


**Tag transactions where items where returned**

In [16]:
returned_item = df_transactions['trans_id'].value_counts(dropna=False)[df_transactions['trans_id'].value_counts(dropna=False).values > 1].index.unique();
len(returned_item)

2057

In [17]:
df_transactions['returned_item'] = np.where(df_transactions['trans_id'].isin(returned_item), 'yes', 'no')
df_transactions['returned_item'].value_counts(dropna=False)

no     18821
yes     4232
Name: returned_item, dtype: int64

**Drop transaction records (with negative qty) of items returned**

In [18]:
df_transactions = df_transactions[~(df_transactions['qty'] < 0)]
df_transactions['qty'].describe()
df_transactions.duplicated(subset=['trans_id']).sum()

mt.check_unique_no(df_transactions, ['customer_id', 'trans_id'])
df_transactions.shape

count   20876.00
mean        3.00
std         1.42
min         1.00
25%         2.00
50%         3.00
75%         4.00
max         5.00
Name: qty, dtype: float64

0

Data has 5506 unique customer_id
Data has 20876 unique trans_id


(20876, 15)

**Replace 'qty', 'rate', 'tax', 'total_amt', 'counter' of transactions with returned items with 0**

In [19]:
df_transactions['qty'] = np.where((df_transactions['returned_item'] == 'yes'), 0, df_transactions['qty'])
df_transactions['rate'] = np.where((df_transactions['returned_item'] == 'yes'), 0, df_transactions['rate'])
df_transactions['tax'] = np.where((df_transactions['returned_item'] == 'yes'), 0, df_transactions['tax'])
df_transactions['total_amt'] = np.where((df_transactions['returned_item'] == 'yes'), 0, df_transactions['total_amt'])
df_transactions['counter'] = np.where((df_transactions['returned_item'] == 'yes'), 0, df_transactions['counter'])
df_transactions.head()

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,product,returned_item
10,29258453508,270384,20-02-2014,5,3,0,0,0.0,0.0,e-Shop,Electronics,Computers,0,Electronics_Computers,yes
11,25455265351,267750,20-02-2014,12,6,3,1360,428.4,4508.4,e-Shop,Home and kitchen,Tools,1,Home and kitchen_Tools,no
12,1571002198,275023,20-02-2014,6,5,4,587,246.54,2594.54,e-Shop,Books,DIY,1,Books_DIY,no
14,36554696014,269345,20-02-2014,3,5,3,1253,394.69,4153.69,e-Shop,Books,Comics,1,Books_Comics,no
15,56814940239,268799,20-02-2014,7,5,5,368,193.2,2033.2,e-Shop,Books,Fiction,1,Books_Fiction,no


In [20]:
df_transactions['store_type'].value_counts(dropna=False)

e-Shop            8429
MBR               4210
Flagship store    4145
TeleShop          4092
Name: store_type, dtype: int64

In [21]:
df_transactions = df_transactions.sort_values(['customer_id','trans_date'])
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])

### Sanity Check Data

In [22]:
mt.check_unique_no(df_customer, ['customer_id'])
df_customer.shape

mt.check_unique_no(df_transactions, ['customer_id', 'trans_id'])
df_transactions.shape

Data has 5647 unique customer_id


(5647, 4)

Data has 5506 unique customer_id
Data has 20876 unique trans_id


(20876, 15)

In [23]:
df_transactions['trans_date'].sort_values().min()
df_transactions['trans_date'].sort_values().max()

Timestamp('2011-01-02 00:00:00')

Timestamp('2014-12-02 00:00:00')

In [24]:
df_transactions['total_amt'].sum()
df_transactions['tax'].sum()
df_transactions['rate'].sum()
df_transactions['qty'].sum()

48903316.475

4646921.475

14766262

56434

## Export Data

In [25]:
df_customer.to_csv(filepaths.clean_customer_data_v1, index=False)
df_us_regions.to_csv(filepaths.clean_us_regions_data_v1, index=False)
df_cities.to_csv(filepaths.clean_cities_data_v1, index=False)
df_transactions.to_csv(filepaths.clean_transactions_data_v1, index=False)