In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Upload TRANSACTION Data From CSV

In [None]:
#Load Transactions Data 
Transactions= pd.read_csv ('TRANSACTION_TAKEHOME (1).csv')

### Data Exploration and Cleaning

In [None]:
Transactions['FINAL_QUANTITY'] = Transactions['FINAL_QUANTITY'].replace(['', ' '], np.nan)
Transactions['FINAL_SALE'] = Transactions['FINAL_SALE'].replace(['', ' '], np.nan)
Transactions.head()

In [133]:
Transactions.shape

# Get overview
Transactions.info()

# CHeck missing values
Transactions.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   RECEIPT_ID      50000 non-null  object 
 1   PURCHASE_DATE   50000 non-null  object 
 2   SCAN_DATE       50000 non-null  object 
 3   STORE_NAME      50000 non-null  object 
 4   USER_ID         50000 non-null  object 
 5   BARCODE         44238 non-null  float64
 6   FINAL_QUANTITY  50000 non-null  object 
 7   FINAL_SALE      37500 non-null  object 
dtypes: float64(1), object(7)
memory usage: 3.1+ MB


RECEIPT_ID            0
PURCHASE_DATE         0
SCAN_DATE             0
STORE_NAME            0
USER_ID               0
BARCODE            5762
FINAL_QUANTITY        0
FINAL_SALE        12500
dtype: int64

Barcpde has missing values.

**Check Duplicated Rows**

In [65]:
Transactions.nunique()

RECEIPT_ID        24440
PURCHASE_DATE        89
SCAN_DATE         24440
STORE_NAME          954
USER_ID           17694
BARCODE           11027
FINAL_QUANTITY       87
FINAL_SALE         1435
dtype: int64

In [117]:
# Display all duplicate rows
duplicated_rows = Transactions[Transactions.duplicated()] .sort_values(by = ['RECEIPT_ID', 'USER_ID'])
print("Duplicated rows:")
print(duplicated_rows)

Duplicated rows:
                                 RECEIPT_ID PURCHASE_DATE  \
45553  007d3232-3990-497f-a081-549e9e7a478b    2024-06-25   
49759  01a70fe0-026f-4bea-9da4-7d13bbf21e9a    2024-09-02   
32463  0273cbd8-1620-46c9-8e99-6971e850a2fc    2024-09-08   
34323  0764c668-2863-49e3-8914-52772620b05d    2024-09-07   
36222  0be1ab3d-ea77-48a2-b954-7ba1a5a12c79    2024-07-12   
...                                     ...           ...   
33356  f2e87c6f-c9ef-4615-9ad2-a1b832f8f69e    2024-07-11   
43320  f79e00be-fbc1-4a27-b68c-a5f406c9a5e0    2024-09-02   
48463  f871a430-7fcb-4d95-989e-aa0b57497eca    2024-09-01   
41604  fa8ab2d7-b051-47d7-bd56-d0d88997d367    2024-07-22   
46640  fb825ba4-fe3b-45b4-a547-5a33d23e5e33    2024-08-24   

                             SCAN_DATE              STORE_NAME  \
45553 2024-06-27 21:21:53.442000+00:00  DOLLAR TREE STORES INC   
49759 2024-09-07 16:02:39.835000+00:00                 WALMART   
32463 2024-09-08 22:17:11.989000+00:00              

**Duplicted rows based on Receipt_Id**

In [110]:
duplicated_rows_rec = Transactions[Transactions.duplicated(subset=['RECEIPT_ID'], keep=False)].sort_values(by = 'RECEIPT_ID') # keep=False marks all duplicates as True
print("Duplicated rows:")
print("Duplicated rows:")
print(duplicated_rows_rec)

Duplicated rows:
                                 RECEIPT_ID PURCHASE_DATE  \
0      0000d256-4041-4a3e-adc4-5623fb6e0c99    2024-08-21   
41567  0000d256-4041-4a3e-adc4-5623fb6e0c99    2024-08-21   
1      0001455d-7a92-4a7b-a1d2-c747af1c8fd3    2024-07-20   
39291  0001455d-7a92-4a7b-a1d2-c747af1c8fd3    2024-07-20   
2      00017e0a-7851-42fb-bfab-0baa96e23586    2024-08-18   
...                                     ...           ...   
28152  fffbb112-3cc5-47c2-b014-08db2f87e0c7    2024-07-30   
24998  fffbfb2a-7c1f-41c9-a5da-628fa7fcc746    2024-07-28   
31602  fffbfb2a-7c1f-41c9-a5da-628fa7fcc746    2024-07-28   
25233  fffe8012-7dcf-4d84-b6c6-feaacab5074a    2024-09-07   
24999  fffe8012-7dcf-4d84-b6c6-feaacab5074a    2024-09-07   

                             SCAN_DATE STORE_NAME                   USER_ID  \
0     2024-08-21 14:19:06.539000+00:00    WALMART  63b73a7f3d310dceeabd4758   
41567 2024-08-21 14:19:06.539000+00:00    WALMART  63b73a7f3d310dceeabd4758   
1     2024-07

The duplicated rows shows there are different FINAL_SALE or FINAL_QUANTITY

### Data Quality & Issues
**1. Convert PURCHASE_DATE & SCAN_DATE to Datetime**
- The date is a ISO-8601 format, UTC date time.

In [89]:
Transactions['PURCHASE_DATE'] = pd.to_datetime(Transactions['PURCHASE_DATE'])
Transactions['SCAN_DATE'] = pd.to_datetime(Transactions['SCAN_DATE'])
Transactions.head()

Unnamed: 0,RECEIPT_ID,PURCHASE_DATE,SCAN_DATE,STORE_NAME,USER_ID,BARCODE,FINAL_QUANTITY,FINAL_SALE
0,0000d256-4041-4a3e-adc4-5623fb6e0c99,2024-08-21,2024-08-21 14:19:06.539000+00:00,WALMART,63b73a7f3d310dceeabd4758,15300010000.0,1.0,
1,0001455d-7a92-4a7b-a1d2-c747af1c8fd3,2024-07-20,2024-07-20 09:50:24.206000+00:00,ALDI,62c08877baa38d1a1f6c211a,,0.0,1.49
2,00017e0a-7851-42fb-bfab-0baa96e23586,2024-08-18,2024-08-19 15:38:56.813000+00:00,WALMART,60842f207ac8b7729e472020,78742230000.0,1.0,
3,000239aa-3478-453d-801e-66a82e39c8af,2024-06-18,2024-06-19 11:03:37.468000+00:00,FOOD LION,63fcd7cea4f8442c3386b589,783399700000.0,0.0,3.49
4,00026b4c-dfe8-49dd-b026-4c2f0fd5c6a1,2024-07-04,2024-07-05 15:56:43.549000+00:00,RANDALLS,6193231ae9b3d75037b0f928,47900500000.0,1.0,


**2. Check if all values in FINAL_QUANTITY and FINAL_SALE are numeric**

In [119]:
# FINAL_QUANTITY
non_numeric_quantity = pd.to_numeric(Transactions['FINAL_QUANTITY'], errors='coerce').isna()
non_numeric_quantity_rows = Transactions[non_numeric_quantity]
print(f"Rows with non-numeric FINAL_QUANTITY:\n{non_numeric_quantity_rows}")
print(non_numeric_quantity_rows.nunique())

#FINAL_SALE
non_numeric_sale = pd.to_numeric(Transactions['FINAL_SALE'], errors='coerce').isna()
non_numeric_sale_rows = Transactions[non_numeric_sale]
print(f"Rows with non-numeric FINAL_SALE:\n{non_numeric_sale_rows}")
print(non_numeric_sale_rows.nunique())

Rows with non-numeric FINAL_QUANTITY:
Empty DataFrame
Columns: [RECEIPT_ID, PURCHASE_DATE, SCAN_DATE, STORE_NAME, USER_ID, BARCODE, FINAL_QUANTITY, FINAL_SALE]
Index: []
RECEIPT_ID        0
PURCHASE_DATE     0
SCAN_DATE         0
STORE_NAME        0
USER_ID           0
BARCODE           0
FINAL_QUANTITY    0
FINAL_SALE        0
dtype: int64
Rows with non-numeric FINAL_SALE:
                                 RECEIPT_ID PURCHASE_DATE  \
0      0000d256-4041-4a3e-adc4-5623fb6e0c99    2024-08-21   
2      00017e0a-7851-42fb-bfab-0baa96e23586    2024-08-18   
4      00026b4c-dfe8-49dd-b026-4c2f0fd5c6a1    2024-07-04   
6      000550b2-1480-4c07-950f-ff601f242152    2024-07-06   
8      000e1d35-15e5-46c6-b6b3-33653ed3d27e    2024-08-13   
...                                     ...           ...   
24990  ffeb1ff4-0db9-4cb9-9574-20ec2db3e5ad    2024-08-25   
24992  ffec42ba-c71d-44da-b8d0-eb529632e87a    2024-06-17   
24994  fff15a3d-25ea-4c36-b84a-91eb4157daf9    2024-07-19   
24996  fff6c6

**For FINAL_QUANTITY, the only non-numeric value is the string 'zero' - to ensure future calculation would need to change str 'zero' to numeric 0.  
For FINAL_SALE, the only non-numeric value is blank.**

In [95]:
Transactions['FINAL_QUANTITY'] = Transactions['FINAL_QUANTITY'].apply(lambda x: 0 if x =='zero' else x)
Transactions.head()

Unnamed: 0,RECEIPT_ID,PURCHASE_DATE,SCAN_DATE,STORE_NAME,USER_ID,BARCODE,FINAL_QUANTITY,FINAL_SALE
0,0000d256-4041-4a3e-adc4-5623fb6e0c99,2024-08-21,2024-08-21 14:19:06.539000+00:00,WALMART,63b73a7f3d310dceeabd4758,15300010000.0,1.0,
1,0001455d-7a92-4a7b-a1d2-c747af1c8fd3,2024-07-20,2024-07-20 09:50:24.206000+00:00,ALDI,62c08877baa38d1a1f6c211a,,0.0,1.49
2,00017e0a-7851-42fb-bfab-0baa96e23586,2024-08-18,2024-08-19 15:38:56.813000+00:00,WALMART,60842f207ac8b7729e472020,78742230000.0,1.0,
3,000239aa-3478-453d-801e-66a82e39c8af,2024-06-18,2024-06-19 11:03:37.468000+00:00,FOOD LION,63fcd7cea4f8442c3386b589,783399700000.0,0.0,3.49
4,00026b4c-dfe8-49dd-b026-4c2f0fd5c6a1,2024-07-04,2024-07-05 15:56:43.549000+00:00,RANDALLS,6193231ae9b3d75037b0f928,47900500000.0,1.0,


**3.Convert scientific notion to string for Barcode**

In [104]:
Transactions['BARCODE'] = Transactions['BARCODE'].astype(str)
Transactions.head()

Unnamed: 0,RECEIPT_ID,PURCHASE_DATE,SCAN_DATE,STORE_NAME,USER_ID,BARCODE,FINAL_QUANTITY,FINAL_SALE
0,0000d256-4041-4a3e-adc4-5623fb6e0c99,2024-08-21,2024-08-21 14:19:06.539000+00:00,WALMART,63b73a7f3d310dceeabd4758,15300014978.0,1.0,
1,0001455d-7a92-4a7b-a1d2-c747af1c8fd3,2024-07-20,2024-07-20 09:50:24.206000+00:00,ALDI,62c08877baa38d1a1f6c211a,,0.0,1.49
2,00017e0a-7851-42fb-bfab-0baa96e23586,2024-08-18,2024-08-19 15:38:56.813000+00:00,WALMART,60842f207ac8b7729e472020,78742229751.0,1.0,
3,000239aa-3478-453d-801e-66a82e39c8af,2024-06-18,2024-06-19 11:03:37.468000+00:00,FOOD LION,63fcd7cea4f8442c3386b589,783399746536.0,0.0,3.49
4,00026b4c-dfe8-49dd-b026-4c2f0fd5c6a1,2024-07-04,2024-07-05 15:56:43.549000+00:00,RANDALLS,6193231ae9b3d75037b0f928,47900501183.0,1.0,


**4.Date Logic Inconsistencies**

In [138]:
invalid_dates = Transactions[Transactions['SCAN_DATE'] < Transactions['PURCHASE_DATE']]
print(invalid_dates)

                                 RECEIPT_ID PURCHASE_DATE  \
51     008c1dcc-0f96-4b04-98c8-2a2bb63ef89d    2024-07-21   
455    04a320ed-2903-45e5-8fd7-6eaf08daef32    2024-06-29   
494    05023b3d-5f83-47a7-a17c-8e8521d0bc94    2024-09-08   
675    06ce3da3-a588-4c37-93b4-0b6d11e42704    2024-06-22   
870    08d0e78f-3e63-40a3-8eb0-73fdf76da52c    2024-06-22   
...                                     ...           ...   
46034  08d0e78f-3e63-40a3-8eb0-73fdf76da52c    2024-06-22   
46539  718aa730-b62f-4e18-8dba-1d7105dac341    2024-09-05   
46941  af2b818f-4a92-4e98-958c-65f2ce0b271d    2024-06-15   
47653  72bb7b71-d958-4a46-ae62-43abdeb0e693    2024-06-15   
47837  99c2e8dc-9dc7-4267-9342-0b19c3fb35a0    2024-06-15   

                       SCAN_DATE            STORE_NAME  \
51     2024-07-20 19:54:23.133 Z               WALMART   
455    2024-06-28 11:03:31.783 Z  DOLLAR GENERAL STORE   
494    2024-09-07 22:22:29.903 Z             SHOP RITE   
675    2024-06-21 12:34:15.665 Z   

**5. Check for Categorical Data - STORE_NAME**

In [125]:
Transactions['STORE_NAME'] .unique()

array(['WALMART', 'ALDI', 'FOOD LION', 'RANDALLS', 'TARGET', 'COSTCO',
       'DOLLAR TREE STORES INC', 'FAMILY DOLLAR', 'KROGER', 'FOODS CO',
       'REAES STORE', 'IGA', 'DOLLAR GENERAL STORE', 'PUBLIX', "MACEY'S",
       'WALGREENS', 'CVS', 'WINCO FOODS', 'THE HOME DEPOT', 'FRED MEYER',
       'MARKET BASKET', '7-ELEVEN', 'HOBBY LOBBY', 'GIANT EAGLE',
       'HY-VEE', "SAM'S CLUB", 'WHISPERING PINES FRUIT FARMS',
       'JEWEL OSCO', 'SHELL', 'NELES GRANACOT', "TRADER JOE'S",
       'FIVE BELOW', 'GROCERY OUTLET BARGAIN MARKET', 'SMART SHOP',
       'TINKEN AFB COMMIE', 'ALBERTSONS', 'LIDL', 'PHARMACY', 'WEIS',
       'FASTRAC', 'RITE AID', 'MEIJER', 'COMMISSARY', 'FOODLAND',
       'PAL CAMPO RESTAURANT', "BJ'S WHOLESALE CLUB", 'CARRS',
       'SHOP RITE', 'PLAVERS CAFE LIDA', 'WINN-DIXIE', 'SAFEWAY',
       'MI TIENDA', 'CIRCLE K', 'NUTS FACTORY 74TH', 'AMAZON', "MARC'S",
       'WEGMANS', 'SCHNUCKS', 'FOOD DEPOT', 'BURGER KING', 'H-E-B',
       'PIONEER SUPERMARKETS', 'KING SOOPE

The quality for STORE_NAME looks good. 