In [19]:
# 1 - data cleaning

# Import necessary libraries
import os
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [20]:
# Read the raw cafe-sales CSV into a DataFrame.
# The file location can be overridden with the CAFE_SALES_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original absolute path is used as the fallback.
csv_path = os.environ.get(
    "CAFE_SALES_CSV",
    "/Users/thomassimmons/c/td/data/dirty_cafe_sales.csv",
)
df = pd.read_csv(csv_path)

In [21]:
# Preview the first rows (head() defaults to 5) to see the columns and the
# kind of values they hold
df.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [22]:
# Size of the raw dataset as (rows, columns): 10000 rows to start with.
# The cleaning steps below narrow it down to less than half that size.
df.shape

(10000, 8)

In [23]:
# Count the missing (NaN) entries in each column of the raw data
df.isna().sum()

Transaction ID         0
Item                 333
Quantity             138
Price Per Unit       179
Total Spent          173
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [24]:
# Every column was read with object dtype; the date and numeric columns
# need to be converted before they can be analysed
df.dtypes

Transaction ID      object
Item                object
Quantity            object
Price Per Unit      object
Total Spent         object
Payment Method      object
Location            object
Transaction Date    object
dtype: object

In [25]:
# Cast the date column and the numeric columns to proper dtypes.
# errors='coerce' turns any entry that cannot be parsed into NaT / NaN
# instead of raising.
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'], errors='coerce')
for column in ('Quantity', 'Price Per Unit', 'Total Spent'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [26]:
# Verify the conversion: the three numeric columns should now be float64 and
# Transaction Date should be datetime64[ns]
df.dtypes

Transaction ID              object
Item                        object
Quantity                   float64
Price Per Unit             float64
Total Spent                float64
Payment Method              object
Location                    object
Transaction Date    datetime64[ns]
dtype: object

In [27]:
# Discard every row that is missing a value in any of these columns
required_columns = [
    'Total Spent',
    'Payment Method',
    'Location',
    'Quantity',
    'Item',
    'Price Per Unit',
    'Transaction Date',
]
df = df.dropna(subset=required_columns, how='any')

In [28]:
# Re-count missing entries per column after dropping the incomplete rows
df.isna().sum()

Transaction ID      0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

In [29]:
# 'UNKNOWN' is treated here as another form of missing value: drop every row
# that carries it in Location, Item or Payment Method
has_unknown = df[['Location', 'Item', 'Payment Method']].eq('UNKNOWN').any(axis=1)
df = df[~has_unknown]

In [30]:
# Confirm that no column contains missing values any more
df.isna().sum()

Transaction ID      0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

In [31]:
# Shape (rows, columns) after dropping the null and 'UNKNOWN' rows
df.shape

(3522, 8)

In [32]:
# Row count for the ten most frequent Item values; this is where 'ERROR'
# shows up as a value of the Item column
df['Item'].value_counts().head(10)

Item
Salad       463
Juice       460
Cake        433
Cookie      430
Sandwich    428
Tea         413
Coffee      394
Smoothie    391
ERROR       110
Name: count, dtype: int64

In [34]:
# Keep only the rows whose Item is not the 'ERROR' placeholder
df = df.loc[df['Item'].ne('ERROR')]

In [36]:
# Start of the final per-column value-count checks: confirm 'ERROR' no longer
# appears among the Item values
df['Item'].value_counts()

Item
Salad       463
Juice       460
Cake        433
Cookie      430
Sandwich    428
Tea         413
Coffee      394
Smoothie    391
Name: count, dtype: int64

In [39]:
# Row count for each remaining Location value.
# NOTE(review): the saved counts below sum to 3215, yet the preceding visible
# cells leave 3522 - 110 = 3412 rows. This output appears to come from a kernel
# state the visible cells do not reproduce - confirm with Restart & Run All.
df['Location'].value_counts()

Location
In-store    1614
Takeaway    1601
Name: count, dtype: int64

In [43]:
# Drop rows where Location or Payment Method holds the 'ERROR' placeholder
keep_rows = df['Location'].ne('ERROR') & df['Payment Method'].ne('ERROR')
df = df[keep_rows]

In [44]:
# Row count for each remaining Payment Method value
df['Payment Method'].value_counts()

Payment Method
Digital Wallet    1069
Cash              1018
Credit Card       1002
Name: count, dtype: int64

In [45]:
# Share of rows at each unit price (normalize=True returns proportions
# instead of raw counts)
df['Price Per Unit'].value_counts(normalize=True)

Price Per Unit
3.0    0.262868
4.0    0.235999
5.0    0.135319
1.0    0.126578
1.5    0.120427
2.0    0.118809
Name: proportion, dtype: float64

In [46]:
# Share of rows for each Quantity value, as proportions
df['Quantity'].value_counts(normalize=True)

Quantity
5.0    0.206863
2.0    0.203302
4.0    0.202978
1.0    0.194885
3.0    0.191972
Name: proportion, dtype: float64

In [47]:
# Share of rows for each Total Spent value, as proportions
df['Total Spent'].value_counts(normalize=True)

Total Spent
6.0     0.106507
12.0    0.102622
4.0     0.094853
3.0     0.094205
20.0    0.077695
15.0    0.074134
8.0     0.067659
10.0    0.056005
2.0     0.052120
5.0     0.050178
9.0     0.049531
16.0    0.046293
25.0    0.029136
7.5     0.025898
1.0     0.025575
4.5     0.025251
1.5     0.022337
Name: proportion, dtype: float64

In [49]:
# Number of rows recorded for each Transaction Date value
df['Transaction Date'].value_counts()

Transaction Date
2023-03-13    20
2023-01-05    19
2023-11-06    16
2023-09-13    16
2023-06-30    15
              ..
2023-05-12     3
2023-05-13     2
2023-12-13     2
2023-04-27     1
2023-08-08     1
Name: count, Length: 365, dtype: int64

In [50]:
# Row count per Transaction ID; counts are sorted in descending order, so a
# top count of 1 means no ID is duplicated
df['Transaction ID'].value_counts()

Transaction ID
TXN_1961373    1
TXN_7971660    1
TXN_1234390    1
TXN_6271633    1
TXN_4805204    1
              ..
TXN_7766134    1
TXN_2617257    1
TXN_6179169    1
TXN_3364751    1
TXN_6170729    1
Name: count, Length: 3089, dtype: int64

In [51]:
# Derive a monthly period (e.g. 2023-09) from each transaction date.
# df was produced by boolean filtering, so build the column with assign(),
# which returns a new frame, rather than setting it in place on what pandas
# may regard as a slice of an earlier frame (SettingWithCopyWarning).
# This also keeps the cell safe to re-run.
df = df.assign(Month=df['Transaction Date'].dt.to_period('M'))

In [53]:
# Peek at the first ten values of the derived Month column
df['Month'].head(10)

0     2023-09
1     2023-05
4     2023-06
10    2023-11
12    2023-05
15    2023-11
17    2023-02
19    2023-01
21    2023-03
22    2023-12
Name: Month, dtype: period[M]

In [None]:
# Count, per column, how many cells still equal the 'ERROR' placeholder
df.eq('ERROR').sum()

Transaction ID      0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
Month               0
dtype: int64

In [None]:
# Final cleaned dataset: shape as (rows, columns), including the added
# Month column
df.shape

(3089, 9)