In [4]:
# 1 - data cleaning

# Import necessary libraries
import os
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [5]:
# Read the raw cafe-sales CSV into `df`.
# The location can be overridden with the CAFE_SALES_CSV environment variable so
# the notebook can run on machines other than the author's; when the variable
# is unset, the original absolute path is used unchanged.
CAFE_SALES_CSV = os.environ.get(
    "CAFE_SALES_CSV",
    "/Users/thomassimmons/c/td/data/dirty_cafe_sales.csv",
)
df = pd.read_csv(CAFE_SALES_CSV)

In [6]:
# Preview the first five rows to see the columns and the kind of values they hold
df.head(n=5)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [7]:
# Row/column count of the raw data: 10000 rows to start with; the cleaning
# steps below narrow this down to less than half the size
df.shape

(10000, 8)

In [8]:
# Count missing (NaN) entries per column in the raw data.
# Only real NaN values are counted here; placeholder strings such as the
# 'ERROR' / 'UNKNOWN' visible in the preview above are not.
df.isna().sum()

Transaction ID         0
Item                 333
Quantity             138
Price Per Unit       179
Total Spent          173
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64

In [9]:
# Inspect column dtypes: every column was read as object, so the numeric and
# date columns need converting before analysis
df.dtypes

Transaction ID      object
Item                object
Quantity            object
Price Per Unit      object
Total Spent         object
Payment Method      object
Location            object
Transaction Date    object
dtype: object

In [10]:
# Cast the numeric and date columns to proper dtypes.
# errors='coerce' turns anything unparseable into NaN / NaT instead of raising,
# so those entries show up as nulls afterwards.
numeric_columns = ['Quantity', 'Price Per Unit', 'Total Spent']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'], errors='coerce')

In [11]:
# Confirm the conversion: Quantity, Price Per Unit and Total Spent should now be
# float64 and Transaction Date datetime64[ns]
df.dtypes

Transaction ID              object
Item                        object
Quantity                   float64
Price Per Unit             float64
Total Spent                float64
Payment Method              object
Location                    object
Transaction Date    datetime64[ns]
dtype: object

In [12]:
# Drop every row that is missing a value in any of the analysis columns
# (all columns except Transaction ID)
required_columns = [
    'Item', 'Quantity', 'Price Per Unit', 'Total Spent',
    'Payment Method', 'Location', 'Transaction Date',
]
df = df.dropna(subset=required_columns)

In [13]:
# Verify the drop: every column should now report zero nulls
df.isna().sum()

Transaction ID      0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

In [14]:
# 'UNKNOWN' is another way missing data is recorded in the categorical columns;
# keep only the rows where none of them carries that placeholder
is_known = (
    (df['Location'] != 'UNKNOWN')
    & (df['Item'] != 'UNKNOWN')
    & (df['Payment Method'] != 'UNKNOWN')
)
df = df[is_known]

In [15]:
# Re-check after the 'UNKNOWN' filter: still no nulls in any column
df.isna().sum()

Transaction ID      0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

In [16]:
# Shape after dropping the null rows and the 'UNKNOWN' rows
df.shape

(3522, 8)

In [17]:
# Frequency of each Item value (ten most common); this is where the 'ERROR'
# placeholder shows up as an item value
item_counts = df['Item'].value_counts()
item_counts.head(10)

Item
Salad       463
Juice       460
Cake        433
Cookie      430
Sandwich    428
Tea         413
Coffee      394
Smoothie    391
ERROR       110
Name: count, dtype: int64

In [18]:
# Remove rows carrying the 'ERROR' placeholder in any categorical column.
# Item is where it was spotted; Payment Method and Location are checked as
# well, mirroring the 'UNKNOWN' filter, so a placeholder in those columns
# cannot slip through the cleaning (a no-op if they contain none).
for column in ['Item', 'Payment Method', 'Location']:
    df = df[df[column] != 'ERROR']