## Fetch the dataset

In [4]:
import pandas as pd
# Load the cleaned coffee-shop sales export into the notebook-wide frame `df`.
df = pd.read_csv("clean_coffee_shop_sales.csv")

# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,store_id,store_location,product_id,unit_price,product_category,product_type,product_detail
0,0,1,2023-01-01,07:06:11,2,5,Lower Manhattan,32,3.0,Coffee,Gourmet brewed coffee,Ethiopia Rg
1,1,2,2023-01-01,07:08:56,2,5,Lower Manhattan,57,3.1,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg
2,2,3,2023-01-01,07:14:04,2,5,Lower Manhattan,59,4.5,Drinking Chocolate,Hot chocolate,Dark chocolate Lg
3,3,4,2023-01-01,07:20:24,1,5,Lower Manhattan,22,2.0,Coffee,Drip coffee,Our Old Time Diner Blend Sm
4,4,5,2023-01-01,07:22:41,2,5,Lower Manhattan,57,3.1,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg


In [5]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
# At this point the date and time columns are still plain `object` columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149116 entries, 0 to 149115
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        149116 non-null  int64  
 1   transaction_id    149116 non-null  int64  
 2   transaction_date  149116 non-null  object 
 3   transaction_time  149116 non-null  object 
 4   transaction_qty   149116 non-null  int64  
 5   store_id          149116 non-null  int64  
 6   store_location    149116 non-null  object 
 7   product_id        149116 non-null  int64  
 8   unit_price        149116 non-null  float64
 9   product_category  149116 non-null  object 
 10  product_type      149116 non-null  object 
 11  product_detail    149116 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 13.7+ MB


In [6]:
# Parse the transaction dates into proper datetime64 values.
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Parse the "HH:MM:SS" strings and keep only the time-of-day part.
# The column holds time objects afterwards, so it still reports dtype `object`.
df['transaction_time'] = (
    pd.to_datetime(df['transaction_time'], format="%H:%M:%S")
    .dt.time
)

# Re-check the dtypes after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149116 entries, 0 to 149115
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Unnamed: 0        149116 non-null  int64         
 1   transaction_id    149116 non-null  int64         
 2   transaction_date  149116 non-null  datetime64[ns]
 3   transaction_time  149116 non-null  object        
 4   transaction_qty   149116 non-null  int64         
 5   store_id          149116 non-null  int64         
 6   store_location    149116 non-null  object        
 7   product_id        149116 non-null  int64         
 8   unit_price        149116 non-null  float64       
 9   product_category  149116 non-null  object        
 10  product_type      149116 non-null  object        
 11  product_detail    149116 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(5), object(5)
memory usage: 13.7+ MB


In [10]:
# Tag every transaction with its calendar month as a "YYYY-MM" label.
df['month_year'] = df['transaction_date'].dt.strftime('%Y-%m')

# Collect the distinct month labels that occur in the data and display them.
unique_month_years = pd.unique(df['month_year'])
unique_month_years

array(['2023-01', '2023-02', '2023-03', '2023-04', '2023-05', '2023-06'],
      dtype=object)

In [12]:
# Establish the date range covered by the dataset (earliest and latest transaction date).
oldest_datetime, newest_datetime = (
    df['transaction_date'].min(),
    df['transaction_date'].max(),
)

print("Oldest datetime:", oldest_datetime)
print("Newest datetime:", newest_datetime)

Oldest datetime: 2023-01-01 00:00:00
Newest datetime: 2023-06-30 00:00:00


## Calculate the total sales for each respective month.

In [28]:
# Total revenue (unit_price * transaction_qty) for one calendar month.
def total_monthly_sales(month, year, data=None):
    """Return the total sales value for the given month and year.

    Sales value is the sum of ``unit_price * transaction_qty`` over every row
    whose ``transaction_date`` falls in the requested month.

    Parameters
    ----------
    month : int
        Calendar month number (1-12). Anything ``int()`` accepts is coerced.
    year : int
        Four-digit year. Anything ``int()`` accepts is coerced, so a string
        such as ``"2023"`` keeps working.
    data : pandas.DataFrame, optional
        Frame with a datetime ``transaction_date`` column plus ``unit_price``
        and ``transaction_qty`` columns. When omitted, the notebook-level
        ``df`` is used, which is what the original two-argument call did.

    Returns
    -------
    float or None
        The summed sales (``0.0`` when no row matches), or ``None`` after
        printing a message when the inputs cannot be processed (missing
        column, non-datetime date column, non-numeric month/year).
    """
    try:
        # Fall back to the notebook-wide frame so existing calls are unchanged.
        sales = df if data is None else data
        month, year = int(month), int(year)

        # Compare the integer month/year components directly rather than
        # formatting every date in the column to a string on each call.
        dates = sales['transaction_date']
        in_period = (dates.dt.month == month) & (dates.dt.year == year)
        period_rows = sales.loc[in_period]

        return (period_rows['unit_price'] * period_rows['transaction_qty']).sum()
    except (KeyError, AttributeError, TypeError, ValueError) as e:
        # Only data/argument problems are reported here; anything else
        # (for example `df` not being defined yet) is allowed to surface.
        print(f"An error occurred: {e}")
        return None

In [29]:
# Example: total sales for May 2023.
total_monthly_sales(month=5, year=2023)

156727.75999999998

## Determine the month-on-month % increase or decrease in sales.

In [None]:
# create a function for % change in monthly sales
def percent_change_monthly (month,year)
    