In [1]:
import tensorflow as tf
import numpy as np 
import pandas as pd

# Read the two raw tables from their CSV files.
sales_df = pd.read_csv('data/sales_data.csv')
customer_df = pd.read_csv('data/customer_data.csv')

# Join sales onto customers via the shared customer_id column
# (DataFrame.merge uses how='inner' unless told otherwise).
merged_df = customer_df.merge(sales_df, on='customer_id')


In [2]:
# Overview of the merged customer/sales table: column dtypes, the distinct
# values found in each column, and the overall table shape.
print("Customer and sales dataset: ")
print("Features and their domains:")
print(merged_df.dtypes)

print("\nUnique values per feature:")
for column in merged_df.columns:
    # Missing entries are dropped so NaN is not listed as a "value".
    unique_values = merged_df[column].dropna().unique()
    print(f"{column}: {unique_values}")

# Number of rows.
# Computed once, outside the loop: previously this assignment sat inside the
# loop body, so it was redone for every column and never ran at all for a
# frame without columns (leaving num_rows undefined for the print below).
num_rows = merged_df.shape[0]
print("\nNumber of rows:", num_rows)

# Number of features
num_features = merged_df.shape[1]
print("Number of features:", num_features)

Customer and sales dataset: 
Features and their domains:
customer_id        object
gender             object
age               float64
payment_method     object
invoice_no         object
category           object
quantity            int64
price             float64
invoice_date       object
shopping_mall      object
dtype: object

Unique values per feature:
customer_id: ['C241288' 'C111565' 'C266599' ... 'C103292' 'C800631' 'C273973']
gender: ['Female' 'Male']
age: [28. 21. 20. 66. 53. 49. 32. 69. 60. 36. 29. 67. 25. 24. 65. 42. 46. 23.
 27. 52. 44. 51. 50. 68. 43. 59. 54. 48. 40. 41. 19. 18. 22. 61. 45. 64.
 33. 63. 34. 47. 38. 57. 30. 26. 62. 39. 55. 56. 35. 31. 37. 58.]
payment_method: ['Credit Card' 'Debit Card' 'Cash']
invoice_no: ['I138884' 'I317333' 'I127801' ... 'I824010' 'I702964' 'I232867']
category: ['Clothing' 'Shoes' 'Books' 'Cosmetics' 'Food & Beverage' 'Toys'
 'Technology' 'Souvenir']
quantity: [5 3 1 4 2]
price: [1.50040e+03 1.80051e+03 3.00080e+02 3.00085e+03 6.06000e+0

In [3]:
# Count every missing cell in the merged table: flag the missing cells,
# add them up per column, then add the per-column totals together.
total_missing = (
    merged_df
    .isna()
    .sum()
    .sum()
)
print("\nTotal missing values in the customer data:", total_missing)


Total missing values in the customer data: 119


In [4]:
# Distinct product categories (missing values ignored).
unique_cats = merged_df['category'].dropna().unique()
print("unique categories:")
print(unique_cats)

# Most frequently occurring category; mode() returns every tied value,
# hence the list in the message.
mode = merged_df['category'].mode()
print(f"Most common category: {list(mode)}")

# Largest quantity recorded on a single row, and the category of that row.
# idxmax() gives the row *label* of the first maximum. The quantity value
# itself must not be used as a .loc row label: that would fetch whichever
# row happens to carry that label, not the row holding the maximum.
highest_quantity = merged_df['quantity'].max()
highest_quantity_row = merged_df['quantity'].idxmax()
highest_quantity_cat = merged_df.loc[highest_quantity_row, 'category']
print(f"largest quantity of items bought: {highest_quantity} in category {highest_quantity_cat}")


unique categories:
['Clothing' 'Shoes' 'Books' 'Cosmetics' 'Food & Beverage' 'Toys'
 'Technology' 'Souvenir']
Most common category: ['Clothing']
largest quantity of items bought: 5 in category Clothing


In [5]:
# Gender that appears on the most purchase rows (all tied values are listed).
mode_gender = merged_df['gender'].mode()
print(f"{list(mode_gender)} buys more")

# Number of purchase rows for every (gender, category) pair.
item_counts = (
    merged_df
    .groupby(['gender', 'category'])
    .size()
    .reset_index(name='Count')
)

print("top 3 categories by gender:")

# Order by gender (A-Z) and, within a gender, by descending count ...
ranked_counts = item_counts.sort_values(['gender', 'Count'], ascending=[True, False])
# ... so the first three rows of each gender group are its top categories.
top_items_by_gender = ranked_counts.groupby('gender').head(3)

print(top_items_by_gender)

['Female'] buys more
top 3 categories by gender:
    gender         category  Count
1   Female         Clothing  20652
2   Female        Cosmetics   9070
3   Female  Food & Beverage   8804
9     Male         Clothing  13835
10    Male        Cosmetics   6027
11    Male  Food & Beverage   5972


In [6]:
# Payment method appearing on the most rows. mode() may return several
# tied values; label 0 selects the first one it reports.
payment_mode = merged_df['payment_method'].mode().loc[0]
print(f"The most used means of purchase is: {payment_mode}")

The most used means of purchase is: Cash


In [7]:
# Busiest calendar month.
# Parse the day-month-year invoice strings into datetimes; the column is
# overwritten on merged_df itself.
merged_df['invoice_date'] = pd.to_datetime(merged_df['invoice_date'], format='%d-%m-%Y')

# Month number of each invoice. Only the month is kept, so the same month
# in different years is counted together.
merged_df['month'] = merged_df['invoice_date'].dt.month

# Purchases per month number, ordered by month.
monthly_purchases = merged_df['month'].value_counts().sort_index()

# Month with the highest count, and that count.
most_purchases_month, most_purchases_count = (
    monthly_purchases.idxmax(),
    monthly_purchases.max(),
)

print(f"The month with the most purchases is: {most_purchases_month} with {most_purchases_count} purchases")

The month with the most purchases is: 1 with 11608 purchases


In [10]:
# Width, in dollars, of each price bin.
PRICE_BIN_WIDTH = 500


def _summarize_price_bins(frame, bin_width=PRICE_BIN_WIDTH):
    """Bucket ``frame['price']`` into fixed-width bins and total each bucket.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing a numeric ``price`` column.
    bin_width : int, optional
        Width of every bin; the edges are 0, bin_width, 2 * bin_width, ...

    Returns
    -------
    tuple
        ``(labels, grouped)``: ``labels`` is the ``pd.cut`` result holding the
        bin of each row, and ``grouped`` is a DataFrame with columns
        ``price`` (the bin), ``sum`` and ``count``, one row per non-empty bin.

    Raises
    ------
    ValueError
        If ``frame`` has no non-missing price to derive the bin edges from.
    """
    prices = frame['price']
    if prices.dropna().empty:
        raise ValueError("cannot bin prices: no non-missing 'price' values")

    # Stopping one full width past the truncated maximum guarantees that the
    # last edge is at or above the highest price.
    edges = range(0, int(prices.max()) + bin_width, bin_width)
    labels = pd.cut(prices, bins=edges)

    # observed=True keeps only the bins that actually contain rows.
    grouped = (
        frame.groupby(labels, observed=True)['price']
        .agg(['sum', 'count'])
        .reset_index()
    )
    return labels, grouped


# Price bins over the whole data set.
bins, bin_grouped = _summarize_price_bins(merged_df)

# Bin with the highest total spend, and bin with the most purchases.
highest_purchase_bin = bin_grouped.loc[bin_grouped['sum'].idxmax()]
most_purchases_bin = bin_grouped.loc[bin_grouped['count'].idxmax()]

# Same summary restricted to January invoices. Requires invoice_date to
# already be a datetime column, because it goes through the .dt accessor.
january_data = merged_df[merged_df['invoice_date'].dt.month == 1]
january_bins, january_grouped = _summarize_price_bins(january_data)
jan_most_purchases = january_grouped.loc[january_grouped['count'].idxmax()]

# The dollar total is rendered with two decimals so float rounding noise
# (e.g. ...801.100000001) does not leak into the report.
print(f"Bin with the highest total purchase: {highest_purchase_bin['price']} with total purchase of ${highest_purchase_bin['sum']:,.2f}")
print(f"Bin with the most purchases: {most_purchases_bin['price']} with {most_purchases_bin['count']} purchases")
print(f"Bin with Most Purchases in January: {jan_most_purchases['price']} with {jan_most_purchases['count']} purchases")

Bin with the highest total purchase: (1500, 2000] with total purchase of $14031801.100000001
Bin with the most purchases: (0, 500] with 56781 purchases
Bin with Most Purchases in January: (0, 500] with 6664 purchases


In [11]:
#Image Classification with keras
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt



