# Cleaned Sales Preview

Quick validation notebook for the `data/processed/synosales_cleaned.parquet` output. Once the structure is confirmed, lift the aggregation logic into a production module under `src/` as part of the export-to-module plan.

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Location of the cleaned dataset, relative to the project root.
relative_data_path = Path("data/processed/synosales_cleaned.parquet")

# Start from the kernel's working directory; when that is the `notebook/`
# folder, the project root is expected to be its parent.
start_dir = Path().resolve()
if start_dir.name == "notebook":
    start_dir = start_dir.parent

# Prefer `start_dir` itself, then walk up through its ancestors until one of
# them contains the dataset. This keeps the cell working when the kernel is
# launched from a differently named or more deeply nested folder. If no
# candidate contains the file, fall back to `start_dir` so `pd.read_parquet`
# is called with the same path the simple check would have produced.
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / relative_data_path).exists()
    ),
    start_dir,
)

data_path = project_root / relative_data_path
df_cleaned = pd.read_parquet(data_path)
df_cleaned.head()

Unnamed: 0,PI,Customer,ItemCode,Product,Currency,Price,Quantity,Discount,T/T Discount,Total,...,source_sheet,Type,sub_cat,exchange_rate_to_usd,usd_adjusted_price,usd_adjusted_total,Year,Capacity,Unit,total_cap
0,2364503.0,C2,,C2 Storage Advanced,EUR,104.69,1,0.0,0.0,104.69,...,2023-C2,C2-HYBRID,C2-STORAGE-UNKNOWN-ADVANCED,1.05,109.9245,109.9245,2023,,,0
1,2364504.0,C2,,C2 Storage Basic,EUR,35.33,1,0.0,0.0,35.33,...,2023-C2,C2-HYBRID,C2-STORAGE-UNKNOWN-BASIC,1.05,37.0965,37.0965,2023,,,0
2,2364505.0,C2,,C2 Storage Basic,USD,1.96,1,0.0,0.0,1.96,...,2023-C2,C2-HYBRID,C2-STORAGE-UNKNOWN-BASIC,1.0,1.96,1.96,2023,,,0
3,2364506.0,C2,,Active Insight Premium Early Bird,EUR,12.53,1,0.0,0.0,12.53,...,2023-C2,C2-HYBRID,Other,1.05,13.1565,13.1565,2023,,,0
4,2364507.0,C2,,C2 Storage Advanced,EUR,1246.98,1,0.0,0.0,1246.98,...,2023-C2,C2-HYBRID,C2-STORAGE-UNKNOWN-ADVANCED,1.05,1309.329,1309.329,2023,,,0


In [3]:
# Column labels of the loaded frame as a plain Python list.
list(df_cleaned.columns)

['PI',
 'Customer',
 'ItemCode',
 'Product',
 'Currency',
 'Price',
 'Quantity',
 'Discount',
 'T/T Discount',
 'Total',
 'ShipDate',
 'InvDate',
 'DeliveryFrom',
 'Destination',
 'Comments',
 'Country',
 'ShipTo',
 'source_sheet',
 'Type',
 'sub_cat',
 'exchange_rate_to_usd',
 'usd_adjusted_price',
 'usd_adjusted_total',
 'Year',
 'Capacity',
 'Unit',
 'total_cap']

In [4]:
# Summary statistics computed over all columns of the frame.
summary_stats = df_cleaned.describe(include="all")
summary_stats

Unnamed: 0,PI,Customer,ItemCode,Product,Currency,Price,Quantity,Discount,T/T Discount,Total,...,source_sheet,Type,sub_cat,exchange_rate_to_usd,usd_adjusted_price,usd_adjusted_total,Year,Capacity,Unit,total_cap
count,205705.0,205708,174798,205708,205259,205708.0,205708.0,205708.0,205708.0,205708.0,...,205708.0,205708,205708,192355.0,192355.0,192355.0,205708.0,30794.0,30794,205708.0
unique,,197,2531,1609,6,,,,,,...,4.0,10,61,,,,,,1,
top,,C2,13-08RKS2202,C2 Storage Advanced,EUR,,,,,,...,2024.0,SERVER,Other,,,,,,GB,
freq,,30910,4820,6660,107644,,,,,,...,91386.0,78389,31451,,,,,,30794,
mean,608597.3,,,,,2028.878,31.515469,0.009982,0.553372,17761.85,...,,,,1.024808,829.0111,6992.862,2023.520515,7010.469377,,56414.48
std,748459.0,,,,,33811.88,426.518178,0.266056,1.045254,111839.5,...,,,,0.045569,7327.454,25520.06,0.49958,5969.055162,,454694.4
min,244070.0,,,,,0.0,1.0,-0.02,0.0,0.0,...,,,,0.0067,0.01,0.1,2023.0,2.0,,0.0
25%,283924.0,,,,,108.24,1.0,0.0,0.0,368.0,...,,,,1.0,102.69,320.0303,2023.0,960.0,,0.0
50%,298794.0,,,,,264.0,4.0,0.0,0.0,1867.205,...,,,,1.05,257.5965,1689.208,2024.0,6000.0,,0.0
75%,312414.0,,,,,835.71,16.0,0.0,1.0,6184.25,...,,,,1.05,617.652,5320.0,2024.0,12000.0,,0.0


In [7]:
# Print the per-column non-null counts and dtypes to check the loaded schema.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205708 entries, 0 to 205707
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   PI                    205705 non-null  float64
 1   Customer              205708 non-null  object 
 2   ItemCode              174798 non-null  object 
 3   Product               205708 non-null  object 
 4   Currency              205259 non-null  object 
 5   Price                 205708 non-null  float64
 6   Quantity              205708 non-null  int64  
 7   Discount              205708 non-null  float64
 8   T/T Discount          205708 non-null  float64
 9   Total                 205708 non-null  float64
 10  ShipDate              205708 non-null  object 
 11  InvDate               205708 non-null  object 
 12  DeliveryFrom          205708 non-null  object 
 13  Destination           205657 non-null  object 
 14  Comments              5156 non-null    object 
 15  