# Prepare validation data

Load sales for January to March 2017 (PE, TRG).

In [1]:
import pandas as pd
import numpy as np
# Load the raw January-March 2017 sales extract.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv(index_col=0, parse_dates=True)` reproduces its defaults, so the
# frame has the same shape as before once the index is reset.
sales2017 = pd.read_csv('PE-TRG-Jan-Mar-2017.csv', index_col=0, parse_dates=True).reset_index()

# Remove duplicate rows by summing Quantity per Locationid/PLU/Year/Month/Day.
# The string alias 'sum' performs the same aggregation as np.sum but avoids the
# FutureWarning that recent pandas versions emit for NumPy callables.
sales2017 = pd.pivot_table(
    sales2017,
    values='Quantity',
    index=['Locationid', 'PLU', 'Year', 'Month', 'Day'],
    aggfunc='sum',
).reset_index()

Keep only March data for validation

In [2]:
# Boolean mask selecting the rows whose Year is 2017 and Month is 3 (March).
in_march_2017 = sales2017['Year'].eq(2017) & sales2017['Month'].eq(3)
sales_2017_march = sales2017.loc[in_march_2017]

## Raw data properties

In [3]:
# One row per distinct (Locationid, PLU) pair; 'counts' is the number of
# rows that pair has in the March data.
positions = (
    sales_2017_march
    .groupby(['Locationid', 'PLU'])
    .size()
    .rename('counts')
    .reset_index()
)

# Headline cardinalities: distinct pairs, distinct locations, distinct PLUs.
positions_count = len(positions)
locations_count = len(positions['Locationid'].drop_duplicates())
products_count = len(positions['PLU'].drop_duplicates())

def setCountGroup(row):
    """Return the segment label for a row, based on its 'counts' value.

    Args:
        row: Any mapping-like object that supports ``row['counts']``
            (for example a DataFrame row passed by ``apply(..., axis=1)``).

    Returns:
        One of four label strings:
        "1. less than 10"  when counts < 10,
        "2. 20 or less"    when 10 <= counts <= 20,
        "3. 30 or less"    when 20 < counts <= 30,
        "4. more than 30"  otherwise.
    """
    count = row['counts']
    # The thresholds are checked in ascending order, so each guard only needs
    # to test its own upper bound.
    if count < 10:
        return "1. less than 10"
    if count <= 20:
        return "2. 20 or less"
    if count <= 30:
        return "3. 30 or less"
    return "4. more than 30"
    
# Label every Location-PLU position with its count segment. `assign` returns a
# new frame, so `positions` itself is left unmodified (a plain
# `counts_groups = positions` would only alias it, and adding the column would
# then change `positions` as well).
positions_segmented = positions.assign(segment=positions.apply(setCountGroup, axis=1))

# Number of positions in each segment, plus that number as a percentage of all
# positions. The percentage is computed column-wise instead of row by row.
counts_groups = positions_segmented.groupby('segment').size().rename('counts').reset_index()
counts_groups['%'] = counts_groups['counts'] / positions_count * 100

# Row counts for the March data, split by the sign of Quantity. Summing a
# boolean mask counts its True entries without materialising a filtered frame.
quantity = sales_2017_march['Quantity']
rows_in_march = len(sales_2017_march.index)
rows_with_negative = int((quantity < 0).sum())
rows_with_zero = int((quantity == 0).sum())
rows_with_positive = int((quantity > 0).sum())

# ==== output results ====

# Headline counts: each template takes a single integer.
for template, value in (
    ("Total unique positions (Location-Recipe) %d", positions_count),
    ("Total locations %d", locations_count),
    ("Total recipes %d", products_count),
    ("Data rows %d", rows_in_march),
):
    print(template % value)

# Rows by sign of Quantity: percentage of all March rows, then the raw count.
for template, row_count in (
    ("Negative data %.2f%% (%d)", rows_with_negative),
    ("Zero data %.2f%% (%d)", rows_with_zero),
    ("Positive data %.2f%% (%d)", rows_with_positive),
):
    print(template % (row_count / rows_in_march * 100, row_count))

# Last expression of the cell: rendered by the notebook as a table.
counts_groups

Total unique positions (Location-Recipe) 431646
Total locations 515
Total recipes 5213
Data rows 4735802
Negative data 2.85% (134970)
Zero data 1.46% (69156)
Positive data 95.69% (4531676)


Unnamed: 0,segment,counts,%
0,1. less than 10,239403,55.46281
1,2. 20 or less,101377,23.486144
2,3. 30 or less,76369,17.692507
3,4. more than 30,14497,3.358539


## Save to CSV

In [4]:
# Write the March validation rows to disk; the frame's index is written as the
# first column (to_csv default).
march_output_path = 'sales-2017-march.csv'
sales_2017_march.to_csv(march_output_path)