# Analysis of Transaction Log presented to us by The Math Company

In [1]:
import pandas as pd
import seaborn as sn
import pandas_profiling as pp
import numpy as np

In [2]:
# Load the three raw CSV extracts (products, transactions, stores) from the
# working directory, in that order.
product, transact, store = (
    pd.read_csv(csv_path)
    for csv_path in (
        "product_dataset.csv",
        "transaction_dataset.csv",
        "store_dataset.csv",
    )
)

In [3]:
# Write one HTML profiling report per raw data set into the working directory.
# NOTE(review): `outputfile` is the keyword used by older pandas_profiling
# releases; later releases appear to name it `output_file` — confirm against
# the installed version before re-running.
pp.ProfileReport(product).to_file(outputfile="./product_profile.html")
pp.ProfileReport(store).to_file(outputfile="./store_profile.html")
pp.ProfileReport(transact).to_file(outputfile="./transact_profile.html")

### Clean product data

#### Based on the product data set profile we identify several features which can be cleaned
-  DBSKU has missing values in 7.2% of rows (n=2110); these rows need to be removed.
-  duplicated department/department name columns
-  duplicated class/class name columns
-  class value of 99/class name of 6, which seems to be a test class

In [18]:
product.nunique()

DBSKU              24292
DEPARTMENT             2
CLASS                  6
SUBCLASS              15
DEPARTMENT_NAME        2
CLASS_NAME             6
SUBCLASS_NAME          8
dtype: int64

In [19]:
product.shape

(29342, 7)

In [20]:
# Discard every row that has a missing value in any column, then remove the
# DEPARTMENT_NAME and CLASS_NAME columns, which duplicate DEPARTMENT and CLASS.
product1 = product.dropna().drop(columns=["DEPARTMENT_NAME", "CLASS_NAME"])

In [21]:
product1.nunique()

DBSKU            24292
DEPARTMENT           2
CLASS                6
SUBCLASS            15
SUBCLASS_NAME        8
dtype: int64

In [22]:
product1.shape

(27232, 5)

In [23]:
product1.head()

Unnamed: 0,DBSKU,DEPARTMENT,CLASS,SUBCLASS,SUBCLASS_NAME
0,2182204.0,12,3,32,1
1,2860882.0,12,3,31,2
2,2695858.0,12,5,50,3
3,675793.0,10,4,41,2
4,2864173.0,12,4,40,4


#### We can drop class 99, seeing that this is specified as a test case.
#### We will also introduce a new department and subclass concatenation, based on the pattern we see in subclass de-anonymizing methods.

In [24]:
# Keep only the products whose CLASS is not 99 (the test class).
product1 = product1.loc[product1['CLASS'].ne(99)]

In [25]:
product1.nunique()

DBSKU            24284
DEPARTMENT           2
CLASS                5
SUBCLASS            14
SUBCLASS_NAME        7
dtype: int64

In [26]:
product1.shape

(27224, 5)

In [28]:
# Build a combined department + subclass key by concatenating the two codes as
# text (e.g. DEPARTMENT 12 and SUBCLASS 32 give "1232").
product1 = product1.assign(
    DSUBCLASS=product1['DEPARTMENT'].astype(str).str.cat(product1['SUBCLASS'].astype(str))
)

In [29]:
product1.head()

Unnamed: 0,DBSKU,DEPARTMENT,CLASS,SUBCLASS,SUBCLASS_NAME,DSUBCLASS
0,2182204.0,12,3,32,1,1232
1,2860882.0,12,3,31,2,1231
2,2695858.0,12,5,50,3,1250
3,675793.0,10,4,41,2,1041
4,2864173.0,12,4,40,4,1240


#### We also have to remove duplicate SKUs, take DBSKU=818336 for example.

In [30]:
product1[product1['DBSKU']==818336.0]

Unnamed: 0,DBSKU,DEPARTMENT,CLASS,SUBCLASS,SUBCLASS_NAME,DSUBCLASS
7023,818336.0,10,2,21,2,1021
7089,818336.0,10,2,21,2,1021
9618,818336.0,10,2,21,2,1021
12549,818336.0,10,2,21,2,1021
18439,818336.0,10,2,21,2,1021
19657,818336.0,10,2,21,2,1021
23658,818336.0,10,2,21,2,1021
28733,818336.0,10,2,21,2,1021


In [31]:
product1.shape

(27224, 6)

In [32]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each (e.g. the repeated DBSKU 818336 rows collapse to one).
product1=product1.drop_duplicates()

In [33]:
product1.shape

(24284, 6)

In [34]:
product1.nunique()

DBSKU            24284
DEPARTMENT           2
CLASS                5
SUBCLASS            14
SUBCLASS_NAME        7
DSUBCLASS           27
dtype: int64

In [35]:
pp.ProfileReport(product1).to_file(outputfile="./cleaned_product_profile.html")

### Clean Store Data

In [10]:
store.shape

(1303, 6)

In [None]:
store.nunique()

#### Drop missing store types(n=33) and missing store size  (n=184)

In [36]:
store.shape

(1303, 6)

In [38]:
store.nunique()

LOC_IDNT      1303
CITY           926
STATE           50
STORE_TYPE      14
POSTAL_CD      983
STORE_SIZE     829
dtype: int64

#### From our profile we see that
-  STORE_SIZE has missing = 14.1%, n=184
-  STORE_TYPE has missing = 2.5%, n=33 

which we remove with dropna()

In [41]:
# Drop every store row that has a missing value in any column; per the profile
# the missing values are in STORE_SIZE and STORE_TYPE.
store1=store.dropna()

In [42]:
store1.shape

(1118, 6)

In [43]:
store1.head()

Unnamed: 0,LOC_IDNT,CITY,STATE,STORE_TYPE,POSTAL_CD,STORE_SIZE
0,249,ST LOUIS,MO,Strip Store,63119,3963.0
1,401,PATCHOGUE,NY,Power Strip,11772,3378.0
2,644,NAPLES,FL,Outlet Strip,34114,3652.0
4,1270,CONCORD,NH,Regional Mall,3301,2535.0
6,403,BELLPORT,NY,Outlet Strip,11713,3068.0


#### Of interest from the profile, we see that there are stores with size 1, which can be excluded based on TMC input
Also of interest is the significant number of different store types within a single ZIP code (Suffern, New York). We are not sure why this is, but it may be test data which could be dropped when a join is made with the transaction log

In [74]:
store1[store1['CITY'].str.match("SUFFERN")].shape

(40, 6)

In [75]:
store1[store1["STORE_SIZE"]<=55].head()

Unnamed: 0,LOC_IDNT,CITY,STATE,STORE_TYPE,POSTAL_CD,STORE_SIZE
437,2870,PARSIPPANY,NJ,Strip Store,7054,1.0
454,2885,FORT WASHINGTON,MD,Mini Mall,20744,1.0
493,2865,BALA CYNWYD,PA,Strip Store,19004,1.0
526,2906,TROUTDALE,OR,Outlet Strip,97060,1.0
543,2942,WASHINGTON,DC,Downtown Store,20006,1.0


In [76]:
# Count the stores (rows) with STORE_SIZE <= 55. len() gives the row count;
# DataFrame.size would instead return rows * columns (30 rows x 6 columns = 180).
len(store1[store1["STORE_SIZE"]<=55])

30

In [77]:
# Keep only the stores whose STORE_SIZE is strictly greater than 1, removing
# the placeholder size-1 stores listed above.
store1=store1[store1['STORE_SIZE']>1]

#### To note, there is a single store with size = 55 sq feet

In [19]:
store1[store1['STORE_SIZE']<=55]

Unnamed: 0,LOC_IDNT,CITY,STATE,STORE_TYPE,POSTAL_CD,STORE_SIZE
970,2946,CHICAGO,IL,Downtown Store,60601,55.0


#### We also check to ensure that STORE_TYPE = NOT APPLICABLE, is not present

In [78]:
store1['STORE_TYPE'].unique()

array(['Strip Store', 'Power Strip', 'Outlet Strip', 'Regional Mall',
       'Mega Outlet Mall', 'Outlet Mall', 'Tourist Outlet Mall',
       'Downtown Store', 'Tourist Outlet Strip', 'Freestanding Store',
       'Mini Mall', 'Lifestyle Center'], dtype=object)

In [79]:
store1.shape

(1089, 6)

In [1]:
pp.ProfileReport(store1).to_file(outputfile="./cleaned_store_profile.html")

NameError: name 'pp' is not defined

#### At this point we have removed stores with missing (NA) values, "not applicable" store types, and stores with a size of 1 (which leaves no store smaller than 55 sq ft)

#### It is important to acknowledge that, at this point, we should ensure that the zip code is treated as a 5-digit string; in cases where its length is less than 5, we must left-pad it with 0s.

In [81]:
# Preview the stores whose numeric ZIP code has fewer than 5 digits. The bound
# is 10000 (not 9999) so that a 4-digit code of exactly 9999 is included too.
store1[store1['POSTAL_CD']<10000].head()

Unnamed: 0,LOC_IDNT,CITY,STATE,STORE_TYPE,POSTAL_CD,STORE_SIZE
4,1270,CONCORD,NH,Regional Mall,3301,2535.0
11,385,W SPRINGFIELD,MA,Strip Store,1089,4814.0
18,660,FLEMINGTON,NJ,Strip Store,8822,3470.0
24,240,WESTBROOK,CT,Outlet Strip,6498,3824.0
25,392,SHREWSBURY,MA,Strip Store,1545,4290.0


In [82]:
# Shape of the subset of stores whose numeric ZIP code has fewer than 5 digits.
# The bound is 10000 (not 9999) so that a code of exactly 9999 is counted too.
store1[store1['POSTAL_CD']<10000].shape

(140, 6)

#### We will apply a formatting change for convenience at this stage and ensure the column is padded with a leading 0

In [83]:
# Store ZIP codes as 5-character strings, left-padding with zeros so that a
# code such as 3301 becomes "03301". The vectorised str.zfill replaces the
# per-row Python lambda, and assign() returns a new frame with the converted
# column.
store1 = store1.assign(POSTAL_CD=store1['POSTAL_CD'].astype(str).str.zfill(5))

In [84]:
store1.head()

Unnamed: 0,LOC_IDNT,CITY,STATE,STORE_TYPE,POSTAL_CD,STORE_SIZE
0,249,ST LOUIS,MO,Strip Store,63119,3963.0
1,401,PATCHOGUE,NY,Power Strip,11772,3378.0
2,644,NAPLES,FL,Outlet Strip,34114,3652.0
4,1270,CONCORD,NH,Regional Mall,3301,2535.0
6,403,BELLPORT,NY,Outlet Strip,11713,3068.0


## Transaction data cleaning

In [85]:
# Rename the misspelled LOC_INDT column to LOC_IDNT, the name used for the
# store identifier in the store data set. Done in place on the raw frame.
transact.rename(columns={"LOC_INDT":"LOC_IDNT"},inplace=True)

In [335]:
transact.shape

(13053149, 9)

In [446]:
# Percentage of missing values per column, rounded to two decimals.
transact.isnull().mean().mul(100).round(2)

DAY_DT              0.00
LOC_IDNT            0.00
DBSKU               0.01
ONLINE_FLAG         0.00
FULL_PRICE_IND      0.00
TOTAL_SALES         0.00
TOTAL_UNITS         0.00
TOTAL_SALES_PRFT    0.00
TOTAL_COST          0.00
dtype: float64

In [447]:
transact.isnull().sum()

DAY_DT                0
LOC_IDNT              0
DBSKU               804
ONLINE_FLAG           0
FULL_PRICE_IND        0
TOTAL_SALES           0
TOTAL_UNITS           0
TOTAL_SALES_PRFT      0
TOTAL_COST            0
dtype: int64

#### From analysis with open refine, we know that there are "NA"/"NAN" values in the DBSKU column, however DBSKU profile shows no missing values. We will remove these values first



#### In addition we need to 
-  remove 0 unit, or negative unit transactions
-  Isolate and identify fractional unit transactions
-  remove 0 value transactions (TOTAL_SALES == 0)



-  Add a column for unit sales price
-  change day_dt to type date time
-  Visualize transaction volume over time, and identify months to be truncated 

#### We could additionally split date to month, day of week, and year, but this will be done later.

#### Treatment for transactions where TOTAL_SALES = 0, needs to be confirmed. However for our project, we choose to drop such transactions during data cleaning

#### We had initially hoped to round the cost and price columns - however this causes significant shifts to the number of transactions which are profitable. Since this has significant downstream impacts, we are leaving these fields as is.

In [448]:
# Drop every transaction row with a missing value in any column; per the null
# counts above, only DBSKU has missing values (804 rows).
transact1=transact.dropna()

In [449]:
transact1.shape

(13052345, 9)

In [450]:
transact1.head()

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST
0,2015-09-26,1218,466896.0,0,NFP,16.8,1.0,1.3,15.5
1,2015-08-02,1218,412445.0,0,NFP,29.99,1.0,12.99,17.0
2,2015-10-21,1218,491738.0,0,FP,44.0,1.0,28.25,15.75
3,2015-08-02,1218,414979.0,0,NFP,24.0,1.0,6.9936,17.0064
4,2015-07-26,1218,458372.0,0,FP,48.0,1.0,30.0,18.0


In [451]:
transact1[transact1['TOTAL_UNITS']<1].shape

(4505, 9)

#### NOTE: 
-  0 unit transactions: 4497
-  Less than 0 unit transactions: 4
- 0.5 unit transactions: 8



#### Remove TOTAL_UNITS <1

In [452]:
# Keep only transactions with at least one unit sold; this removes the
# zero-unit, negative-unit and half-unit rows.
transact1 = transact1.loc[transact1['TOTAL_UNITS'].ge(1)]

In [453]:
transact1.shape

(13047840, 9)

#### Identify fractional transactions

In [454]:
transact1[transact1['TOTAL_UNITS']%1!=0]

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST
7756481,2017-05-27,716,590356.0,0,NFP,50.2,1.5,46.4,3.8
8271192,2017-05-27,491,2154872.0,0,NFP,61.11,1.5,65.97,-4.86
8523946,2017-05-27,532,604520.0,0,NFP,59.83,1.5,58.16,1.67
8774271,2017-05-27,635,631440.0,0,NFP,52.43,1.5,53.86,-1.43
9142926,2017-05-27,685,626267.0,0,FP,73.5,1.5,64.5,9.0
9439391,2017-05-27,1365,626317.0,0,NFP,44.725,1.5,41.45,3.275


#### For now, we will work with whole values of units sold, and these 6 rows will be dropped

In [455]:
# Keep only transactions whose unit count is a whole number (no fractional part).
transact1 = transact1.loc[transact1['TOTAL_UNITS'].mod(1).eq(0)]

In [456]:
transact1.shape

(13047834, 9)

### Look at outliers in total units sold (max units sold)

we use a threshold of 25 units in a transaction

In [457]:
transact1[transact1['TOTAL_UNITS']>=25].shape

(7813, 9)

In [458]:
transact1[transact1['TOTAL_UNITS']>=25]

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST
460421,2015-08-03,4150,2999797.0,1,NFP,1166.81,36.0,410.8100,756.0000
479725,2015-11-04,4150,485011.0,1,FP,962.63,25.0,600.1300,362.5000
483414,2015-09-10,4150,2105114.0,1,NFP,1025.36,27.0,498.8600,526.5000
490945,2015-11-05,4150,483107.0,1,FP,1906.61,39.0,1087.6100,819.0000
495160,2015-11-02,4150,486621.0,1,FP,1558.90,37.0,726.4000,832.5000
503098,2015-09-01,4150,2105437.0,1,FP,1176.88,26.0,672.7400,504.1400
503158,2015-09-14,4150,2108233.0,1,FP,1324.02,26.0,778.0200,546.0000
503259,2015-11-03,4150,487025.0,1,FP,1326.85,31.0,629.3500,697.5000
508075,2015-08-04,4150,2999797.0,1,NFP,913.20,30.0,283.2000,630.0000
1080880,2015-10-06,4150,485011.0,1,FP,1333.20,35.0,825.7000,507.5000


In [459]:
transact1[(transact1['TOTAL_UNITS']>=25)&(transact1['ONLINE_FLAG']==1)].shape

(7812, 9)

#### We see that there is a high overlap with online transactions for the large quantities ordered in a single transaction

#### We will be treating online transactions separately, so we leave max transactions at this point



#### Remove TOTAL_SALES=0

In [460]:
transact1[transact1['TOTAL_SALES']==0].head()

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST
4555,2015-08-24,254,2962365.0,0,NFP,0.0,1.0,-16.5,16.5
9326,2015-10-28,1287,447417.0,0,NFP,0.0,1.0,-11.54,11.54
10376,2015-09-11,375,450163.0,0,NFP,0.0,1.0,-24.0,24.0
10632,2015-11-02,1218,451781.0,0,NFP,0.0,1.0,-23.0,23.0
21391,2015-09-14,375,450163.0,0,NFP,0.0,1.0,-24.0,24.0


In [461]:
transact1[transact1['TOTAL_SALES']==0].shape

(3220, 9)

In [462]:
# Keep only transactions with strictly positive TOTAL_SALES; this removes the
# zero-sales rows and would also remove any negative-sales rows.
transact1 = transact1.loc[transact1['TOTAL_SALES'].gt(0)]

In [463]:
transact1.shape

(13044614, 9)

#### We observe that there are costs which are zero or negative

This field needs to be examined closely

In [477]:
# Count the transactions (rows) with zero or negative TOTAL_COST. len() gives
# the row count; DataFrame.size would instead return rows * columns
# (744 rows x 9 columns = 6696).
len(transact1[transact1['TOTAL_COST']<=0])

744

In [478]:
transact1[transact1['TOTAL_COST']<=0].head()

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST
1140823,2015-08-07,1146,292144.0,0,FP,40.0,1.0,40.0,0.0
1287500,2015-09-07,1146,292144.0,0,NFP,32.0,1.0,32.0,0.0
7462944,2017-05-27,419,546002.0,0,NFP,28.29,1.0,29.38,-1.09
7463202,2017-05-27,636,622159.0,0,NFP,49.48,1.0,51.96,-2.48
7463531,2017-05-27,1229,2152702.0,0,NFP,44.68,1.0,50.48,-5.8


#### These zero and negative cost transactions result in unusually high profit figures (profit equal to or greater than the sales value) and will be removed

In [479]:
# Keep only transactions with a strictly positive TOTAL_COST (drops zero and
# negative cost rows).
transact1 = transact1.loc[transact1['TOTAL_COST'].gt(0)]

### NOTE: There are many transactions at low cost (cost =1 to 7). 

#### We are including these for now, but treatment may have to change


#### Note: There are large numbers of transactions with 0 or negative profit

In [480]:
# Count the transactions (rows) with zero or negative profit. len() gives the
# row count; DataFrame.size would instead return rows * columns
# (988073 rows x 9 columns = 8892657).
len(transact1[transact1['TOTAL_SALES_PRFT']<=0])

988073

#### Add Unit Sales price Column

In [481]:
# Per-unit sales price: total sales value divided by the number of units sold.
# TOTAL_UNITS is >= 1 at this point, so the division cannot hit zero.
transact1['UNIT_SALES_PRICE'] = transact1['TOTAL_SALES'].div(transact1['TOTAL_UNITS'])

In [484]:
transact1[transact1['UNIT_SALES_PRICE']<2].head(5)

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST,UNIT_SALES_PRICE
22483,2015-08-13,914,436030.0,0,NFP,1.62,1.0,-17.88,19.5,1.62
30575,2015-09-06,417,442467.0,0,NFP,1.0,1.0,-17.5,18.5,1.0
32520,2015-08-29,292,408500.0,0,NFP,1.8,1.0,-14.7,16.5,1.8
36420,2015-10-09,11,412478.0,0,NFP,0.36,1.0,-18.14,18.5,0.36
40159,2015-09-01,257,466896.0,0,NFP,1.8,1.0,-13.7,15.5,1.8


#### We observe that there are several transactions where the sales value itself is less than 1, which results in very low per-unit sales prices

#### Lacking a specific threshold from the Customer, we are taking a minimum threshold of 1 for the unit sales price (TOTAL_SALES / TOTAL_UNITS). Rows below this threshold will be dropped


In [500]:
transact1[transact1['UNIT_SALES_PRICE']<1].shape

(1241, 10)

In [501]:
# Keep only transactions whose per-unit sales price is at least 1.
transact1 = transact1.loc[transact1['UNIT_SALES_PRICE'].ge(1)]

In [502]:
transact1.shape

(13042629, 10)

#### Change DAY_DT  column to Datetime, and visualize

In [503]:
#date is an object at this point

transact1.dtypes

DAY_DT               object
LOC_IDNT              int64
DBSKU               float64
ONLINE_FLAG           int64
FULL_PRICE_IND       object
TOTAL_SALES         float64
TOTAL_UNITS         float64
TOTAL_SALES_PRFT    float64
TOTAL_COST          float64
UNIT_SALES_PRICE    float64
dtype: object

In [504]:
# Convert DAY_DT from object (text) to pandas datetime so it can be grouped and
# plotted as a time axis. The format is inferred by pandas; the values shown
# above look like YYYY-MM-DD.
transact1['DAY_DT']=pd.to_datetime(transact1['DAY_DT'])

In [505]:
transact1.head()

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST,UNIT_SALES_PRICE
0,2015-09-26,1218,466896.0,0,NFP,16.8,1.0,1.3,15.5,16.8
1,2015-08-02,1218,412445.0,0,NFP,29.99,1.0,12.99,17.0,29.99
2,2015-10-21,1218,491738.0,0,FP,44.0,1.0,28.25,15.75,44.0
3,2015-08-02,1218,414979.0,0,NFP,24.0,1.0,6.9936,17.0064,24.0
4,2015-07-26,1218,458372.0,0,FP,48.0,1.0,30.0,18.0,48.0


#### Visualize Timeline

In [506]:
# Take just the date and unit-count columns as the basis for the daily timeline.
timeline = transact1.loc[:, ['DAY_DT', 'TOTAL_UNITS']]

In [507]:
timeline.shape

(13042629, 2)

In [508]:
timeline.nunique()

DAY_DT         1077
TOTAL_UNITS     145
dtype: int64

In [509]:
# Total units sold per calendar day; DAY_DT becomes the (sorted) index.
timeline = timeline.groupby('DAY_DT').sum()

In [510]:
timeline.head()

Unnamed: 0_level_0,TOTAL_UNITS
DAY_DT,Unnamed: 1_level_1
2015-07-26,13955.0
2015-07-27,15280.0
2015-07-28,16341.0
2015-07-29,16718.0
2015-07-30,15778.0


In [511]:
timeline.shape

(1077, 1)

In [512]:
import matplotlib as mp

In [513]:
%matplotlib tk

In [514]:
timeline.plot(figsize=(35,4))

<matplotlib.axes._subplots.AxesSubplot at 0x15e12c3eb70>

#### Significant findings
-  multiple seasonal and cyclical effects in our data
-  Sales increase on weekends - points of sale are physical stores, so it follows that people visit on weekends

-  Peaks during March - Possibly due to financial year sales pressure?
-  not visible in initial chart, but seasonal sales improvements near christmas


### Further steps in later version of code: Use statsmodel to decompose and separately identify trend, seasonal and random effects in the time series data.

#### Additional Suggested Cleaning 
-  Update format to include day
-  Group by month and replot
-  Dates start at 26 Jul 2015 (the first date in the aggregated timeline above) and end at 8 Jul 2018, suggesting that these incomplete months of data can be dropped if needed

#### Generate the profile for transact1 to determine any other issues which may exist

In [515]:
transact1.head()

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST,UNIT_SALES_PRICE
0,2015-09-26,1218,466896.0,0,NFP,16.8,1.0,1.3,15.5,16.8
1,2015-08-02,1218,412445.0,0,NFP,29.99,1.0,12.99,17.0,29.99
2,2015-10-21,1218,491738.0,0,FP,44.0,1.0,28.25,15.75,44.0
3,2015-08-02,1218,414979.0,0,NFP,24.0,1.0,6.9936,17.0064,24.0
4,2015-07-26,1218,458372.0,0,FP,48.0,1.0,30.0,18.0,48.0


#### Downcast FULL_PRICE_IND to int

In [516]:
# Encode the full-price indicator numerically: 'NFP' -> 0, 'FP' -> 1.
# Series.map yields NaN for any value not in the dict, so an unexpected label
# would surface as a missing value (and fail the later int8 cast).
transact1['FULL_PRICE_IND'] = transact1['FULL_PRICE_IND'].map({'NFP': 0, 'FP': 1})

In [517]:
transact1.head()

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST,UNIT_SALES_PRICE
0,2015-09-26,1218,466896.0,0,0,16.8,1.0,1.3,15.5,16.8
1,2015-08-02,1218,412445.0,0,0,29.99,1.0,12.99,17.0,29.99
2,2015-10-21,1218,491738.0,0,1,44.0,1.0,28.25,15.75,44.0
3,2015-08-02,1218,414979.0,0,0,24.0,1.0,6.9936,17.0064,24.0
4,2015-07-26,1218,458372.0,0,1,48.0,1.0,30.0,18.0,48.0


In [518]:
# Downcast the 0/1 full-price indicator to 8-bit integers to save memory.
transact1['FULL_PRICE_IND'] = transact1['FULL_PRICE_IND'].astype('int8')

In [519]:
# Downcast the 0/1 online flag to 8-bit integers to save memory.
transact1['ONLINE_FLAG'] = transact1['ONLINE_FLAG'].astype('int8')

In [520]:
transact1.head()

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST,UNIT_SALES_PRICE
0,2015-09-26,1218,466896.0,0,0,16.8,1.0,1.3,15.5,16.8
1,2015-08-02,1218,412445.0,0,0,29.99,1.0,12.99,17.0,29.99
2,2015-10-21,1218,491738.0,0,1,44.0,1.0,28.25,15.75,44.0
3,2015-08-02,1218,414979.0,0,0,24.0,1.0,6.9936,17.0064,24.0
4,2015-07-26,1218,458372.0,0,1,48.0,1.0,30.0,18.0,48.0


#### From the profile we observe an anomaly with LOC_IDNT = 4150 

#### This individual location has ~330,000 transactions 

From our store profile, we know that location 4150 does not have any store size, and has been removed from our data set.

This may be an issue when we join our transaction and store data sets

In [521]:
transact1[transact1["LOC_IDNT"]==4150].head()

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,ONLINE_FLAG,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST,UNIT_SALES_PRICE
460415,2015-08-11,4150,2105437.0,1,1,307.51,6.0,191.17,116.34,51.251667
460416,2015-08-18,4150,307272.0,1,1,214.75,5.0,131.0,83.75,42.95
460417,2015-08-04,4150,2995522.0,1,0,83.7,3.0,13.2,70.5,27.9
460419,2015-08-17,4150,356840.0,1,1,100.0,1.0,72.0,28.0,100.0
460420,2015-08-17,4150,434464.0,1,0,203.45,8.0,11.45,192.0,25.43125


#### We note that all transactions at this location, are online transactions



In [522]:
transact1[transact1["ONLINE_FLAG"]==1].shape

(329116, 10)

In [523]:
transact1[transact1["LOC_IDNT"]==4150].shape

(329116, 10)

In [412]:
transact1[transact1["ONLINE_FLAG"]==1].nunique()

DAY_DT               1017
LOC_IDNT                1
DBSKU                3862
ONLINE_FLAG             1
FULL_PRICE_IND          2
TOTAL_SALES         58205
TOTAL_UNITS           145
TOTAL_SALES_PRFT     1906
TOTAL_COST           1341
UNIT_SALES_PRICE    75740
dtype: int64

#### Analysis of the original transaction data set, shows 
-  Only 2 Store code were used for Online transactions LOC_IDNT == 4100, 4150. N =4 and n = 330607
-  Transactions at store 4100 have 0 units sold, and can be discarded
-  After simple cleaning of the Transaction data set, we are left with 329116 ONLINE transactions

We propose removing these transactions and moving them to an alternative data set, to be treated and analyzed separately from the physical store transactions

### Removing Online transactions to its own data set

In [524]:
# Split out the online transactions (ONLINE_FLAG equal to 1) into their own frame.
transact_online = transact1.loc[transact1['ONLINE_FLAG'].eq(1)]

In [525]:
transact_online.to_csv("cleaned_online_transaction.csv")

In [526]:
# Keep only the physical-store transactions (ONLINE_FLAG not equal to 1).
transact1 = transact1.loc[transact1['ONLINE_FLAG'].ne(1)]

In [527]:
# ONLINE_FLAG is constant after the split above, so remove the column.
transact1 = transact1.drop(columns=['ONLINE_FLAG'])

In [528]:
transact1.head()

Unnamed: 0,DAY_DT,LOC_IDNT,DBSKU,FULL_PRICE_IND,TOTAL_SALES,TOTAL_UNITS,TOTAL_SALES_PRFT,TOTAL_COST,UNIT_SALES_PRICE
0,2015-09-26,1218,466896.0,0,16.8,1.0,1.3,15.5,16.8
1,2015-08-02,1218,412445.0,0,29.99,1.0,12.99,17.0,29.99
2,2015-10-21,1218,491738.0,1,44.0,1.0,28.25,15.75,44.0
3,2015-08-02,1218,414979.0,0,24.0,1.0,6.9936,17.0064,24.0
4,2015-07-26,1218,458372.0,1,48.0,1.0,30.0,18.0,48.0


In [529]:
transact1.shape

(12713513, 9)

In [530]:
pp.ProfileReport(transact1).to_file(outputfile="./cleaned_transact_profile.html")

### Create a frequency histogram for our data

In [568]:
transact1.hist(bins=75)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000015E12EF3AC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000015E12A03EF0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000015E11EF0400>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000015E12AC6D68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000015E8CAB9710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000015E8C9C90B8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000015E8C9A5A20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000015E8C877400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000015E8C877438>]],
      dtype=object)

### Creating CSV files for cleaned Data sets

In [417]:
product1.to_csv("cleaned_products.csv")

In [418]:
# Write the cleaned store table to CSV (the DataFrame index is written as the
# first, unnamed column).
# NOTE(review): POSTAL_CD was zero-padded to a 5-character string earlier; a
# plain pd.read_csv of this file will presumably re-infer it as an integer and
# lose the padding — consider dtype={"POSTAL_CD": str} when reloading.
store1.to_csv("cleaned_store.csv")

In [532]:
transact1.to_csv("cleaned_transaction.csv")