# batches.ipynb
Author: UFO Software, LLC<br>
Created: Wednesday, February 24, 2021 19:04<br>

License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html> <br>
This is free software: you are free to change and redistribute it.<br>
There is NO WARRANTY, to the extent permitted by law.<br>
<br>
Reads in the Batches_0.csv file from the [December 2020 WA State Traceability Data](https://lcb.app.box.com/s/fnku9nr22dhx04f6o646xv6ad6fswfy9?page=1) and drops the columns that have been deprecated.



In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# never truncate the column list when a wide DataFrame is displayed
pd.options.display.max_columns = None

# directory holding the data files -- point this at your own copy
file_path = Path('../dec-2020')

In [2]:
def get_batches_df(csv_path=None):
    """Load the batches table into a DataFrame with compact dtypes.

    The file is read as tab-separated, UTF-16 encoded text. Only the columns
    named in ``col_dtypes`` and ``date_cols`` below are loaded; every other
    column in the file is skipped.

    Parameters
    ----------
    csv_path : str or path-like, optional
        File to read. When omitted, ``Batches_0.csv`` inside the
        notebook-level ``file_path`` directory is used.

    Returns
    -------
    pandas.DataFrame
        The selected columns, typed according to ``col_dtypes`` with the
        ``date_cols`` columns parsed as dates.
    """
    # fall back to the notebook-wide data directory when no file is given
    if csv_path is None:
        csv_path = file_path / 'Batches_0.csv'

    # reduce the size of the dataframe's memory footprint by specifying data types
    # comment out columns you are not using to further decrease the memory footprint
    col_dtypes = {'global_id': 'string',
                  'mme_id': 'string',
                  'user_id': 'string',
                  'external_id': 'string',
                  'uom': 'category',
                  #'planted_at': 'string', deprecated
                  #'created_by_mme_id': 'string', every entry is nan
                  'num_plants': 'int32',
                  'status': 'category',
                  'strain_id': 'string',
                  'is_parent_batch': 'bool',
                  'is_child_batch': 'bool',
                  'type': 'category',
                  'harvest_stage': 'category',
                  #'qty_accumulated_waste': 'float32', deprecated
                  'qty_packaged_flower': 'float32',
                  'qty_packaged_by_product': 'float32',
                  'area_id': 'string',
                  'origin': 'category',
                  #'qty_cure': 'float32', deprecated
                  'plant_stage': 'category',
                  'flower_dry_weight': 'float32',
                  'waste': 'float32',
                  'other_dry_weight': 'float32',
                  'flower_wet_weight': 'float32',
                  'other_wet_weight': 'float32'}

    # parse date columns
    date_cols = ['created_at',
                 'updated_at',
                 #'planted_at', deprecated
                 'harvested_at',
                 'batch_created_at',
                 'packaged_completed_at',
                 'deleted_at',
                 'harvested_end_at']

    # combine the column names to load only the columns you are using
    cols = list(col_dtypes.keys()) + date_cols

    batches_df = pd.read_csv(csv_path,
                             sep='\t',
                             encoding='utf-16',
                             usecols=cols,
                             dtype=col_dtypes,
                             parse_dates=date_cols)

    return batches_df

In [3]:
# load the batches table, then preview its first five rows
batches_df = get_batches_df()
batches_df.head(n=5)

Unnamed: 0,global_id,created_at,updated_at,mme_id,user_id,external_id,uom,harvested_at,batch_created_at,num_plants,status,strain_id,is_parent_batch,is_child_batch,type,harvest_stage,qty_packaged_flower,qty_packaged_by_product,packaged_completed_at,area_id,origin,plant_stage,deleted_at,flower_dry_weight,waste,other_dry_weight,harvested_end_at,flower_wet_weight,other_wet_weight
0,WAJ412598.BA1,2018-01-31 17:24:31,2018-01-31 17:24:32,WAWA1.MMDJ,WAWA1.USAM,4125980000004911,ea,1900-01-01,2018-01-31 17:24:31,0,open,WAJ412598.STX,False,False,propagation material,,0.0,0.0,1900-01-01,WAJ412598.AR3,clone,seedling,NaT,0.0,0.0,0.0,NaT,0.0,0.0
1,WAJ412598.BA2,2018-01-31 17:24:32,2018-01-31 17:24:32,WAWA1.MMDJ,WAWA1.USAM,4125980000004921,ea,1900-01-01,2018-01-31 17:24:32,0,open,WAJ412598.STX,False,False,propagation material,,0.0,0.0,1900-01-01,WAJ412598.AR3,clone,seedling,NaT,0.0,0.0,0.0,NaT,0.0,0.0
2,WAJ412598.BA3,2018-01-31 17:24:30,2018-01-31 17:24:30,WAWA1.MMDJ,WAWA1.USAM,4125980000005382,ea,1900-01-01,2018-01-31 17:24:30,0,open,WAJ412598.STX,False,False,propagation material,,0.0,0.0,1900-01-01,WAJ412598.AR3,clone,seedling,NaT,0.0,0.0,0.0,NaT,0.0,0.0
3,WAJ412598.BA4,2018-01-31 17:24:35,2018-01-31 17:24:35,WAWA1.MMDJ,WAWA1.USAM,4125980000005399,ea,1900-01-01,2018-01-31 17:24:35,0,open,WAJ412598.STX,False,False,propagation material,,0.0,0.0,1900-01-01,WAJ412598.AR3,clone,seedling,NaT,0.0,0.0,0.0,NaT,0.0,0.0
4,WAJ412598.BA5,2018-01-31 17:24:32,2018-01-31 17:24:32,WAWA1.MMDJ,WAWA1.USAM,4125980000005409,ea,1900-01-01,2018-01-31 17:24:32,0,open,WAJ412598.STX,False,False,propagation material,,0.0,0.0,1900-01-01,WAJ412598.AR3,clone,seedling,NaT,0.0,0.0,0.0,NaT,0.0,0.0


In [4]:
# preview the last five rows of the table
batches_df.tail(n=5)

Unnamed: 0,global_id,created_at,updated_at,mme_id,user_id,external_id,uom,harvested_at,batch_created_at,num_plants,status,strain_id,is_parent_batch,is_child_batch,type,harvest_stage,qty_packaged_flower,qty_packaged_by_product,packaged_completed_at,area_id,origin,plant_stage,deleted_at,flower_dry_weight,waste,other_dry_weight,harvested_end_at,flower_wet_weight,other_wet_weight
37646467,WAJ416113.BAMF89H,2021-01-06 05:38:56,2021-01-06 05:38:58,WAWA1.MM1F4,WAWA1.US205,,ea,1900-01-01,2021-01-06 05:38:56,1,open,WAJ416113.STO1DS,True,False,plant,,0.0,0.0,1900-01-01 00:00:00,WAJ416113.AR5XU7,plant,growing,NaT,0.0,0.0,0.0,NaT,0.0,0.0
37646468,WAJ416113.BAMF89I,2021-01-06 05:38:57,2021-01-06 05:38:59,WAWA1.MM1F4,WAWA1.US205,,ea,1900-01-01,2021-01-06 05:38:57,1,open,WAJ416113.ST1L6,True,False,plant,,0.0,0.0,1900-01-01 00:00:00,WAJ416113.AR5XU7,plant,growing,NaT,0.0,0.0,0.0,NaT,0.0,0.0
37646469,WAJ412130.BAMF89J,2021-01-06 05:38:57,2021-01-06 05:38:57,WAWA1.MMHO,WAWA1.USF0,4121300000011549,ea,1900-01-01,2021-01-06 05:38:57,0,open,WAJ412130.STIHA7,False,False,intermediate/ end product,,0.0,0.0,2021-01-05 16:00:00,WAJ412130.AR4C5K,,seedling,NaT,0.0,0.0,0.0,NaT,0.0,0.0
37646470,WAJ417150.BAMF89K,2021-01-06 05:38:57,2021-01-06 05:38:57,WAWA1.MMXN,WAWA1.US2B5,IN_CONV2_WAJ417150.INSOT2R,gm,1900-01-01,2021-01-06 05:38:57,0,open,WAJ417150.STNP01,False,True,intermediate/ end product,,0.0,0.0,2021-01-05 16:00:00,WAJ417150.AR2456,,seedling,NaT,0.0,0.0,0.0,NaT,0.0,0.0
37646471,WAJ412130.BAMF89L,2021-01-06 05:38:58,2021-01-06 05:38:58,WAWA1.MMHO,WAWA1.USF0,4121300000011550,ea,1900-01-01,2021-01-06 05:38:58,0,open,WAJ412130.STIHA7,False,False,intermediate/ end product,,0.0,0.0,2021-01-05 16:00:00,WAJ412130.AR4C5K,,seedling,NaT,0.0,0.0,0.0,NaT,0.0,0.0


In [5]:
# save a Parquet copy of the frame in the data directory;
# comment out if you are not planning on using Dask
batches_df.to_parquet(file_path.joinpath('batches.parquet'))

In [6]:
# get an overview of the numerical columns only; the dtype filter is explicit
# because newer pandas releases also summarize datetime columns by default
batches_df.describe(include=[np.number])

Unnamed: 0,num_plants,qty_packaged_flower,qty_packaged_by_product,flower_dry_weight,waste,other_dry_weight,flower_wet_weight,other_wet_weight
count,37646470.0,37646470.0,37646470.0,37646470.0,37646470.0,37646470.0,37646470.0,37646470.0
mean,2.507006,425.486,344.6942,207.816,192.6857,51.53087,2256.291,160.7805
std,288.9761,145084.2,6095.849,17112.99,10032.24,2002.182,74060.1,29858.75
min,-1169767.0,-654602400.0,-155.5,-1.0,-1130.0,-26049.7,-1.0,-1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,200200.0,23885060.0,4000000.0,90084340.0,25438790.0,1176496.0,248132500.0,40433060.0
