In [1]:
import pandas as pd
import os
import numpy as np
import json

In [2]:
# Base names of the three raw exports; each one lives at raw_data_files/<name>.json
file = ['receipts', 'users', 'brands']

# Build the three input paths in the same order as the names above
receipts, users, brands = (
    os.path.join('raw_data_files', base_name + '.json') for base_name in file
)

In [3]:
# Echo the receipts path to confirm it was assembled as expected
receipts

'raw_data_files/receipts.json'

In [187]:
# Load the three raw files with identical reader settings; lines=True tells pandas
# to read each file as one JSON object per line.
df_receipts, df_users, df_brands = (
    pd.read_json(raw_path, orient='records', lines=True)
    for raw_path in (receipts, users, brands)
)

<hr>

In [188]:
# Helper to reposition a single column within a DataFrame
def change_column_order(df, col_name, index):
    """Return ``df`` with the column ``col_name`` moved to position ``index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be reordered; it is not modified.
    col_name : hashable
        Label of the column to move.
    index : int
        Target position, interpreted with ``list.insert`` semantics
        (negative and out-of-range values are accepted).

    Returns
    -------
    pandas.DataFrame
        The same columns as ``df``, selected in the new order.

    Raises
    ------
    ValueError
        If ``col_name`` is not one of ``df``'s columns.
    """
    others = df.columns.tolist()
    # list.index raises ValueError when the label is absent
    del others[others.index(col_name)]
    # Splicing around `index` is equivalent to others.insert(index, col_name)
    return df[others[:index] + [col_name] + others[index:]]

## Converting receipts data

In [189]:
# Peek at the first two receipt rows as loaded, before any cleaning
df_receipts.head(2)

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687536000},{'$date': 1609687531000},500.0,{'$date': 1609632000000},5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,{'$oid': '5ff1e1bb0a720f052300056b'},150.0,"Receipt number 5 completed, bonus point schedu...",{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687488000},{'$date': 1609687483000},150.0,{'$date': 1609601083000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052


In [190]:
# Summarize column dtypes and non-null counts of the raw receipts frame
df_receipts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1119 entries, 0 to 1118
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   _id                      1119 non-null   object 
 1   bonusPointsEarned        544 non-null    float64
 2   bonusPointsEarnedReason  544 non-null    object 
 3   createDate               1119 non-null   object 
 4   dateScanned              1119 non-null   object 
 5   finishedDate             568 non-null    object 
 6   modifyDate               1119 non-null   object 
 7   pointsAwardedDate        537 non-null    object 
 8   pointsEarned             609 non-null    float64
 9   purchaseDate             671 non-null    object 
 10  purchasedItemCount       635 non-null    float64
 11  rewardsReceiptItemList   679 non-null    object 
 12  rewardsReceiptStatus     1119 non-null   object 
 13  totalSpent               684 non-null    float64
 14  userId                  

In [191]:
# Rename the '_id' column to 'id'
df_receipts = df_receipts.rename({'_id': 'id'}, axis='columns')
df_receipts.head(1)

Unnamed: 0,id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687536000},{'$date': 1609687531000},500.0,{'$date': 1609632000000},5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6


In [192]:
# Get rid of the prefix (e.g. '$oid', '$date') and flatten the nested 1-level dict to plain data

# First, let's take care of the datetime columns.
# Populated cells hold a one-key dict such as {'$date': 1609687531000}; missing dates are NaN.
# Unwrap each column in a single pass instead of writing cell by cell through
# iterrows()/.loc, which performs rows x columns scalar assignments.
# Checking isinstance(value, dict), rather than comparing str(value) to 'nan', leaves
# values that are already unwrapped untouched, so re-running this cell is a no-op
# instead of failing when it tries to index a plain integer.
columns = ['createDate', 'dateScanned', 'finishedDate',
           'modifyDate', 'pointsAwardedDate', 'purchaseDate']

for c in columns:
    # dtype=object keeps the unwrapped integers alongside NaN without a float upcast
    df_receipts[c] = pd.Series(
        [value['$date'] if isinstance(value, dict) else value for value in df_receipts[c]],
        index=df_receipts.index,
        dtype=object,
    )

df_receipts.head(2)

Unnamed: 0,id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",1609687531000,1609687531000,1609687531000,1609687536000,1609687531000,500.0,1609632000000,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,{'$oid': '5ff1e1bb0a720f052300056b'},150.0,"Receipt number 5 completed, bonus point schedu...",1609687483000,1609687483000,1609687483000,1609687488000,1609687483000,150.0,1609601083000,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052


In [193]:
# Second, also convert the id column.
# Each id arrives wrapped as {'$oid': '<hex string>'}; unwrap the whole column in one
# pass rather than row by row via iterrows()/.loc. The isinstance check skips values
# that are not dicts (missing or already unwrapped), so this cell can be re-run safely.
df_receipts['id'] = pd.Series(
    [value['$oid'] if isinstance(value, dict) else value for value in df_receipts['id']],
    index=df_receipts.index,
    dtype=object,  # keep the column as plain Python objects, as before
)

df_receipts.head(2)

Unnamed: 0,id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,500.0,"Receipt number 2 completed, bonus point schedu...",1609687531000,1609687531000,1609687531000,1609687536000,1609687531000,500.0,1609632000000,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,150.0,"Receipt number 5 completed, bonus point schedu...",1609687483000,1609687483000,1609687483000,1609687488000,1609687483000,150.0,1609601083000,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052


In [194]:
# Convert the 13-digit unix timestamps (milliseconds) into pandas datetimes
columns = ['createDate', 'dateScanned', 'finishedDate',
           'modifyDate', 'pointsAwardedDate', 'purchaseDate']

# assign() works on a copy, so df_receipts itself keeps the raw millisecond values
df_receipts_cleaned = df_receipts.assign(
    **{date_col: pd.to_datetime(df_receipts[date_col], unit='ms') for date_col in columns}
)

df_receipts_cleaned.head()

Unnamed: 0,id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,500.0,"Receipt number 2 completed, bonus point schedu...",2021-01-03 15:25:31,2021-01-03 15:25:31,2021-01-03 15:25:31,2021-01-03 15:25:36,2021-01-03 15:25:31,500.0,2021-01-03 00:00:00,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,150.0,"Receipt number 5 completed, bonus point schedu...",2021-01-03 15:24:43,2021-01-03 15:24:43,2021-01-03 15:24:43,2021-01-03 15:24:48,2021-01-03 15:24:43,150.0,2021-01-02 15:24:43,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,5ff1e1f10a720f052300057a,5.0,All-receipts receipt bonus,2021-01-03 15:25:37,2021-01-03 15:25:37,NaT,2021-01-03 15:25:42,NaT,5.0,2021-01-03 00:00:00,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b
3,5ff1e1ee0a7214ada100056f,5.0,All-receipts receipt bonus,2021-01-03 15:25:34,2021-01-03 15:25:34,2021-01-03 15:25:34,2021-01-03 15:25:39,2021-01-03 15:25:34,5.0,2021-01-03 00:00:00,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6
4,5ff1e1d20a7214ada1000561,5.0,All-receipts receipt bonus,2021-01-03 15:25:06,2021-01-03 15:25:06,2021-01-03 15:25:11,2021-01-03 15:25:11,2021-01-03 15:25:06,5.0,2021-01-02 15:25:06,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052


In [195]:
# Receipt-level table: drop the nested item list column and rename the key column
# 'id' to 'receiptId'
df_receipts_final = (
    df_receipts_cleaned
    .drop(columns=['rewardsReceiptItemList'])
    .rename(columns={'id': 'receiptId'})
)
df_receipts_final.head()

Unnamed: 0,receiptId,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,500.0,"Receipt number 2 completed, bonus point schedu...",2021-01-03 15:25:31,2021-01-03 15:25:31,2021-01-03 15:25:31,2021-01-03 15:25:36,2021-01-03 15:25:31,500.0,2021-01-03 00:00:00,5.0,FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,150.0,"Receipt number 5 completed, bonus point schedu...",2021-01-03 15:24:43,2021-01-03 15:24:43,2021-01-03 15:24:43,2021-01-03 15:24:48,2021-01-03 15:24:43,150.0,2021-01-02 15:24:43,2.0,FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,5ff1e1f10a720f052300057a,5.0,All-receipts receipt bonus,2021-01-03 15:25:37,2021-01-03 15:25:37,NaT,2021-01-03 15:25:42,NaT,5.0,2021-01-03 00:00:00,1.0,REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b
3,5ff1e1ee0a7214ada100056f,5.0,All-receipts receipt bonus,2021-01-03 15:25:34,2021-01-03 15:25:34,2021-01-03 15:25:34,2021-01-03 15:25:39,2021-01-03 15:25:34,5.0,2021-01-03 00:00:00,4.0,FINISHED,28.0,5ff1e1eacfcf6c399c274ae6
4,5ff1e1d20a7214ada1000561,5.0,All-receipts receipt bonus,2021-01-03 15:25:06,2021-01-03 15:25:06,2021-01-03 15:25:11,2021-01-03 15:25:11,2021-01-03 15:25:06,5.0,2021-01-02 15:25:06,2.0,FINISHED,1.0,5ff1e194b6a9d73a3a9f1052


In [196]:
# Check dtypes of the receipt-level table after the datetime conversion
df_receipts_final.dtypes

receiptId                          object
bonusPointsEarned                 float64
bonusPointsEarnedReason            object
createDate                 datetime64[ns]
dateScanned                datetime64[ns]
finishedDate               datetime64[ns]
modifyDate                 datetime64[ns]
pointsAwardedDate          datetime64[ns]
pointsEarned                      float64
purchaseDate               datetime64[ns]
purchasedItemCount                float64
rewardsReceiptStatus               object
totalSpent                        float64
userId                             object
dtype: object

**Save cleaned `receipts` dataset**

In [197]:
# df_receipts_final.to_csv(os.path.join('cleaned_csv_data_files', file[0]+'.csv'), index=False, header=True)

In [203]:
# Serialize the cleaned receipts to JSON text and parse it back into a list of plain
# dicts, which is the input handed to json_normalize below
receipts_as_json_text = df_receipts_cleaned.to_json(orient='records')
json_receipts_formatted = json.loads(receipts_as_json_text)

# One row per entry of rewardsReceiptItemList, carrying the parent receipt's 'id' along
exploded_items = pd.json_normalize(
    json_receipts_formatted,
    record_path=['rewardsReceiptItemList'],
    meta=['id'],
)

# Move 'id' to be the first column, then rename it to 'receiptId'
item_list = change_column_order(exploded_items, 'id', 0).rename(columns={'id': 'receiptId'})
item_list.head()

Unnamed: 0,receiptId,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,...,itemNumber,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId
0,5ff1e1eb0a720f0523000575,4011.0,ITEM NOT FOUND,26.0,26.0,False,1,True,5.0,4011.0,...,,,,,,,,,,
1,5ff1e1bb0a720f052300056b,4011.0,ITEM NOT FOUND,1.0,1.0,,1,,1.0,,...,,,,,,,,,,
2,5ff1e1bb0a720f052300056b,28400642255.0,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.0,10.0,True,2,True,1.0,28400642255.0,...,,,,,,,,,,
3,5ff1e1f10a720f052300057a,,,,,False,1,True,,4011.0,...,,,,,,,,,,
4,5ff1e1ee0a7214ada100056f,4011.0,ITEM NOT FOUND,28.0,28.0,False,1,True,4.0,4011.0,...,,,,,,,,,,


In [204]:
# Cast the price, quantity and points columns of the rewardsReceiptItemList dataset to float
columns_to_float = ['finalPrice', 'itemPrice', 'userFlaggedPrice', 'userFlaggedQuantity',
                    'originalMetaBriteQuantityPurchased', 'discountedItemPrice', 'pointsEarned',
                    'targetPrice', 'originalFinalPrice', 'originalMetaBriteItemPrice',
                    'priceAfterCoupon']

# A single astype call with a column -> dtype mapping; all other columns keep their dtype
item_list = item_list.astype(dict.fromkeys(columns_to_float, float))

item_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6941 entries, 0 to 6940
Data columns (total 35 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   receiptId                           6941 non-null   object 
 1   barcode                             3090 non-null   object 
 2   description                         6560 non-null   object 
 3   finalPrice                          6767 non-null   float64
 4   itemPrice                           6767 non-null   float64
 5   needsFetchReview                    813 non-null    object 
 6   partnerItemId                       6941 non-null   object 
 7   preventTargetGapPoints              358 non-null    object 
 8   quantityPurchased                   6767 non-null   float64
 9   userFlaggedBarcode                  337 non-null    object 
 10  userFlaggedNewItem                  323 non-null    object 
 11  userFlaggedPrice                    299 non

**Save cleaned `rewardsReceiptItemList` dataset**

In [206]:
# item_list.to_csv(os.path.join('cleaned_csv_data_files', 'rewards_receipt_item_list'+'.csv'), index=False, header=True)

## Converting users data

In [64]:
# Peek at the first two user rows as loaded, before any cleaning
df_users.head(2)

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


In [65]:
# Summarize column dtypes and non-null counts of the raw users frame
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   _id           495 non-null    object
 1   active        495 non-null    bool  
 2   createdDate   495 non-null    object
 3   lastLogin     433 non-null    object
 4   role          495 non-null    object
 5   signUpSource  447 non-null    object
 6   state         439 non-null    object
dtypes: bool(1), object(6)
memory usage: 23.8+ KB


In [66]:
# Rename the '_id' column to 'id'
df_users = df_users.rename({'_id': 'id'}, axis='columns')
df_users.head(1)

Unnamed: 0,id,active,createdDate,lastLogin,role,signUpSource,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


In [67]:
# Clean up the format of the id column.
# Each id arrives wrapped as {'$oid': '<hex string>'}; unwrap the whole column in one
# pass rather than row by row via iterrows()/.loc. The isinstance check skips values
# that are not dicts (missing or already unwrapped), so this cell can be re-run safely.
df_users['id'] = pd.Series(
    [value['$oid'] if isinstance(value, dict) else value for value in df_users['id']],
    index=df_users.index,
    dtype=object,  # keep the column as plain Python objects, as before
)

df_users.head(2)

Unnamed: 0,id,active,createdDate,lastLogin,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


In [69]:
# Clean up the formats of the datetime columns.
# Populated cells hold a one-key dict such as {'$date': 1609687444800}; missing values are NaN.
# Unwrap each column in a single pass instead of cell by cell through iterrows()/.loc.
# Checking isinstance(value, dict), rather than comparing str(value) to 'nan', leaves
# already-unwrapped values untouched, so re-running this cell is a no-op instead of
# failing when it tries to index a plain integer.
columns = ['createdDate', 'lastLogin']

for c in columns:
    # dtype=object keeps the unwrapped integers alongside NaN without a float upcast
    df_users[c] = pd.Series(
        [value['$date'] if isinstance(value, dict) else value for value in df_users[c]],
        index=df_users.index,
        dtype=object,
    )

df_users.head(2)

Unnamed: 0,id,active,createdDate,lastLogin,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1609687537858,consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1609687537858,consumer,Email,WI


In [71]:
# Convert the 13-digit unix timestamps (milliseconds) into pandas datetimes
columns = ['createdDate', 'lastLogin']

# assign() works on a copy, so df_users itself keeps the raw millisecond values
df_users_cleaned = df_users.assign(
    **{date_col: pd.to_datetime(df_users[date_col], unit='ms') for date_col in columns}
)

df_users_cleaned.head()

Unnamed: 0,id,active,createdDate,lastLogin,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
2,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,2021-01-03 15:25:30.554,2021-01-03 15:25:30.597,consumer,Email,WI
4,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI


In [73]:
# Check dtypes of the users table after the datetime conversion
df_users_cleaned.dtypes

id                      object
active                    bool
createdDate     datetime64[ns]
lastLogin       datetime64[ns]
role                    object
signUpSource            object
state                   object
dtype: object

### The user id in the users dataset turned out not to be unique, so let's remove the duplicate rows.

In [136]:
# Keep one row per user id (drop_duplicates retains the first occurrence by default),
# renumber the index from 0, and rename the key column 'id' to 'userId'
df_users_final = (
    df_users_cleaned
    .drop_duplicates(subset='id', ignore_index=True)
    .rename(columns={'id': 'userId'})
)
df_users_final

Unnamed: 0,userId,active,createdDate,lastLogin,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
1,5ff1e1eacfcf6c399c274ae6,True,2021-01-03 15:25:30.554,2021-01-03 15:25:30.597,consumer,Email,WI
2,5ff1e1e8cfcf6c399c274ad9,True,2021-01-03 15:25:28.354,2021-01-03 15:25:28.392,consumer,Email,WI
3,5ff1e1b7cfcf6c399c274a5a,True,2021-01-03 15:24:39.626,2021-01-03 15:24:39.665,consumer,Email,WI
4,5ff1e1f1cfcf6c399c274b0b,True,2021-01-03 15:25:37.564,2021-01-03 15:25:37.599,consumer,Email,WI
...,...,...,...,...,...,...,...
207,5fc961c3b8cfca11a077dd33,True,2020-12-03 22:08:03.936,2021-02-26 22:39:16.799,fetch-staff,Email,NH
208,5fa41775898c7a11a6bcef3e,True,2020-11-05 15:17:09.396,2021-03-04 16:02:02.026,fetch-staff,Email,
209,5fa32b4d898c7a11a6bcebce,True,2020-11-04 22:29:33.309,2021-03-04 07:21:58.047,fetch-staff,Google,AL
210,5964eb07e4b03efd0c0f267b,True,2017-07-11 15:13:11.771,2021-03-04 19:07:49.770,fetch-staff,,IL


**Save cleaned `users` dataset**

In [137]:
# df_users_final.to_csv(os.path.join('cleaned_csv_data_files', 'users'+'.csv'), index=False, header=True)

## Converting brands data

In [208]:
# Peek at the first two brand rows as loaded, before any cleaning
df_brands.head(2)

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,0.0,
1,{'$oid': '601c5460be37ce2ead43755f'},511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,0.0,STARBUCKS


In [209]:
# Summarize column dtypes and non-null counts of the raw brands frame
df_brands.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   _id           1167 non-null   object 
 1   barcode       1167 non-null   int64  
 2   category      1012 non-null   object 
 3   categoryCode  517 non-null    object 
 4   cpg           1167 non-null   object 
 5   name          1167 non-null   object 
 6   topBrand      555 non-null    float64
 7   brandCode     933 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 73.1+ KB


In [210]:
# Rename the '_id' column to 'id'
df_brands = df_brands.rename({'_id': 'id'}, axis='columns')
df_brands.head(1)

Unnamed: 0,id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,0.0,


In [211]:
# Clean up the format of the id column.
# Each id arrives wrapped as {'$oid': '<hex string>'}; unwrap the whole column in one
# pass rather than row by row via iterrows()/.loc. The isinstance check skips values
# that are not dicts (missing or already unwrapped), so this cell can be re-run safely.
df_brands['id'] = pd.Series(
    [value['$oid'] if isinstance(value, dict) else value for value in df_brands['id']],
    index=df_brands.index,
    dtype=object,  # keep the column as plain Python objects, as before
)

df_brands.head(2)

Unnamed: 0,id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,0.0,
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,0.0,STARBUCKS


In [212]:
# Clean up the cpg column.
# Each cell is a nested dict of the form {'$id': {'$oid': '<hex string>'}, ...}; only the
# inner '$oid' string is kept and any other keys of the dict are discarded.
# The column is unwrapped in one pass rather than row by row via iterrows()/.loc, and
# the isinstance check skips values that are not dicts (missing or already unwrapped),
# so this cell can be re-run safely.
df_brands['cpg'] = pd.Series(
    [value['$id']['$oid'] if isinstance(value, dict) else value for value in df_brands['cpg']],
    index=df_brands.index,
    dtype=object,  # keep the column as plain Python objects, as before
)

df_brands.head(2)

Unnamed: 0,id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,601ac114be37ce2ead437550,test brand @1612366101024,0.0,
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,5332f5fbe4b03c9a25efd0ba,Starbucks,0.0,STARBUCKS


In [213]:
# Check dtypes of the brands frame after unwrapping the id and cpg columns
df_brands.dtypes

id               object
barcode           int64
category         object
categoryCode     object
cpg              object
name             object
topBrand        float64
brandCode        object
dtype: object

In [214]:
# Convert `barcode` to string (object) and `topBrand` to True/False

# Lookup that maps 1 to True and 0 to False in the `topBrand` column
# P.S. We're not using "astype(bool)" as that turns NaN to True which we don't want;
# with map(), values missing from the lookup (including NaN) come out as NaN
one_zero_map = {1.0: True, 0.0: False}

# assign() works on a copy, so df_brands itself is left unchanged
df_brands_cleaned = df_brands.assign(
    barcode=df_brands['barcode'].astype(str),
    topBrand=df_brands['topBrand'].map(one_zero_map),
)

df_brands_cleaned.dtypes

id              object
barcode         object
category        object
categoryCode    object
cpg             object
name            object
topBrand        object
brandCode       object
dtype: object

**Save cleaned `brands` dataset**

In [217]:
# df_brands_cleaned.to_csv(os.path.join('cleaned_csv_data_files', 'brands'+'.csv'), index=False, header=True)