## Tannis McCartney
## May 26, 2022

### This notebook goes through importing, wrangling, and checking the products dataframe and departments data dictionary in preparation for merging with the orders dataframe.

## Contents
### 01 Import libraries
### 02 Import products data
### 03 Data wrangling products data
### 04 Consistency checks on products data
### 05 Change products data types to reduce memory usage
### 06 Import departments data
### 07 Data wrangling of departments data
### 08 Convert departments dataframe to data dictionary
### 09 Export products and departments data

# 01 Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02 Import products data

In [2]:
# Turn project folder path into a string
path = r'C:\Users\tmmcc\Google Drive\Data Analytics Bootcamp\4 Python Fundamentals for Data Analysts\05-2022 Instacart Basket Analysis'

In [3]:
# Import products.csv to df_prods
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col=False)
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [4]:
# Check shape of df_prods
df_prods.shape

(49693, 5)

# 03 Wrangling products data

#### No wrangling is needed for the products dataframe

# 04 Consistency checks on products dataframe

In [5]:
# Check for mixed types in the products dataframe
for col in df_prods.columns.tolist():
    weird = (df_prods[[col]].applymap(type) != df_prods[[col]].iloc[0].apply(type)).any(axis=1)
    if len (df_prods[weird]) > 0:
        print(col)

product_name


#### The product_name column has mixed-type data. It will be changed to string.

In [6]:
# Change product_name data type to string
df_prods['product_name'] = df_prods['product_name'].astype('string')
df_prods.dtypes

product_id         int64
product_name      string
aisle_id           int64
department_id      int64
prices           float64
dtype: object

In [7]:
# Find missing observations in products dataframe
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

#### There are 16 rows with missing data in the product_name column. The client may be able to provide the product_names for these items.

In [8]:
# Create a subset of the products dataframe to see the rows with missing values
df_nan = df_prods[df_prods['product_name'].isnull()==True]
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [9]:
# Copy df_nan to clipboard so client can check product names for these 16 items
df_nan.to_clipboard()

In [10]:
# Look for full duplicates in the products dataframe
df_dups = df_prods[df_prods.duplicated()]
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [11]:
# Create a products dataframe that doesn't include the duplicates
df_prods_no_dups = df_prods.drop_duplicates()
df_prods_no_dups.shape

(49688, 5)

In [12]:
# Describe the products dataframe
df_prods_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49688.0,49688.0,49688.0,49688.0
mean,24844.50004,67.769582,11.728687,9.994254
std,14343.834402,38.316162,5.85041,453.542503
min,1.0,1.0,1.0,1.0
25%,12422.75,35.0,7.0,4.1
50%,24844.5,69.0,13.0,7.1
75%,37266.25,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


#### The maximum price is high. It is probably incorrect. This needs to be fixed.

In [13]:
# Show the rows of the dataframe with the high prices
df_prods_no_dups.loc[df_prods_no_dups['prices'] > 100]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


In [None]:
# Copy to clipboard
df_prods_no_dups.loc[df_prods_no_dups['prices'] > 100].to_clipboard()

In [14]:
# Turn the incorrect prices into NaNs (NOT best practice in real world)
df_prods_no_dups.loc[df_prods_no_dups['prices'] > 100, 'prices']=np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


#### While the two likely incorrect prices have been changed to NaN for now, the correct values should be obtained from the client and the analysis rerun.

In [15]:
# Check results
df_prods_no_dups['prices'].max()

25.0

# 05 Change products data types to reduce memory usage

In [16]:
# Check data types and memory usage for df_prods_no_dups
df_prods_no_dups.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49688 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49688 non-null  int64  
 1   product_name   49672 non-null  string 
 2   aisle_id       49688 non-null  int64  
 3   department_id  49688 non-null  int64  
 4   prices         49686 non-null  float64
dtypes: float64(1), int64(3), string(1)
memory usage: 2.3 MB


In [17]:
# Change data types for df_prods_no_dups
df_prods_no_dups['product_id'] = df_prods_no_dups['product_id'].astype('int32')
df_prods_no_dups['aisle_id'] = df_prods_no_dups['aisle_id'].astype('int16')
df_prods_no_dups['department_id'] = df_prods_no_dups['department_id'].astype('int16')
df_prods_no_dups['prices'] = df_prods_no_dups['prices'].astype('float32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prods_no_dups['product_id'] = df_prods_no_dups['product_id'].astype('int32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prods_no_dups['aisle_id'] = df_prods_no_dups['aisle_id'].astype('int16')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prods_no_dups['department_id'] = df_prods_no_d

In [18]:
# Recheck data types and memory usage for df_prods_no_dups
df_prods_no_dups.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49688 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49688 non-null  int32  
 1   product_name   49672 non-null  string 
 2   aisle_id       49688 non-null  int16  
 3   department_id  49688 non-null  int16  
 4   prices         49686 non-null  float32
dtypes: float32(1), int16(2), int32(1), string(1)
memory usage: 1.3 MB


#### The memory usage was reduced from 3.3 Mb to 2.3 Mb

In [19]:
# Check statistics for df_prods_no_dups
df_prods_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49688.0,49688.0,49688.0,49686.0
mean,24844.50004,67.769582,11.728687,7.682189
std,14343.834402,38.316162,5.85041,4.200362
min,1.0,1.0,1.0,1.0
25%,12422.75,35.0,7.0,4.1
50%,24844.5,69.0,13.0,7.1
75%,37266.25,100.0,17.0,11.2
max,49688.0,134.0,21.0,25.0


# 06 Import departments data

In [20]:
# Import departments.csv to df
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'), index_col=False)
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


# 07 Data wrangling of departments data

In [21]:
# Transpose departments dataframe
df_dep_t=df_dep.T
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [22]:
# Add an index to transposed dataframe
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [23]:
# Create a new header for transposed dataframe
new_header = df_dep_t.iloc[0]
new_header

0    department
Name: department_id, dtype: object

In [24]:
# Create a new dataframe that starts after the first row of dataframe
df_dep_t_new = df_dep_t[1:]
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [25]:
# Add new header to new dataframe
df_dep_t_new.columns = new_header
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 08 Turn departments dataframe into data dictionary

In [26]:
#Turn transposed departments dataframe int a dictionary
data_dict = df_dep_t_new.to_dict('index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

# 09  Export products and departments data

In [27]:
# Export df_prods to pkl
df_prods_no_dups.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'products.pkl'))

In [28]:
# Export the transformed departments dataframe
df_dep_t_new.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'))