## Chapter 1 Data Preparation and Cleaning

This chapter focuses on data transformation and manipulation.

### Data Cleaning and Transformation

In [1]:
# Import the libraries
import pandas as pd

In [2]:
# Read in sales dataset
sales_df = pd.read_csv('datasets/sales.csv')

In [3]:
# Inspect the first rows
sales_df.head()

Unnamed: 0,Year,Product,line,Product.1,type,Product.2,Order,method,type.1,Retailer,country,Revenue
0,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Telephone,United,States,315044.33
1,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Telephone,Canada,,14313.48
2,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Telephone,Mexico,,156644.47
3,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Telephone,Brazil,,59191.72
4,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Telephone,Japan,,7029.33


In [4]:
# Inspect the last 5 rows
sales_df.tail()

Unnamed: 0,Year,Product,line,Product.1,type,Product.2,Order,method,type.1,Retailer,country,Revenue
95,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Mail,Finland,,6615.84
96,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Mail,Denmark,,52613.47
97,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Mail,France,,41912.85
98,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Mail,Germany,,59479.91
99,2004,Camping,Equipment,Cooking,Gear,TrailChef,Water,Bag,Mail,United,Kingdom,156324.28


In [5]:
# Inspect the dimensions of the DataFrame
sales_df.shape

print(f'This dataset has {sales_df.shape[0]} rows and {sales_df.shape[1]} columns.')

This dataset has 100 rows and 12 columns.


In [6]:
# Check the data types of the DataFrame
sales_df.dtypes

Year           int64
Product       object
line          object
Product.1     object
type          object
Product.2     object
Order         object
method        object
type.1        object
Retailer      object
country       object
Revenue      float64
dtype: object

In [7]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Year       100 non-null    int64  
 1   Product    100 non-null    object 
 2   line       100 non-null    object 
 3   Product.1  100 non-null    object 
 4   type       100 non-null    object 
 5   Product.2  100 non-null    object 
 6   Order      100 non-null    object 
 7   method     100 non-null    object 
 8   type.1     100 non-null    object 
 9   Retailer   100 non-null    object 
 10  country    9 non-null      object 
 11  Revenue    100 non-null    float64
dtypes: float64(1), int64(1), object(10)
memory usage: 9.5+ KB


In [8]:
# Check for missing values
sales_df.isna().sum()

Year          0
Product       0
line          0
Product.1     0
type          0
Product.2     0
Order         0
method        0
type.1        0
Retailer      0
country      91
Revenue       0
dtype: int64

The `country` is missing 91 values.

In [9]:
# Clean the data
# Reorder the columns
cols = ['year', 'product_line', 'product_type','product',
        'retailer_country', 'order_method', 'revenue']

df = (sales_df
      .assign(product_line = sales_df.Product + ' ' + sales_df.line,
              product_type = sales_df['Product.1'] + ' ' + sales_df['type'],
              product = sales_df['Product.2'] + ' ' + sales_df['Order'] + ' ' + sales_df['method'],
              retailer_country = sales_df['Retailer'].replace('United', 'United States')
             )
      .rename(columns = {'type.1':'order_method', 'Revenue':'revenue', 'Year':'year'})
      [cols]  
)

# Inspect the first 5 rows
df.head()

Unnamed: 0,year,product_line,product_type,product,retailer_country,order_method,revenue
0,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,United States,Telephone,315044.33
1,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Canada,Telephone,14313.48
2,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Mexico,Telephone,156644.47
3,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Brazil,Telephone,59191.72
4,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Japan,Telephone,7029.33


In [10]:
df.tail()

Unnamed: 0,year,product_line,product_type,product,retailer_country,order_method,revenue
95,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Finland,Mail,6615.84
96,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Denmark,Mail,52613.47
97,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,France,Mail,41912.85
98,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Germany,Mail,59479.91
99,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,United States,Mail,156324.28


In [11]:
df.columns

Index(['year', 'product_line', 'product_type', 'product', 'retailer_country',
       'order_method', 'revenue'],
      dtype='object')

### Converting the Code into a Function


In [15]:
def tweak_sales(sales_df):
    """
    Clean and transform sales dataset. 
    i). Use your subject matter expertise to transform the columns as needed.
    """
    
    # Reorder columns
    cols = ['year', 'product_line', 'product_type','product',
        'retailer_country', 'order_method', 'revenue']
    
    # Return a cleaned DataFrame
    return((sales_df
            .assign(product_line = sales_df['Product'] + ' ' + sales_df['line'],
                    product_type = sales_df['Product.1'] + ' ' + sales_df['type'],
                    product = sales_df['Product.2'] + ' ' + sales_df['Order'] + ' ' + sales_df['method'],
                    retailer_country = sales_df['Retailer'].replace('United', 'United States')
                )
            .rename(columns = {'type.1':'order_method', 'Revenue':'revenue', 'Year':'year'})
            [cols]
        )
    )
    

Now, let's test our new function

In [16]:
df = tweak_sales(sales_df)

# Inspect the first 10 rows
df.head(10)

Unnamed: 0,year,product_line,product_type,product,retailer_country,order_method,revenue
0,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,United States,Telephone,315044.33
1,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Canada,Telephone,14313.48
2,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Mexico,Telephone,156644.47
3,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Brazil,Telephone,59191.72
4,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Japan,Telephone,7029.33
5,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Korea,Telephone,52613.47
6,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,China,Telephone,41912.85
7,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Singapore,Telephone,59479.91
8,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Australia,Telephone,156324.28
9,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Netherlands,Telephone,23042.67


In [18]:
# Inspect the last 15 rows
df.tail(15)

Unnamed: 0,year,product_line,product_type,product,retailer_country,order_method,revenue
85,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Canada,Mail,6615.84
86,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Mexico,Mail,52613.47
87,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Brazil,Mail,41912.85
88,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Japan,Mail,59479.91
89,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Korea,Mail,156324.28
90,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,China,Mail,23042.67
91,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Singapore,Mail,59479.91
92,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Australia,Mail,66446.59
93,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Netherlands,Mail,13620.11
94,2004,Camping Equipment,Cooking Gear,TrailChef Water Bag,Sweden,Mail,77447.93
