# Valerie Magalong - Advanced Pandas: Transforming Data Challenge

## Wednesday, July 26, 2023
---

In [12]:
import pandas as pd

# Load the Northwind products/suppliers workbook; original_df stays untouched as the raw reference
original_df = pd.read_excel(io="Northwind_products_suppliers.xlsx")

# All later transformations operate on an independent deep copy, never on original_df itself
northwind_df = original_df.copy(deep=True)

___

### 1. What data is in the first 4 rows?

In [2]:
# Positional slice of the first four rows (same rows head(4) would show)
northwind_df.iloc[:4]

Unnamed: 0,product_id,product_name,quantity_per_unit,unit_price,units_in_stock,units_on_order,reorder_level,discontinued,category_name,company_name,country,website
0,1,Chai,10 boxes x 30 bags,18.0,39,0,10.0,1,Beverages,"Specialty Biscuits, Ltd.",UK,/
1,2,Chang,24 - 12 oz bottles,19.0,17,40,25.0,1,Beverages,Exotic Liquids,UK,
2,2,Chang,24 - 12 oz bottles,19.0,17,40,25.0,1,Beverages,Exotic Liquids,UK,
3,3,Aniseed Syrup,12 - 550 ml bottles,10.0,13,70,25.0,0,Condiments,Exotic Liquids,UK,""""""


___

### 2. Calculate how many rows and columns are in this file.

In [3]:
# DataFrame.shape is a (row_count, column_count) tuple, so both sizes unpack in one step
df_rows, df_columns = northwind_df.shape

print(
    f"There are {df_rows} rows in the Northwind dataframe.",
    f"There are {df_columns} columns in the Northwind dataframe.",
    sep="\n",
)

There are 78 rows in the Northwind dataframe.
There are 12 columns in the Northwind dataframe.


In [4]:
# Cross-check the row and column counts with info(), which also reports
# each column's non-null count and dtype (useful for spotting missing values)
northwind_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   product_id         78 non-null     int64  
 1   product_name       78 non-null     object 
 2   quantity_per_unit  78 non-null     object 
 3   unit_price         78 non-null     float64
 4   units_in_stock     78 non-null     int64  
 5   units_on_order     78 non-null     int64  
 6   reorder_level      76 non-null     float64
 7   discontinued       78 non-null     int64  
 8   category_name      78 non-null     object 
 9   company_name       78 non-null     object 
 10  country            78 non-null     object 
 11  website            6 non-null      object 
dtypes: float64(2), int64(4), object(6)
memory usage: 7.4+ KB


___

### 3. What’s the maximum unit_price?

In [5]:
# Select the unit_price column by label and take its largest value
max_unit_price = northwind_df.loc[:, 'unit_price'].max()

print(f"The maximum value in the unit_price column is: ${max_unit_price}")

The maximum value in the unit_price column is: $263.5


In [6]:
# Alternative: read the single 'max' / 'unit_price' cell out of the describe() summary table
northwind_df.describe().at['max', 'unit_price']

263.5

___

### 4. Use describe() to get the descriptive statistics for this file.

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns;
# the text (object) columns are left out of the default describe() output
northwind_df.describe()

Unnamed: 0,product_id,unit_price,units_in_stock,units_on_order,reorder_level,discontinued
count,78.0,78.0,78.0,78.0,76.0,78.0
mean,38.525641,28.707821,40.205128,10.512821,12.631579,0.141026
std,22.617499,33.627362,36.010229,23.237757,10.907908,0.350301
min,1.0,2.5,0.0,0.0,0.0,0.0
25%,19.25,13.0625,15.0,0.0,0.0,0.0
50%,38.5,19.475,26.0,0.0,12.5,0.0
75%,57.75,33.1375,60.0,0.0,25.0,0.0
max,77.0,263.5,125.0,100.0,30.0,1.0


___

### 5. Replace any NULL values in reorder_level with 0.

In [8]:
# Show every row whose reorder_level is missing (boolean-mask selection)
northwind_df[northwind_df['reorder_level'].isna()]

Unnamed: 0,product_id,product_name,quantity_per_unit,unit_price,units_in_stock,units_on_order,reorder_level,discontinued,category_name,company_name,country,website
6,6,Grandma's Boysenberry Spread,12 - 8 oz jars,25.0,120,0,,0,Condiments,Grandma Kelly's Homestead,USA,2ww
24,71,Flotemysost,10 - 500 g pkgs.,21.5,26,0,,0,Dairy Products,Norske Meierier,Norway,W


In [9]:
# Summing the missing-value mask counts its True entries, i.e. the number of NaNs
count_nulls_in_reorder_level = northwind_df['reorder_level'].isna().sum()

print(f"The current number of nulls in reorder_level is: {count_nulls_in_reorder_level}")

The current number of nulls in reorder_level is: 2


In [10]:
# Build a copy of the reorder_level column in which every missing value becomes 0
reorder_level_no_nulls = northwind_df['reorder_level'].fillna(value=0)

# Overwrite the column in northwind_df with the filled Series
northwind_df['reorder_level'] = reorder_level_no_nulls

# Re-run the missing-value lookup; an empty frame confirms no NaNs remain in reorder_level
northwind_df[northwind_df['reorder_level'].isna()]

Unnamed: 0,product_id,product_name,quantity_per_unit,unit_price,units_in_stock,units_on_order,reorder_level,discontinued,category_name,company_name,country,website


In [11]:
# Recount the missing reorder_level values after the fill
count_nulls_in_reorder_level = northwind_df['reorder_level'].isna().sum()

print(f"The number of nulls in reorder_level is now: {count_nulls_in_reorder_level}")

The number of nulls in reorder_level is now: 0


___

### 6. Remove duplicate rows.

In [16]:
# List rows that repeat an earlier row exactly (the first occurrence is not flagged)
northwind_df[northwind_df.duplicated(keep='first')]

Unnamed: 0,product_id,product_name,quantity_per_unit,unit_price,units_in_stock,units_on_order,reorder_level,discontinued,category_name,company_name,country,website
2,2,Chang,24 - 12 oz bottles,19.0,17,40,25.0,1,Beverages,Exotic Liquids,UK,


In [19]:
# New frame with exact duplicate rows removed, keeping the first occurrence of each;
# northwind_df itself is left unchanged
nw_df_no_dupes = northwind_df.drop_duplicates(keep='first')

# Sanity check: an empty result here means nw_df_no_dupes contains no duplicated rows
nw_df_no_dupes[nw_df_no_dupes.duplicated()]

Unnamed: 0,product_id,product_name,quantity_per_unit,unit_price,units_in_stock,units_on_order,reorder_level,discontinued,category_name,company_name,country,website


___

### 7. Change "Gustaf's Kn√§ckebr√∂d" To "Gustaf's Knäckebröd".

### 8. Remove the column for website.

### 9. Create a crosstab, with category_name in rows and country in columns.

### 10. Create a pivot table with average unit_price by category.

### 11. Create a pivot table with average reorder_level by category (rows) and country (columns).

### 12. Create a groupby table with sum of discontinued products by company name.

### 13. Create a groupby table with maximum unit_price by country and category (both in rows).