# <ins> Orderlines cleanup</ins>

# Import pandas and upload data.

In [1]:
import pandas as pd

In [None]:
url_brands = 'https://drive.google.com/file/d/1m1ThDDIYRTTii-rqM5SEQjJ8McidJskD/view?usp=sharing' # brands.csv
path_brands = "https://drive.google.com/uc?export=download&id="+url_brands.split("/")[-2]

brands = pd.read_csv(path_brands)

In [2]:
url_ol = 'https://drive.google.com/file/d/1FYhN_2AzTBFuWcfHaRuKcuCE6CWXsWtG/view?usp=sharing' # orderlines.csv
path_ol = "https://drive.google.com/uc?export=download&id="+url_ol.split("/")[-2]

orderlines = pd.read_csv(path_ol)

In [None]:
url_orders = 'https://drive.google.com/file/d/1Vu0q91qZw6lqhIqbjoXYvYAQTmVHh6uZ/view?usp=sharing' # orders.csv
path_orders = "https://drive.google.com/uc?export=download&id="+url_orders.split("/")[-2]

orders = pd.read_csv(path_orders)

In [None]:
url_products = 'https://drive.google.com/file/d/1afxwDXfl-7cQ_qLwyDitfcCx3u7WMvkU/view?usp=drive_link' # products.csv
path_products = "https://drive.google.com/uc?export=download&id="+url_products.split("/")[-2]

products = pd.read_csv(path_products)

# Have a look at orderlines

In [None]:
orderlines

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
0,1119109,299539,0,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,0,1,LGE0043,399.00,2017-01-01 00:19:45
2,1119111,299541,0,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,0,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,0,1,JBL0104,23.74,2017-01-01 01:06:38
...,...,...,...,...,...,...,...
293978,1650199,527398,0,1,JBL0122,42.99,2018-03-14 13:57:25
293979,1650200,527399,0,1,PAC0653,141.58,2018-03-14 13:57:34
293980,1650201,527400,0,2,APP0698,9.99,2018-03-14 13:57:41
293981,1650202,527388,0,1,BEZ0204,19.99,2018-03-14 13:58:01


In [None]:
orderlines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


In [None]:
orderlines.duplicated().sum()

0

# Clean up orderlines


TODO
- Set 'id' as index √
- Drop 'product_id' √
- 'unit_price' should be type float √
- 'date' should be type datetime √
- Save tidy csv to drive √

## Set 'id' as index

In [None]:
orderlines = orderlines.set_index('id')

## Get rid of product_id.

In [None]:
orderlines = orderlines.drop("product_id", axis=1)
orderlines.head(1)

Unnamed: 0_level_0,id_order,product_quantity,sku,unit_price,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1119109,299539,1,OTT0133,18.99,2017-01-01 00:07:19


## Format unit_price correctly.

In [None]:
# pd.to_numeric(orderlines["unit_price"])

Not possible because some values are formatted incorrectly. How many?

In [None]:
wrong_price = orderlines.loc[orderlines["unit_price"].str.contains('\d+\.\d+\.\d+'), "unit_price"].tolist()
len(wrong_price)

36169

In percentage that is:

In [None]:
(len(wrong_price) * 100) / orderlines.shape[0]

12.30309235568043

In [None]:
wrong_price_series = pd.Series(wrong_price)
wrong_price_series.sample(20)

15113    1.842.59
21052    2.371.59
23573    3.528.59
24853    1.329.00
15773    2.679.00
29322    2.653.59
22386    1.399.00
31147    1.159.00
6972     1.159.00
20413    3.597.59
2070     3.124.99
34176    1.067.00
7923     1.269.00
23920    1.159.00
4122     1.016.99
4337     1.019.00
17752    1.886.59
18779    2.398.60
16015    3.938.59
3696     2.068.99
dtype: object

Let's try comparing orderline's incorrect 'unit_price's with products 'price's. This shows is that the unit price is reasonable approximate to base price if we disregard that first dot.

In [None]:
compare_df = orderlines.merge(products, how='left', on='sku')

In [None]:
compare_df = compare_df.loc[compare_df["unit_price"].isin(wrong_price), ["sku", "unit_price", "price"]]
compare_df.sample(10)

Unnamed: 0,sku,unit_price,price
237325,APP2085,2.058.59,2255.59
335328,PAC2158,6.766.59,7289.0
77349,APP0958,1.348.99,1449.0
25653,APP1821,1.798.99,1939.0
376914,APP2277,2.217.00,2359.0
89665,PAC1055,3.366.99,3949.0
98975,PAC1055,3.366.99,3949.0
204649,APP2068,1.435.59,1505.59
124800,PAC1596,2.953.99,4009.0
36062,PAC1593,2.647.99,3409.0


So let's get rid of that first dot.

In [None]:
# Create a new column so we do not mess up anything by accident
orderlines["new_unit_price"] = orderlines["unit_price"]

# Create a mask for the incorrect values
mask = orderlines["unit_price"].isin(wrong_price)

# Use the mask to remove the first period in incorrect unit prices
orderlines.loc[mask, "new_unit_price"] = orderlines.loc[mask, "new_unit_price"].str.replace('\.', '', 1, regex=True)

In [None]:
orderlines.loc[orderlines["unit_price"].isin(wrong_price), ["sku", "unit_price", "new_unit_price"]]

Unnamed: 0_level_0,sku,unit_price,new_unit_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1119115,APP1582,1.137.99,1137.99
1119126,PAC0929,2.565.99,2565.99
1119131,APP1854,3.278.99,3278.99
1119195,PAC0961,2.616.99,2616.99
1119214,PAC1599,2.873.99,2873.99
...,...,...,...
1649999,APP2075,2.999.00,2999.00
1650045,PAC2148,3.497.00,3497.00
1650050,PAC2117,3.075.00,3075.00
1650088,APP2492,1.329.00,1329.00


Get rid of "new_unit_price" and we should be good to go.

In [None]:
orderlines["unit_price"] = orderlines["new_unit_price"]

In [None]:
orderlines = orderlines.drop("new_unit_price", axis=1)

## Turn 'unit_price' into type float and date into type datetime

In [None]:
orderlines["unit_price"] = pd.to_numeric(orderlines["unit_price"])
orderlines["date"] = pd.to_datetime(orderlines["date"])

## Save tidy csv to drive

In [None]:
from google.colab import drive

drive.mount('drive', force_remount=True)
orderlines.to_csv('/content/drive/My Drive/Bootcamp/03 data cleaning/data/orderlines_tidy.csv')

Mounted at drive
