# <ins>Orders cleanup</ins>

# Import pandas and upload data.

In [1]:
import pandas as pd

In [2]:
url_brands = 'https://drive.google.com/file/d/1m1ThDDIYRTTii-rqM5SEQjJ8McidJskD/view?usp=sharing' # brands.csv
path_brands = "https://drive.google.com/uc?export=download&id="+url_brands.split("/")[-2]

brands = pd.read_csv(path_brands)

In [3]:
url_ol = 'https://drive.google.com/file/d/1FYhN_2AzTBFuWcfHaRuKcuCE6CWXsWtG/view?usp=sharing' # orderlines.csv
path_ol = "https://drive.google.com/uc?export=download&id="+url_ol.split("/")[-2]

orderlines = pd.read_csv(path_ol)

In [4]:
url_orders = 'https://drive.google.com/file/d/1Vu0q91qZw6lqhIqbjoXYvYAQTmVHh6uZ/view?usp=sharing' # orders.csv
path_orders = "https://drive.google.com/uc?export=download&id="+url_orders.split("/")[-2]

orders = pd.read_csv(path_orders)

In [5]:
url_products = 'https://drive.google.com/file/d/1afxwDXfl-7cQ_qLwyDitfcCx3u7WMvkU/view?usp=drive_link' # products.csv
path_products = "https://drive.google.com/uc?export=download&id="+url_products.split("/")[-2]

products = pd.read_csv(path_products)

# Have a look at orders

In [6]:
orders

Unnamed: 0,order_id,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled
...,...,...,...,...
226904,527397,2018-03-14 13:56:38,42.99,Place Order
226905,527398,2018-03-14 13:57:25,42.99,Shopping Basket
226906,527399,2018-03-14 13:57:34,141.58,Shopping Basket
226907,527400,2018-03-14 13:57:41,19.98,Shopping Basket


In [7]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226909 entries, 0 to 226908
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      226909 non-null  int64  
 1   created_date  226909 non-null  object 
 2   total_paid    226904 non-null  float64
 3   state         226909 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.9+ MB


Check for duplicates (there are none).

In [8]:
orders.duplicated().sum()

0

# Clean up orders
TODO:
* Get rid of all orders != completed √
* Then get rid of column 'state' √
* Check null values in 'total_paid (and drop rows?) √
* Turn 'created_date' into type datetime √
* Investigate 'total_paid' - is it a mess? √
* Set 'order_id' as index √
* Save tidy csv to drive √

## Get rid of all orders != completed
226,909 rows before, 46,605 after (kept 20.5% of rows).

In [9]:
orders = orders.loc[orders["state"] == "Completed"]

## Then get rid of column 'state'

In [10]:
orders = orders.loc[:, "order_id":"total_paid"]

## Check null values in 'total_paid
They were already dropped in previous step.

In [11]:
orders["total_paid"].isna().sum()

0

## Turn 'created_date' into type datetime

In [12]:
orders["created_date"] = pd.to_datetime(orders["created_date"])

## Investigate 'total_paid' - is it a mess?
Seems fine.

Define pandas display format.

In [13]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_rows', 100)

In [14]:
orders.loc[:,"total_paid"].tolist()

[136.15,
 15.76,
 84.98,
 149.0,
 112.97,
 183.52,
 211.95,
 407.96,
 167.98,
 153.54,
 54.99,
 140.99,
 505.76,
 610.07,
 1610.0,
 497.68,
 292.04,
 35.91,
 34.98,
 56.98,
 283.83,
 386.18,
 20.58,
 20.97,
 260.0,
 1367.11,
 40.76,
 7.78,
 2264.6,
 195.74,
 24.98,
 1132.33,
 225.16,
 192.98,
 44.98,
 109.98,
 3109.57,
 122.97,
 372.98,
 80.96,
 156.98,
 141.97,
 147.48,
 121.46,
 127.02,
 470.33,
 150.02,
 302.39,
 75.98,
 17.98,
 723.69,
 84.17,
 73.23,
 279.99,
 407.59,
 139.27,
 213.98,
 114.98,
 3.98,
 170.99,
 118.92,
 29.99,
 179.98,
 52.99,
 795.85,
 219.67,
 2616.99,
 64.99,
 174.97,
 34.98,
 24.98,
 16.98,
 72.97,
 697.72,
 59.99,
 130.44,
 42.99,
 49.94,
 342.97,
 3047.6,
 120.73,
 56.98,
 553.98,
 79.99,
 117.4,
 54.99,
 3385.99,
 94.23,
 351.97,
 24.97,
 267.98,
 58.28,
 15.95,
 819.99,
 171.98,
 114.98,
 318.93,
 19.97,
 540.16,
 49.98,
 34.97,
 71.48,
 54.99,
 446.45,
 2635.47,
 173.98,
 687.21,
 120.2,
 33.98,
 85.99,
 51.98,
 72.19,
 39.98,
 0.0,
 233.97,
 81.97,
 132.

## Set 'order_id' as index

In [15]:
orders = orders.set_index("order_id")
orders

Unnamed: 0_level_0,created_date,total_paid
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
241423,2017-11-06 13:10:02,136.15
242832,2017-12-31 17:40:03,15.76
243330,2017-02-16 10:59:38,84.98
245275,2017-06-28 11:35:37,149.00
245595,2017-01-21 12:52:47,112.97
...,...,...
527042,2018-03-14 11:47:50,18.98
527070,2018-03-14 11:50:48,24.97
527074,2018-03-14 11:51:42,24.97
527096,2018-03-14 11:58:40,34.96


## Save tidy csv to drive

In [16]:
from google.colab import drive

drive.mount('drive', force_remount=True)
orders.to_csv('/content/drive/My Drive/Bootcamp/03 data cleaning/data/orders_tidy.csv')

Mounted at drive
