In [1]:
import json
import pandas as pd

In [2]:
# Define a function to get column names from schemas.json
def get_column_names(schemas: dict, ds_name: str, sorting_key='column_position'):
    """Return the column names of one dataset, ordered by a schema field.

    Args:
        schemas: Mapping from dataset name to an iterable of column-detail
            dicts. Each dict must provide 'column_name' and the field named
            by `sorting_key`.
        ds_name: Key of the dataset to look up in `schemas`.
        sorting_key: Field of each column-detail dict used to order the
            columns, in ascending order. Defaults to 'column_position'.

    Returns:
        A list of the 'column_name' values in ascending `sorting_key` order.

    Raises:
        KeyError: If `ds_name` is not in `schemas`, or a column-detail dict
            lacks `sorting_key` or 'column_name'.
    """
    ordered_details = list(schemas[ds_name])
    ordered_details.sort(key=lambda detail: detail[sorting_key])
    return [detail['column_name'] for detail in ordered_details]

In [3]:
# Read the schemas as a dict through json. The `with` block closes the file
# handle as soon as parsing finishes instead of leaving it open.
with open('../data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)

In [4]:
# Column names of the 'orders' dataset, ordered by the helper's default sorting key
orders_columns = get_column_names(schemas, ds_name='orders')

In [5]:
# Load the orders CSV, labelling its columns with the names in orders_columns
orders = pd.read_csv(
    '../data/retail_db/orders/part-00000',
    names=orders_columns,
)

In [6]:
# Display the orders DataFrame as this cell's output
orders

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [7]:
# IPython `?` suffix: show the signature and documentation of sort_values
orders.sort_values?

Signature:
orders.sort_values(
    by: 'IndexLabel',
    *,
    axis: 'Axis' = 0,
    ascending: 'bool | list[bool] | tuple[bool, ...]' = True,
    inplace: 'bool' = False,
    kind: 'SortKind' = 'quicksort',
    na_position: 'str' = 'last',
    ignore_index: 'bool' = False,
    key: '

In [8]:
# Order the rows by customer id, smallest first (ascending is the default)
orders.sort_values(by='order_customer_id')

Unnamed: 0,order_id,order_date,order_customer_id,order_status
22944,22945,2013-12-13 00:00:00.0,1,COMPLETE
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
...,...,...,...,...
42914,42915,2014-04-16 00:00:00.0,12434,COMPLETE
4798,4799,2013-08-23 00:00:00.0,12434,PENDING_PAYMENT
13543,13544,2013-10-16 00:00:00.0,12434,PENDING
41642,41643,2014-04-08 00:00:00.0,12435,PENDING


In [9]:
# Order the rows by customer id, largest first
orders.sort_values(by='order_customer_id', ascending=False)

Unnamed: 0,order_id,order_date,order_customer_id,order_status
41642,41643,2014-04-08 00:00:00.0,12435,PENDING
61628,61629,2013-12-21 00:00:00.0,12435,CANCELED
6159,6160,2013-09-02 00:00:00.0,12434,COMPLETE
4798,4799,2013-08-23 00:00:00.0,12434,PENDING_PAYMENT
5302,5303,2013-08-26 00:00:00.0,12434,PENDING
...,...,...,...,...
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE


In [10]:
# Customer id ascending, then order date descending within each customer;
# the ascending flags pair up positionally with the sort keys.
orders.sort_values(by=['order_customer_id', 'order_date'], ascending=[True, False])

Unnamed: 0,order_id,order_date,order_customer_id,order_status
22944,22945,2013-12-13 00:00:00.0,1,COMPLETE
33864,33865,2014-02-18 00:00:00.0,2,COMPLETE
67862,67863,2013-11-30 00:00:00.0,2,COMPLETE
15191,15192,2013-10-29 00:00:00.0,2,PENDING_PAYMENT
57962,57963,2013-08-02 00:00:00.0,2,ON_HOLD
...,...,...,...,...
5302,5303,2013-08-26 00:00:00.0,12434,PENDING
4798,4799,2013-08-23 00:00:00.0,12434,PENDING_PAYMENT
1867,1868,2013-08-03 00:00:00.0,12434,CLOSED
41642,41643,2014-04-08 00:00:00.0,12435,PENDING
