# ETL in Python

In [54]:
# glob is used to find the source files of each type (CSV, JSON, XML)
import glob

# Discover the source files for each supported format.
# glob returns paths in an arbitrary, OS-dependent order, so the results are
# sorted to keep the row order of the combined data reproducible across runs.

# All CSV files
list_csv = sorted(glob.glob('./datasource/*.csv'))

# All JSON files
list_json = sorted(glob.glob('./datasource/*.json'))

# All XML files. The pattern includes the dot ('*.xml') so that only files with
# the .xml extension match, not every name that merely ends in "xml".
list_xml = sorted(glob.glob('./datasource/*.xml'))
print(list_xml)

['./datasource\\used_car_prices1.xml', './datasource\\used_car_prices2.xml', './datasource\\used_car_prices3.xml']


## 1. Extract


### Extract csv file

In [47]:
import pandas as pd

def extract_from_csv(file):
    """Read one CSV file into a DataFrame.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using the ``read_csv`` defaults.
    """
    return pd.read_csv(file)

### Extract json file

In [76]:
def extract_from_json(file):
    """Read one JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything ``pandas.read_json`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per line of the input.
    """
    return pd.read_json(file, lines=True)

# Preview one JSON source file; as the last expression in the cell, the
# resulting DataFrame is rendered as the cell output.
pd.read_json(
    './datasource/used_car_prices1.json',
    lines=True,
)

Unnamed: 0,car_model,year_of_manufacture,price,fuel
0,ritz,2012,4626.865672,Diesel
1,ritz,2011,3507.462687,Petrol
2,swift,2014,7388.059701,Diesel
3,ertiga,2014,8955.223881,Diesel
4,dzire,2014,8208.955224,Diesel
5,sx4,2011,4402.985075,CNG
6,dzire,2015,6940.298507,Petrol
7,800,2003,522.38806,Petrol
8,alto k10,2016,4477.61194,Petrol
9,sx4,2003,3358.208955,Petrol


### Extract xml file

In [90]:
def extract_from_xml(file):
    """Read one XML file into a DataFrame.

    ``pandas.read_xml`` uses the optional third-party ``lxml`` parser by
    default and raises ``ImportError`` when it is not installed. In that case
    this function retries with the standard-library ``etree`` parser so the
    pipeline still works without ``lxml``.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything ``pandas.read_xml`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per child element of the document root (``read_xml`` defaults).
    """
    try:
        return pd.read_xml(file)
    except ImportError:
        # lxml is unavailable: fall back to the stdlib ElementTree parser.
        return pd.read_xml(file, parser='etree')

# Preview one XML source file; as the last expression in the cell, the
# resulting DataFrame is rendered as the cell output.
pd.read_xml(
    './datasource/used_car_prices1.xml',
)


Unnamed: 0,car_model,year_of_manufacture,price,fuel
0,corolla altis,2013,10373.134328,Petrol
1,etios cross,2015,6716.41791,Petrol
2,fortuner,2014,27985.074627,Diesel
3,fortuner,2015,35074.626866,Diesel
4,fortuner,2017,49253.731343,Diesel
5,etios liva,2014,7089.552239,Diesel
6,innova,2017,29477.61194,Petrol
7,fortuner,2010,13805.970149,Diesel
8,corolla altis,2011,6492.537313,Petrol
9,corolla altis,2016,21268.656716,Petrol


### Extract function

In [97]:

def extract():
    """Combine every discovered CSV, JSON and XML source file into one DataFrame.

    Reads the module-level path lists ``list_csv``, ``list_json`` and
    ``list_xml`` (in that order) and parses each file with the matching
    ``extract_from_*`` helper.

    The frames are collected in a list and concatenated once.
    ``DataFrame.append`` was removed in pandas 2.0, and appending inside a
    loop re-copies the accumulated rows on every iteration.

    Returns
    -------
    pandas.DataFrame
        All rows stacked with a fresh 0..n-1 index, or an empty DataFrame
        when no source files were found.
    """
    sources = (
        (list_csv, extract_from_csv),
        (list_json, extract_from_json),
        (list_xml, extract_from_xml),
    )
    frames = [reader(path) for paths, reader in sources for path in paths]

    # pd.concat raises ValueError on an empty list; return an empty frame
    # instead so a run with no input files still yields a DataFrame.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)
# Preview the combined data; as the last expression in the cell, the returned
# DataFrame is rendered as the cell output.
extract()

Unnamed: 0,car_model,year_of_manufacture,price,fuel
0,ritz,2014,5000.000000,Petrol
1,sx4,2013,7089.552239,Diesel
2,ciaz,2017,10820.895522,Petrol
3,wagon r,2011,4253.731343,Petrol
4,swift,2014,6865.671642,Diesel
...,...,...,...,...
85,camry,2006,3731.343284,Petrol
86,land cruiser,2010,52238.805970,Diesel
87,corolla altis,2012,8805.970149,Petrol
88,etios liva,2013,5149.253731,Petrol


## 2. Transform

### Round the price column to 2 decimal places

In [101]:
def transform(data):
    """Return a copy of ``data`` with the ``price`` column rounded to 2 decimals.

    The input frame is left untouched, so re-running a cell that calls this
    function does not silently change a frame displayed or reused elsewhere.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric ``price`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and rounded ``price`` values.

    Raises
    ------
    KeyError
        If ``data`` has no ``price`` column.
    """
    return data.assign(price=data['price'].round(2))


# Backward-compatible alias: keeps the original (misspelled) name working
# for existing callers.
transfrom = transform

# Preview the transformed data on a fresh extract; as the last expression in
# the cell, the returned DataFrame is rendered as the cell output.
transfrom(extract())

Unnamed: 0,car_model,year_of_manufacture,price,fuel
0,ritz,2014,5000.00,Petrol
1,sx4,2013,7089.55,Diesel
2,ciaz,2017,10820.90,Petrol
3,wagon r,2011,4253.73,Petrol
4,swift,2014,6865.67,Diesel
...,...,...,...,...
85,camry,2006,3731.34,Petrol
86,land cruiser,2010,52238.81,Diesel
87,corolla altis,2012,8805.97,Petrol
88,etios liva,2013,5149.25,Petrol


## 3. Loading


In [112]:
def load(targetfile, data_to_load):
    """Write ``data_to_load`` to ``targetfile`` as CSV.

    When ``targetfile`` is a filesystem path, its parent directory is created
    first if it does not exist, so the pipeline also runs on a fresh checkout
    where the output folder has not been created yet. The file is written
    with the ``DataFrame.to_csv`` defaults.

    Parameters
    ----------
    targetfile : str or path-like or file-like
        Destination passed to ``DataFrame.to_csv``.
    data_to_load : pandas.DataFrame
        The data to write.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Only path targets have a directory to prepare; file-like objects are
    # passed through unchanged.
    if isinstance(targetfile, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(targetfile))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    data_to_load.to_csv(targetfile)

## 4. Running ETL process

In [114]:
# Run the full pipeline: extract -> transform -> load.

# Extract: gather every source file into one DataFrame.
extracted_data = extract()

# Transform: round the prices to two decimal places.
# The transform function is available under the name `transfrom` in the
# Transform section of this notebook, so that is the name called here.
transformed_data = transfrom(extracted_data)

# Load: write the result to CSV. If out.csv already exists it is overwritten.
load('./final_csv/out.csv', transformed_data)