In [3]:
import pandas as pd
import pyarrow
import fastparquet

import matplotlib.pyplot as plt

# from lightgbm import LGBMClassifier

import warnings
warnings.simplefilter('ignore')

## Run the data-cleaning script to generate the parquet files
Requires `pyarrow` and `fastparquet` (see requirements.txt).

In [4]:
# Open and run ETL_export_parquet_invoice-level_+_client-level_mean.py before continuing; it generates the parquet files imported below.

## Import parquet files

In [5]:
# --- Invoice-level (non-aggregated) clean data -------------------------------
# One row per invoice.
df_train_non_agg = pd.read_parquet(path="data/df_train_non_agg.parquet")
# Careful: despite its name, "final_test" is not our "test" data -- a
# train/test split still has to be performed.
df_final_test_non_agg = pd.read_parquet(path="data/df_final_test_non_agg.parquet")

# --- Client-level (aggregated) clean data ------------------------------------
# One row per client; that client's invoices are summarised.
# The "consommation_level_x" columns hold the mean energy consumption per client.
df_train_agg = pd.read_parquet(path="data/df_train_agg.parquet")
# Same caveat as above: this is not our "test" data, the train/test split
# still has to be performed.
df_final_test_agg = pd.read_parquet(path="data/df_final_test_agg.parquet")

In [6]:
# Display the client-level training frame loaded above (one row per client);
# the notebook renders a truncated preview of its first and last rows.
df_train_agg

Unnamed: 0,disrict,client_catg,region,creation_date,target,transactions_count,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,...,counter_code_450,counter_code_453,counter_code_467,counter_code_483,counter_code_5,counter_code_506,counter_code_532,counter_code_565,counter_code_600,counter_code_65
0,60,11,101,1994-12-31,0,35.0,1.0,352.400000,10.571429,0.000000,...,False,False,False,False,False,False,False,False,False,False
1,69,11,107,2002-05-29,0,37.0,1.0,557.540541,0.000000,0.000000,...,False,False,False,False,False,False,False,False,False,False
2,62,11,301,1986-03-13,0,18.0,1.0,798.611111,37.888889,0.000000,...,False,False,False,False,False,False,False,False,False,False
3,69,11,105,1996-07-11,0,20.0,1.0,1.200000,0.000000,0.000000,...,False,False,False,False,False,False,False,False,False,False
4,62,11,303,2014-10-14,0,14.0,1.0,663.714286,104.857143,117.357143,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135488,62,11,304,2004-07-26,0,71.0,1.0,1.957746,0.000000,0.000000,...,False,False,False,False,False,False,False,False,False,False
135489,63,11,311,2012-10-25,0,41.0,1.0,185.853659,0.756098,0.000000,...,False,False,False,False,True,False,False,False,False,False
135490,63,11,311,2011-11-22,0,36.0,1.0,273.083333,0.000000,0.000000,...,False,False,False,False,False,False,False,False,False,False
135491,60,11,101,1993-12-22,0,2.0,1.0,300.000000,70.500000,0.000000,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Display the invoice-level training frame loaded above (one row per invoice);
# the notebook renders a truncated preview of its first and last rows.
df_train_non_agg

Unnamed: 0,disrict,client_catg,region,creation_date,target,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,...,counter_code_450,counter_code_453,counter_code_467,counter_code_483,counter_code_5,counter_code_506,counter_code_532,counter_code_565,counter_code_600,counter_code_65
0,60,11,101,1994-12-31,0,1.0,82.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,60,11,101,1994-12-31,0,1.0,1200.0,184.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,60,11,101,1994-12-31,0,1.0,123.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,60,11,101,1994-12-31,0,1.0,102.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,60,11,101,1994-12-31,0,1.0,572.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4438276,63,11,311,2011-11-22,0,1.0,312.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4438277,63,11,311,2011-11-22,0,1.0,578.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4438278,60,11,101,1993-12-22,0,1.0,400.0,135.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4438279,60,11,101,1993-12-22,0,1.0,200.0,6.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Summary statistics (count, mean, min, quartiles, max, std) for the
# client-level training frame.
# NOTE(review): in the saved output, creation_date and target report a count of
# 135493 while the remaining columns report 128000 -- this looks like missing
# values in those columns for some clients; confirm and handle before modelling.
df_train_agg.describe()

Unnamed: 0,creation_date,target,transactions_count,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,reading_remarque,counter_statue
count,135493,135493.0,128000.0,128000.0,128000.0,128000.0,128000.0,128000.0,128000.0,128000.0
mean,2002-10-01 18:45:16.001564544,0.055841,34.615531,1.001211,407.277247,117.61988,27.496823,75.918575,8.781734,0.010695
min,1977-02-05 00:00:00,0.0,1.0,1.0,0.0,0.0,0.0,0.0,6.0,0.0
25%,1994-01-12 00:00:00,0.0,13.0,1.0,220.530312,0.0,0.0,0.0,9.0,0.0
50%,2005-09-19 00:00:00,0.0,31.5,1.0,355.161002,6.892857,0.0,0.0,9.0,0.0
75%,2012-04-04 00:00:00,0.0,52.0,1.0,533.078947,66.688702,5.560244,0.0,9.0,0.0
max,2019-09-10 00:00:00,1.0,434.0,40.0,34024.0,115683.0,2400.0,79179.777778,9.0,5.0
std,,0.229614,25.78289,0.183005,342.787037,756.054243,96.626119,833.338704,0.726306,0.160331
