# Examples of DataFrame usage

In [1]:
from hana_ml import dataframe
import numpy as np
import logging

## Setup connection and data sets
Let us load some data into HANA tables.  The data is loaded into 4 tables - the full set, the test set, the training set, and the validation set:
<li>DBM2_RFULL_TBL</li>
<li>DBM2_RTEST_TBL</li>
<li>DBM2_RTRAINING_TBL</li>
<li>DBM2_RVALIDATION_TBL</li>

To do that, a connection is created and passed to the loader.

There is a config file, <b>config/e2edata.ini</b>, that controls the connection parameters and whether or not to reload the data from scratch.  If the data is already loaded, there is no need to load it again.  A sample section is below.  If the config parameter <b>reload_data</b> is true, then the tables for test, training, and validation are (re-)created and data is inserted into them.

#########################<br>
[hana]<br>
url=host.sjc.sap.corp<br>
user=username<br>
passwd=userpassword<br>
port=3xx15<br>
<br>

#########################<br>

In [2]:
from data_load_utils import DataSets, Settings
# Read the HANA connection parameters from the ini file described above.
config_path = "../config/e2edata.ini"
url, port, user, pwd = Settings.load_config(config_path)
# Open the connection that every DataFrame in this notebook is built on.
connection_context = dataframe.ConnectionContext(url, port, user, pwd)
# Load the bank data set; the four returned values are used as table names later.
bank_tables = DataSets.load_bank_data(connection_context)
full_tbl, training_tbl, validation_tbl, test_tbl = bank_tables

### Simple DataFrame

In [3]:
# Build a hana_ml DataFrame over the training table.
dataset1 = connection_context.table(training_tbl)
# Alternatively, it could be any SELECT
#dataset1 = connection_context.sql('SELECT * FROM "{0}"'.format(training_tbl))
# Show the SQL statement backing the DataFrame (a plain SELECT * in the recorded output).
print(dataset1.select_statement)

SELECT * FROM "DBM2_RTRAINING_TBL"


In [4]:
# Printing the DataFrame itself shows only the object repr, not any rows (see recorded output).
print(dataset1)

<hana_ml.dataframe.DataFrame object at 0x000000000503CB00>


### Drop duplicates

In [5]:
# drop_duplicates() yields a new DataFrame; dataset1 is not reassigned.
dataset2 = dataset1.drop_duplicates()
# The recorded statement wraps the previous SELECT in a SELECT DISTINCT.
print(dataset2.select_statement)

SELECT DISTINCT * FROM (SELECT * FROM "DBM2_RTRAINING_TBL") AS "DT_0"


In [6]:
# List the column names of the de-duplicated DataFrame.
print(dataset2.columns)

['ID', 'AGE', 'JOB', 'MARITAL', 'EDUCATION', 'DBM_DEFAULT', 'HOUSING', 'LOAN', 'CONTACT', 'DBM_MONTH', 'DAY_OF_WEEK', 'DURATION', 'CAMPAIGN', 'PDAYS', 'PREVIOUS', 'POUTCOME', 'EMP_VAR_RATE', 'CONS_PRICE_IDX', 'CONS_CONF_IDX', 'EURIBOR3M', 'NREMPLOYED', 'LABEL']


### Drop a column

In [7]:
# Drop the LABEL column; the argument is a list of column names.
dataset3 = dataset2.drop(["LABEL"])
# The recorded statement selects every remaining column explicitly.
print(dataset3.select_statement)

SELECT "ID", "AGE", "JOB", "MARITAL", "EDUCATION", "DBM_DEFAULT", "HOUSING", "LOAN", "CONTACT", "DBM_MONTH", "DAY_OF_WEEK", "DURATION", "CAMPAIGN", "PDAYS", "PREVIOUS", "POUTCOME", "EMP_VAR_RATE", "CONS_PRICE_IDX", "CONS_CONF_IDX", "EURIBOR3M", "NREMPLOYED" FROM (SELECT DISTINCT * FROM (SELECT * FROM "DBM2_RTRAINING_TBL") AS "DT_0") AS "DT_1"


### Take null values and substitute with a specific value

In [8]:
# Replace NULLs in AGE with 25. Note this starts from dataset2, so LABEL is still present.
dataset4 = dataset2.fillna(25, ["AGE"])
# In the recorded statement this shows up as COALESCE("AGE", 25) AS "AGE".
print(dataset4.select_statement)

SELECT "ID", COALESCE("AGE", 25) AS "AGE", "JOB", "MARITAL", "EDUCATION", "DBM_DEFAULT", "HOUSING", "LOAN", "CONTACT", "DBM_MONTH", "DAY_OF_WEEK", "DURATION", "CAMPAIGN", "PDAYS", "PREVIOUS", "POUTCOME", "EMP_VAR_RATE", "CONS_PRICE_IDX", "CONS_CONF_IDX", "EURIBOR3M", "NREMPLOYED", "LABEL" FROM (SELECT DISTINCT * FROM (SELECT * FROM "DBM2_RTRAINING_TBL") AS "DT_0") dt


### Fetch 5 rows into client

In [9]:
# Limit to 5 rows, then collect() to bring them to the client for printing.
print(dataset4.head(5).collect())

      ID  AGE          JOB   MARITAL          EDUCATION DBM_DEFAULT HOUSING  \
0  27178   57    housemaid   married           basic.4y          no     yes   
1  31377   39  blue-collar  divorced           basic.9y     unknown      no   
2   5987   34  blue-collar   married           basic.9y          no      no   
3  12963   41  blue-collar   married            unknown          no      no   
4   5479   32   management   married  university.degree          no      no   

  LOAN    CONTACT DBM_MONTH  ...   CAMPAIGN  PDAYS  PREVIOUS     POUTCOME  \
0   no   cellular       nov  ...          1    999         0  nonexistent   
1   no   cellular       may  ...          2    999         0  nonexistent   
2   no  telephone       may  ...          1    999         0  nonexistent   
3  yes   cellular       jul  ...          1    999         0  nonexistent   
4   no  telephone       may  ...          3    999         0  nonexistent   

   EMP_VAR_RATE CONS_PRICE_IDX  CONS_CONF_IDX  EURIBOR3M  NREM

### Fetch columns in a DataFrame

In [10]:
# Column names after fillna; the recorded list matches dataset2's columns.
print(dataset4.columns)

['ID', 'AGE', 'JOB', 'MARITAL', 'EDUCATION', 'DBM_DEFAULT', 'HOUSING', 'LOAN', 'CONTACT', 'DBM_MONTH', 'DAY_OF_WEEK', 'DURATION', 'CAMPAIGN', 'PDAYS', 'PREVIOUS', 'POUTCOME', 'EMP_VAR_RATE', 'CONS_PRICE_IDX', 'CONS_CONF_IDX', 'EURIBOR3M', 'NREMPLOYED', 'LABEL']


In [11]:
# Same pattern as before with a larger limit: fetch 10 rows to the client.
print(dataset4.head(10).collect())

      ID  AGE           JOB   MARITAL            EDUCATION DBM_DEFAULT  \
0  27178   57     housemaid   married             basic.4y          no   
1  31377   39   blue-collar  divorced             basic.9y     unknown   
2   5987   34   blue-collar   married             basic.9y          no   
3  12963   41   blue-collar   married              unknown          no   
4   5479   32    management   married    university.degree          no   
5  33491   53    technician   married  professional.course          no   
6  30259   56  entrepreneur   married    university.degree          no   
7  35092   36    management  divorced    university.degree          no   
8   7744   51   blue-collar   married              unknown     unknown   
9  38755   29        admin.    single    university.degree          no   

   HOUSING     LOAN    CONTACT DBM_MONTH  ...   CAMPAIGN  PDAYS  PREVIOUS  \
0      yes       no   cellular       nov  ...          1    999         0   
1       no       no   cellular 

In [12]:
# filter() takes the condition as a string; here only rows with AGE above 60 are kept.
print(dataset4.filter('AGE > 60').head(10).collect())

      ID  AGE         JOB   MARITAL            EDUCATION DBM_DEFAULT HOUSING  \
0  36349   67      admin.   married             basic.4y     unknown     yes   
1  29567   68     retired   married          high.school          no     yes   
2  38868   64     retired   married          high.school          no     yes   
3  38879   77     retired  divorced             basic.4y          no     yes   
4  38885   68     retired   married          high.school          no      no   
5  38943   78     retired   married  professional.course          no      no   
6  38946   67     retired   married             basic.4y          no      no   
7  29669   71     retired   married    university.degree          no      no   
8  38456   98     retired   married             basic.4y     unknown     yes   
9  38467   61  management   married    university.degree          no      no   

  LOAN    CONTACT DBM_MONTH  ...   CAMPAIGN  PDAYS  PREVIOUS     POUTCOME  \
0   no   cellular       jun  ...          

In [13]:
# Keep the collected result in a variable so its type can be inspected next.
pd1 = dataset4.filter('AGE>60').head(10).collect()

In [14]:
# The recorded output shows that collect() returned a pandas DataFrame.
print(type(pd1))

<class 'pandas.core.frame.DataFrame'>


In [15]:
# Without collect(), the cell's value is another hana_ml DataFrame (see the recorded repr).
dataset4.filter('AGE>60').sort(['AGE'])

<hana_ml.dataframe.DataFrame at 0x88442b0>

In [16]:
# Filter to AGE above 60, order by AGE, and fetch only the first row.
over_60 = dataset4.filter('AGE>60')
over_60_by_age = over_60.sort(['AGE'])
first_row = over_60_by_age.head(1).collect()
print(first_row)

      ID  AGE         JOB  MARITAL            EDUCATION DBM_DEFAULT HOUSING  \
0  38565   61  technician  married  professional.course          no      no   

  LOAN   CONTACT DBM_MONTH  ...   CAMPAIGN  PDAYS  PREVIOUS  POUTCOME  \
0   no  cellular       oct  ...          2    999         1   failure   

   EMP_VAR_RATE CONS_PRICE_IDX  CONS_CONF_IDX  EURIBOR3M  NREMPLOYED  LABEL  
0          -3.4         92.431          -26.9      0.722        5017    yes  

[1 rows x 22 columns]


In [17]:
# Build the join condition from each DataFrame's quoted name so that the
# two ID columns are qualified unambiguously.
left_name = dataset4.quoted_name
right_name = dataset2.quoted_name
condition = '{}."ID"={}."ID"'.format(left_name, right_name)
# Join dataset4 with dataset2 on matching ID values.
dataset5 = dataset4.join(dataset2, condition)

In [18]:
# Fetch one joined row; the recorded output reports 44 columns (22 from each input).
print(dataset5.head(1).collect())

      ID  AGE     JOB  MARITAL EDUCATION DBM_DEFAULT HOUSING LOAN   CONTACT  \
0  28115   41  admin.  married  basic.6y          no      no   no  cellular   

  DBM_MONTH  ...   CAMPAIGN  PDAYS  PREVIOUS  POUTCOME  EMP_VAR_RATE  \
0       apr  ...          2    999         1   failure          -1.8   

  CONS_PRICE_IDX  CONS_CONF_IDX  EURIBOR3M  NREMPLOYED  LABEL  
0         93.075          -47.1      1.479        5099    yes  

[1 rows x 44 columns]


In [19]:
# Project dataset4 down to three columns.
dataset6 = dataset4.select("ID", "AGE", "JOB")

In [20]:
# head() is called without an argument here; the recorded output shows a single row.
print(dataset6.head().collect())

      ID  AGE     JOB
0  28115   41  admin.


In [21]:
# A (SQL expression, alias) tuple adds a computed column: TWICE_AGE holds "AGE"*2.
dataset7 = dataset4.select("ID", "AGE", "JOB", ('"AGE"*2', "TWICE_AGE"))

In [22]:
# Check the computed column: the recorded row shows AGE 41 and TWICE_AGE 82.
print(dataset7.head().collect())

      ID  AGE     JOB  TWICE_AGE
0  28115   41  admin.         82


In [23]:
# Save dataset7 under the name #MYTEST; the call returns a hana_ml DataFrame (see recorded repr).
# NOTE(review): the '#' prefix presumably makes this a local temporary table - confirm.
# NOTE(review): re-running this cell in the same session may fail if the table already exists - TODO confirm.
dataset7.save("#MYTEST")

<hana_ml.dataframe.DataFrame at 0x9fcb8d0>

In [24]:
# Open the table saved in the previous step as a new DataFrame.
dataset8 = connection_context.table("#MYTEST")

In [25]:
# Read back from the saved table; the recorded row matches the dataset7 output.
print(dataset8.head().collect())

      ID  AGE     JOB  TWICE_AGE
0  28115   41  admin.         82
