In [None]:
# https://www.dataquest.io/blog/pandas-big-data/

In [None]:
import pandas as pd
from dask import dataframe as dd
import numpy as np
import featuretools as ft
# Widen the console display so wide frames are not wrapped.
pd.options.display.width = 5000

In [None]:
# Load the semicolon-separated bank data set and attach a constant string
# `date` column, used further down to demonstrate datetime conversion.
bank = pd.read_csv("../data/bank-additional-full.csv", sep=";").assign(
    date='2020-01-25'
)

In [None]:
# Summarise the frame; memory_usage='deep' requests a deep memory
# measurement rather than the shallow estimate.
bank.info(memory_usage='deep')

In [None]:
# Preview the first rows of the raw frame.
bank.head()

In [None]:
# Show the average per-column memory usage for each data type.
# index=False keeps the frame's index out of the memory_usage() result, so
# the mean is taken over the selected columns only (the index is not a
# column of the given dtype and would otherwise skew the average).
for dtype in ['float','int','object']:
    selected_dtype = bank.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True, index=False).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2  # bytes -> megabytes
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

### 1) Use smaller data types to hold numeric columns

In [None]:
# Print the representable range (min / max) of each candidate integer dtype.
int_types = ["uint8", "int8", "int16"]
print(*(np.iinfo(it) for it in int_types), sep="\n")

In [None]:
# create a function to calculate memory usage
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as an "x.xx MB" string.

    A DataFrame is measured as the sum of its per-column (and index) usage;
    any other object is assumed to be a Series and measured directly.
    """
    usage_b = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # DataFrame.memory_usage returns one entry per column; total them.
        usage_b = usage_b.sum()
    return "{:03.2f} MB".format(usage_b / 1024 ** 2)  # bytes -> megabytes

In [None]:
# Downcast every integer column to the smallest integer subtype that can hold
# its values, then report the footprint and the dtype counts before/after.
bank_int = bank.select_dtypes(include=['int'])
converted_int = bank_int.apply(lambda col: pd.to_numeric(col, downcast='integer'))
print(mem_usage(bank_int))
print(mem_usage(converted_int))
compare_ints = pd.concat(
    [bank_int.dtypes, converted_int.dtypes],
    axis=1,
    keys=['before', 'after'],
)
compare_ints.apply(pd.Series.value_counts)

In [None]:
# Downcast every float column to the smallest float subtype pandas offers,
# then report the footprint and the dtype counts before/after.
bank_float = bank.select_dtypes(include=['float'])
converted_float = bank_float.apply(lambda col: pd.to_numeric(col, downcast='float'))
print(mem_usage(bank_float))
print(mem_usage(converted_float))
compare_floats = pd.concat(
    [bank_float.dtypes, converted_float.dtypes],
    axis=1,
    keys=['before', 'after'],
)
compare_floats.apply(pd.Series.value_counts)

In [None]:
# Copy the original frame, swap in the downcast integer and float columns,
# and compare the total footprint of the original and the optimized version.
optimized_bank = bank.copy()
# The downcast integer columns live in `converted_int`; the previous name
# `converted_bank` was never defined and raised a NameError on a fresh kernel.
optimized_bank[converted_int.columns] = converted_int
optimized_bank[converted_float.columns] = converted_float
print(mem_usage(bank))
print(mem_usage(optimized_bank))

### 2) Optimize objects using categoricals

In [None]:
# Describe all object-typed columns.
# .copy() is taken so bank_obj is its own frame rather than a selection of `bank`.
bank_obj = bank.select_dtypes(include=['object']).copy()
bank_obj.describe()

In [None]:
# Convert a single feature to the category dtype and show the first rows of
# the column before and after the conversion.
job = bank_obj["job"]
job_cat = job.astype("category")
print(job.head())
print(job_cat.head())

In [None]:
# Each unique value has been assigned an integer code, and the underlying
# datatype for the column is now int8.
# This column doesn't have any missing values, but if it did, the category
# subtype would represent them with the code -1.
job_cat.head(10).cat.codes

In [None]:
# Compare the memory usage of the job feature before and after conversion.
# The main drawback of the category dtype is that numerical computation is
# unavailable: no arithmetic on category columns, and no Series.min() /
# Series.max(), without converting to a true numeric dtype first.

print('Before', mem_usage(job))
print('After', mem_usage(job_cat))

In [None]:
# Stick to using the category type primarily for object columns where less
# than 50% of the values are unique.
# The ratio is computed for all columns at once; dropna=False counts NaN as a
# value, matching len(Series.unique()). Dividing a Series by len() also avoids
# a ZeroDivisionError when the frame has no rows (the ratio becomes NaN and
# the column is simply left unconverted).
unique_ratio = bank_obj.nunique(dropna=False) / len(bank_obj)
category_cols = unique_ratio[unique_ratio < 0.5].index
# One astype call converts the selected columns and copies the rest as-is,
# instead of growing an empty DataFrame column by column through .loc.
converted_obj = bank_obj.astype({col: 'category' for col in category_cols})

In [None]:
# Report the footprint of the object columns before and after conversion and
# count how many columns hold each dtype on either side.
print(mem_usage(bank_obj))
print(mem_usage(converted_obj))
compare_obj = pd.concat(
    [bank_obj.dtypes, converted_obj.dtypes],
    axis=1,
    keys=['before', 'after'],
)
compare_obj.apply(pd.Series.value_counts)

In [None]:
# Swap the converted object columns into the optimized frame, then compare
# the total memory usage of the original and the optimized frames.
optimized_bank[converted_obj.columns] = converted_obj
print(mem_usage(bank))
print(mem_usage(optimized_bank))

### 3) Convert features to date/time when possible

In [None]:
# Footprint of the date column while it is still stored as plain strings.
mem_usage(bank['date'])

In [None]:
# Parse the string dates with an explicit format and store the result in the
# optimized frame, then preview the converted column.
date = bank['date']
optimized_bank['date'] = pd.to_datetime(date, format='%Y-%m-%d')
optimized_bank['date'].head()

In [None]:
# Footprint of the date column: original strings vs. the optimized frame's column.
print(mem_usage(bank['date']))
print(mem_usage(optimized_bank['date']))

In [None]:
# Check the total memory usage before and after the optimizations.
# The converted object columns were already written into optimized_bank in
# the categoricals section. Re-assigning converted_obj here would overwrite
# the datetime `date` column with its categorical version (converted_obj is
# derived from bank_obj, which still holds `date` as a string column), so
# this cell only reports the totals.
print(mem_usage(bank))
print(mem_usage(optimized_bank))

### Use types when reading the file from CSV

In [None]:
# Target dtype for every column in the CSV, applied while the file is parsed
# so the frame never exists in its unoptimized form.
column_types = {
    'age': 'uint8',
    'job': 'category',
    'marital': 'category',
    'education': 'category',
    'default': 'category',
    'housing': 'category',
    'loan': 'category',
    'contact': 'category',
    'month': 'category',
    'day_of_week': 'category',
    'duration': 'uint16',
    'campaign': 'uint8',
    'pdays': 'uint16',
    'previous': 'uint8',
    'poutcome': 'category',
    'emp.var.rate': 'float32',
    'cons.price.idx': 'float32',
    'cons.conf.idx': 'float32',
    'euribor3m': 'float32',
    'nr.employed': 'float32',
    'y': 'category',
    }

# The `date` column is added after loading rather than read from the file
# (this notebook attaches it to `bank` the same way), so read_csv is not
# asked to parse any dates.
readopt_bank = pd.read_csv("../data/bank-additional-full.csv",delimiter=";", dtype=column_types)

# Build the date column against readopt_bank's own index instead of borrowing
# it from `bank`: this cell then neither mutates nor depends on the frame
# loaded earlier in the notebook.
date = pd.Series('2020-01-25', index=readopt_bank.index, name='date')
readopt_bank['date'] = pd.to_datetime(date,format='%Y-%m-%d')

print(mem_usage(readopt_bank))
readopt_bank.head()