In [2]:
import pandas as pd
import numpy as np

In [3]:
import numpy as np
import seaborn as sns

In [4]:
base_dataset=sns.load_dataset("titanic")

In [5]:
base_dataset.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [6]:
base_dataset['survived'].dtypes

dtype('int64')

In [7]:
import glob

# glob.glob() gives no ordering guarantee, yet a later cell loads fil_paths[0].
# Sort case-insensitively so the list order is the same on every run and platform.
fil_paths = sorted(
    glob.glob("C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\*.csv"),
    key=str.lower,
)

In [8]:
fil_paths

['C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\application_test.csv',
 'C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\application_train.csv',
 'C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\bureau.csv',
 'C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\bureau_balance.csv',
 'C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\credit_card_balance.csv',
 'C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\HomeCredit_columns_description.csv',
 'C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\installments_payments.csv',
 'C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\POS_CASH_balance.csv',
 'C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\previous_application.csv',
 'C:\\Users\\ML LABS\\Downloads\\Datasets\\home-credit-default-risk\\sample_submission.csv']

In [9]:
import pandas as pd

# Load the first path collected in fil_paths into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=fil_paths[0])

In [10]:
df.memory_usage().sum()/1024**2

44.99842834472656

In [11]:
start_mem = df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

Memory usage of dataframe is 45.00 MB


In [12]:
np.iinfo(np.int8).min,np.iinfo(np.int8).max

(-128, 127)

In [13]:
np.iinfo(np.int16).min,np.iinfo(np.int16).max

(-32768, 32767)

In [14]:
np.iinfo(np.int32).min,np.iinfo(np.int32).max

(-2147483648, 2147483647)

In [15]:
np.iinfo(np.int64).min,np.iinfo(np.int64).max

(-9223372036854775808, 9223372036854775807)

In [16]:
# Iterate through all the columns of the dataframe and shrink each one to the
# smallest dtype that can hold its observed value range, to reduce memory usage.
start_mem = df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

for col in df.columns:
    col_type = df[col].dtype

    if col_type == object:
        # Object (text) columns are stored as categoricals.
        df[col] = df[col].astype('category')
        continue

    # Only plain NumPy integer/float columns are downcast. Anything else
    # (category, bool, datetime, ...) is left untouched, which also makes this
    # cell safe to re-run after the object columns have become categoricals.
    if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
        continue

    c_min = df[col].min()
    c_max = df[col].max()

    if col_type.kind in 'iu':
        # Bounds are inclusive: a column whose maximum is exactly 127 fits int8.
        for int_type in (np.int8, np.int16, np.int32, np.int64):
            limits = np.iinfo(int_type)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(int_type)
                break
    else:
        # NOTE(review): this check only looks at the value *range*. float16 keeps
        # far fewer significant digits than float64, so in-range values may still
        # be rounded - confirm that is acceptable for these columns.
        for float_type in (np.float16, np.float32):
            limits = np.finfo(float_type)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(float_type)
                break
        else:
            # Out of float32 range, or min/max are NaN (comparisons are False).
            df[col] = df[col].astype(np.float64)

end_mem = df.memory_usage().sum() / 1024**2
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 9.40 MB
Decreased by 79.1%


### Guidelines

In [None]:
TIP 1 - Deleting unused variables and gc.collect()
TIP 2 - Presetting the datatypes
TIP 3 - Importing selected rows of a file (including generating your own subsamples)
TIP 4 - Importing in batches and processing each individually
TIP 5 - Importing just selected columns
TIP 6 - Creative data processing
TIP 7 - Using Dask

### How to Work with BIG Datasets

In [46]:
import numpy as np 
import pandas as pd 
import datetime
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
import gc
%matplotlib inline

## TIP # 1 Deleting unused variables and gc.collect() 

The thing about python is that once it loads something into RAM it doesn't really get rid of it effectively.  So if you load a huge dataframe into pandas, and then make a copy of it and never use it again, that original dataframe will still be in your RAM.  Eating away at your memory.   Same goes for any other variables you create.

Therefore if you used up a dataframe (or other variable), get in the habit of deleting it.  

For example, if you create a dataframe  `temp`, extract some features and merge results to your main training set, `temp` will still be eating up space.  You need to explicitly delete it by stating `del temp`.  You also need to make sure that nothing else is referring to `temp` (you don't have any other variables bound to it).

Even after doing so there may still be residual memory usage going on.

That's where the garbage collection module comes in.   `import gc` at the beginning of your project, and then each time you want to clear up space put command `gc.collect()` .  

It also helps to run `gc.collect()` after multiple transformations/functions/copying etc...  as all the little references/values accumulate.

In [51]:
# Example: load a file into a temporary DataFrame.
temp = pd.read_csv('../input/train_sample.csv')

# Do something to the file: here, cast the 'os' column to strings.
# NOTE(review): the (truncated) column listing shown for `temp` right after this
# cell has no 'os' entry - confirm the column exists in the CSV being read.
temp['os'] = temp['os'].astype('str')

In [52]:
temp.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

In [53]:
#do something to the file
temp['SK_ID_CURR'] = temp['SK_ID_CURR'].astype('str')

In [54]:
#delete when no longer needed
del temp
#collect residual garbage
gc.collect()

5128

## TIP # 2   Presetting the datatypes
If you import data from a CSV, pandas will do its best to guess the datatypes, but it will tend to err on the side of allocating more space than necessary.
So if you know in advance that your numbers are integers, and don't get bigger than certain values, set the datatypes at minimum requirements before importing.

In [None]:
# Column -> dtype mapping handed to read_csv so pandas does not have to guess.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

train = pd.read_csv(
    '../input/train_sample.csv',
    dtype=dtypes,
)

# Show the resulting column dtypes and the frame's memory footprint.
train.info()

## TIP # 3 Importing selected rows of a csv file

### a) Select number of rows to import
Instead of the default  `pd.read_csv('filename') ` you can use the parameter `nrows` to specify the number of rows to import.  For example:
`train = pd.read_csv('../input/train.csv', nrows=10000)` will only read the first 10000 data rows (the header line is not counted).

In [None]:
train = pd.read_csv('../input/train.csv', nrows=10000, dtype=dtypes)
train.head()

### b)  Simple row skip (with or without headings)
You can also specify number of rows to skip (`skiprows`) , if you, for example want 1 million rows after the first 5 million:
`train = pd.read_csv('../input/train.csv', skiprows=5000000, nrows=1000000)`.  This however will ignore the first line with headers.  Instead you can pass in range of rows to skip, that will not include the first row  (indexed `[0]`).

In [None]:
#plain skipping loses heading info.  It's OK for files that don't have headings, 
#or dataframes you'll be linking together, or where you make your own custom headings...
train = pd.read_csv('../input/train.csv', skiprows=5000000, nrows=1000000, header = None, dtype=dtypes)
train.head()

In [None]:
#but if you want to import the headings from the original file
#skip first 5mil rows, but use the first row for heading:
train = pd.read_csv('../input/train.csv', skiprows=range(1, 5000000), nrows=1000000, dtype=dtypes)
train.head()

## TIP #4   Importing in batches and processing each individually

We know that the proportion of clicks that was attributed is very low.  So let's say we want to look at all of them at the same time.  We don't know what rows they are, and we can't load the whole data and filter.  But we can load in chunks, extract from each chunk what we need and get rid of everything else!

The idea is simple.  You specify the size of a chunk (number of lines) you want pandas to import at a time.  Then you do some kind of processing on it.  Then pandas imports the next chunk, until there are no more lines left.

So below I import one million rows, extract only rows that have 'is_attributed'==1 (i.e. app was downloaded) and then merge these results into common dataframe for further inspection.

In [None]:
# Collect the filtered piece of every chunk in a list and concatenate once at the
# end. Calling pd.concat inside the loop would re-copy every row gathered so far
# on each iteration.
converted_chunks = []

# we are going to work with chunks of size 1 million rows
chunksize = 10 ** 6

# in each chunk, keep only the rows that have 'is_attributed'==1
for chunk in pd.read_csv('../input/train.csv', chunksize=chunksize, dtype=dtypes):
    # The comparison already yields a boolean mask; no np.where wrapper is needed.
    filtered = chunk[chunk['is_attributed'] == 1]
    converted_chunks.append(filtered)

# Merge the kept rows into one dataframe. pd.concat rejects an empty list, so
# fall back to an empty frame when the file produced no chunks at all.
if converted_chunks:
    df_converted = pd.concat(converted_chunks, ignore_index=True)
else:
    df_converted = pd.DataFrame()

# The per-chunk pieces are no longer needed once they have been merged.
del converted_chunks


Let's see what we've got:

In [None]:
df_converted.info()

In [None]:
df_converted.head()

## TIP #5 Importing just selected columns

If you want to analyze just some specific feature, you can import just the selected columns.

For example, lets say we want to analyze clicks by ips.  Or conversions by ips.

Importing just the few fields we need, as opposed to the full table, just may fit in our RAM

In [None]:
# Only these columns are requested from the file via `usecols`.
columns = ['ip', 'click_time', 'is_attributed']

# Explicit dtypes for two of them; click_time has no entry here, so pandas
# infers its type.
dtypes = dict(ip='uint32', is_attributed='uint8')

ips_df = pd.read_csv(
    '../input/train.csv',
    usecols=columns,
    dtype=dtypes,
)

In [None]:

 # Downcast every column of `df` to the smallest dtype that holds its value range.
# This cell previously contained only the tail of the loop (it began mid-branch,
# with no enclosing `for`/`if`), so it could not be parsed. The loop header is
# restored here from the complete version of the loop earlier in this notebook.
for col in df.columns:
    col_type = df[col].dtype

    if col_type == object:
        df[col] = df[col].astype('category')
        continue

    # Leave anything that is not a plain NumPy integer/float column untouched
    # (e.g. columns already converted to category by an earlier run).
    if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
        continue

    c_min = df[col].min()
    c_max = df[col].max()

    if col_type.kind in 'iu':
        # Inclusive bounds: a value equal to the type's min/max still fits.
        for int_type in (np.int8, np.int16, np.int32, np.int64):
            limits = np.iinfo(int_type)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(int_type)
                break
    else:
        # Float limits come from np.finfo; np.iinfo only accepts integer types.
        # NOTE(review): only the value range is checked - float16 may round
        # in-range values; confirm that precision loss is acceptable.
        for float_type in (np.float16, np.float32):
            limits = np.finfo(float_type)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(float_type)
                break
        else:
            df[col] = df[col].astype(np.float64)

# Bytes -> MB. `/ 1024 * 1024` would divide and then multiply back, giving bytes.
end_mem = df.memory_usage().sum() / 1024**2



