In [1]:
import matplotlib as plt
import numpy as np
import pandas as pd

In [2]:
# Load the training transactions from CSV and preview the first rows.
train_transaction = pd.read_csv("../data/assembled_data/train_transaction.csv",)
train_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Summary statistics of the numeric columns. The `count` row is the number of
# non-null values per column, so a count below the total number of rows shows
# that the column contains missing values.
train_transaction.describe()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
count,590540.0,590540.0,590540.0,590540.0,590540.0,581607.0,588975.0,586281.0,524834.0,524834.0,...,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0
mean,3282270.0,0.03499,7372311.0,135.027176,9898.734658,362.555488,153.194925,199.278897,290.733794,86.80063,...,0.775874,721.741883,1375.783644,1014.622782,9.807015,59.16455,28.530903,55.352422,151.160542,100.700882
std,170474.4,0.183755,4617224.0,239.162522,4901.170153,157.793246,11.336444,41.244453,101.741072,2.690623,...,4.727971,6217.223583,11169.275702,7955.735482,243.861391,387.62948,274.57692,668.486833,1095.034387,814.946722
min,2987000.0,0.0,86400.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3134635.0,0.0,3027058.0,43.321,6019.0,214.0,150.0,166.0,204.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3282270.0,0.0,7306528.0,68.769,9678.0,361.0,150.0,226.0,299.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3429904.0,0.0,11246620.0,125.0,14184.0,512.0,150.0,226.0,330.0,87.0,...,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3577539.0,1.0,15811130.0,31937.391,18396.0,600.0,231.0,237.0,540.0,102.0,...,55.0,160000.0,160000.0,160000.0,55125.0,55125.0,55125.0,104060.0,104060.0,104060.0


In [4]:
# Column dtypes and memory footprint of the frame as loaded, before any downcasting.
train_transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB


In [4]:
import math
import gc
import pickle
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import os


class DataMemoryReducer(BaseEstimator,TransformerMixin):
    '''
    Transformer that downcasts 64-bit numeric columns of a pandas DataFrame to
    32 bits so the data fits in memory during EDA.

    ``fit`` inspects every ``float64`` / ``int64`` column and selects the ones
    whose observed values lie inside the float32 / int32 range. ``transform``
    returns a new frame with the selected columns cast; the frame passed in is
    not modified. When ``schema_location`` is given, ``fit`` also pickles the
    resulting ``{column: dtype}`` mapping to
    ``<schema_location>/raw_data_schema.pkl`` so it can later be passed as
    ``dtype=`` to ``pandas.read_csv``.

    Note that float64 -> float32 only preserves roughly 7 significant decimal
    digits; the range check guards against overflow, not against loss of
    precision.

    Parameters
    ----------
    schema_location : str or None, default None
        Directory in which the pickled schema is written during ``fit``.
        ``None`` disables writing. An empty string resolves to the current
        working directory.

    Attributes
    ----------
    float_columns_to_reduce : pandas.Index (empty list before ``fit``)
        float64 columns selected for casting to float32.
    int_columns_to_reduce : pandas.Index (empty list before ``fit``)
        int64 columns selected for casting to int32.
    data_schema : dict
        Mapping of every column name to its dtype after reduction.
    '''

    # Closed value ranges of the 32-bit target dtypes, as plain Python numbers.
    _FLOAT32_MIN = float(np.finfo(np.float32).min)
    _FLOAT32_MAX = float(np.finfo(np.float32).max)
    _INT32_MIN = int(np.iinfo(np.int32).min)
    _INT32_MAX = int(np.iinfo(np.int32).max)
    # Name of the pickled schema file created inside ``schema_location``.
    _SCHEMA_FILE_NAME = "raw_data_schema.pkl"

    def __init__(self,schema_location=None):
        self.schema_location = schema_location
        self.float_columns_to_reduce = []
        self.int_columns_to_reduce = []
        self.data_schema = {}

    @staticmethod
    def _columns_within_range(pandas_df, source_dtype, lower, upper):
        '''
        Find the columns of dtype ``source_dtype`` whose values fit in [lower, upper].

        Columns are scanned one at a time by position, so no copy of the frame
        is made and duplicated column labels are handled.

        Returns
        -------
        (pandas.Index, bool)
            The labels of the columns that fit, and whether the frame had any
            column of ``source_dtype`` at all.
        '''
        fits = np.zeros(len(pandas_df.columns), dtype=bool)
        found_candidate = False
        for position, column_dtype in enumerate(pandas_df.dtypes):
            if column_dtype != source_dtype:
                continue
            found_candidate = True
            column = pandas_df.iloc[:, position]
            # min()/max() skip NaN. For an empty or all-NaN column they return
            # NaN, both comparisons are False, and the column counts as fitting
            # (NaN is representable in float32). Columns holding +/-inf fail
            # the check and therefore keep their 64-bit dtype.
            out_of_range = (column.max() > upper) or (column.min() < lower)
            fits[position] = not out_of_range
        return pandas_df.columns[fits], found_candidate

    def fit(self, pandas_df, y=None):
        '''
        Decide which 64-bit columns of ``pandas_df`` can be stored in 32 bits.

        Parameters
        ----------
        pandas_df : pandas.DataFrame
            Frame whose float64 / int64 columns are range-checked.
        y : ignored
            Accepted only for compatibility with the scikit-learn estimator
            API (e.g. use inside a ``Pipeline``).

        Returns
        -------
        self

        Side effects
        ------------
        Overwrites ``float_columns_to_reduce``, ``int_columns_to_reduce`` and
        ``data_schema`` (results of an earlier ``fit`` never leak into this
        one) and, if ``schema_location`` is not None, writes the schema pickle.
        '''
        # Start from the current dtypes; entries of reduced columns are replaced below.
        self.data_schema = pandas_df.dtypes.to_dict()

        self.float_columns_to_reduce, found_float = self._columns_within_range(
            pandas_df, np.dtype("float64"), self._FLOAT32_MIN, self._FLOAT32_MAX)
        if not found_float:
            print("None of the float columns can be changed")
        for float_column in self.float_columns_to_reduce:
            self.data_schema[float_column] = np.dtype("float32")

        self.int_columns_to_reduce, found_int = self._columns_within_range(
            pandas_df, np.dtype("int64"), self._INT32_MIN, self._INT32_MAX)
        if not found_int:
            print("None of the int columns can be changed")
        for int_column in self.int_columns_to_reduce:
            self.data_schema[int_column] = np.dtype("int32")

        if self.schema_location is not None:
            schema_path = os.path.join(self.schema_location, self._SCHEMA_FILE_NAME)
            with open(schema_path,"wb") as handler:
                pickle.dump(self.data_schema,handler)

        return self

    def transform(self, pandas_df):
        '''
        Return a new DataFrame with the columns selected by ``fit`` downcast.

        Parameters
        ----------
        pandas_df : pandas.DataFrame
            Frame containing (at least) the columns selected during ``fit``.
            It is left unmodified.

        Returns
        -------
        pandas.DataFrame
            Same index, columns and column order as the input, with the
            selected float columns as float32 and int columns as int32.

        Notes
        -----
        The cast is unconditional: the range check was done on the frame
        given to ``fit``, not on ``pandas_df``.
        '''
        target_dtypes = {column: np.dtype("float32") for column in self.float_columns_to_reduce}
        target_dtypes.update(
            {column: np.dtype("int32") for column in self.int_columns_to_reduce})
        # A single astype call builds the result directly, avoiding a full
        # 64-bit deep copy of the frame followed by column re-assignment.
        return pandas_df.astype(target_dtypes)
    
#     def fit_transform(self, pandas_df):
#         self.fit(pandas_df)
#         return self.transform(pandas_df)
        
        

In [5]:
# An empty schema_location makes fit() write "raw_data_schema.pkl" relative to
# the current working directory (os.path.join("", name) is just name).
data_memory_reducer = DataMemoryReducer("")

# fit_transform comes from TransformerMixin; transform() works on a copy, so
# `train_transaction` keeps its original 64-bit dtypes.
reduced_df = data_memory_reducer.fit_transform(train_transaction)
gc.collect()

0

In [6]:
# Reload the schema persisted by DataMemoryReducer.fit(). The reducer was created
# with schema_location="", so the file is "raw_data_schema.pkl" in the working
# directory (the previous name "schema.pkl" is never written by this notebook).
# NOTE: pickle.load can execute arbitrary code - only load files produced locally.
with open("raw_data_schema.pkl", "rb") as handler:
    schema = pickle.load(handler)

In [8]:
# Re-read the same CSV, passing the saved {column: dtype} schema so the columns
# are parsed directly into the reduced dtypes, then check dtypes and memory usage.
test_reading = pd.read_csv("../data/assembled_data/train_transaction.csv",dtype=schema)
test_reading.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float32(376), int32(4), object(14)
memory usage: 919.1+ MB


In [11]:
# dtypes and memory usage of the frame downcast in memory by the reducer.
reduced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float32(376), int32(4), object(14)
memory usage: 919.1+ MB


In [12]:
# The source frame is unchanged by the reducer: still float64/int64 columns.
train_transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB


In [13]:
# float64 columns that fit() selected for conversion to float32.
data_memory_reducer.float_columns_to_reduce

Index(['TransactionAmt', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1',
       'dist2', 'C1', 'C2',
       ...
       'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338',
       'V339'],
      dtype='object', length=376)

In [14]:
# int64 columns that fit() selected for conversion to int32.
data_memory_reducer.int_columns_to_reduce

Index(['TransactionID', 'isFraud', 'TransactionDT', 'card1'], dtype='object')

In [14]:
from sklearn.impute import SimpleImputer

# Impute the missing values of the float32 columns with SimpleImputer's default
# settings (NaNs are replaced by the column mean). The imputer is fed a NumPy
# array and `test_2` is a NumPy array as well, not a DataFrame, so DataFrame
# methods such as .head() are not available on it.
simple_imputer = SimpleImputer()
test_2 = simple_imputer.fit_transform(reduced_df.select_dtypes("float32").values)


AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [15]:
# Inspect the imputed array (NumPy abbreviates the repr of large arrays).
test_2

array([[ 68.5     , 362.5555  , 150.      , ...,  55.352417, 151.16054 ,
        100.700874],
       [ 29.      , 404.      , 150.      , ...,  55.352417, 151.16054 ,
        100.700874],
       [ 59.      , 490.      , 150.      , ...,  55.352417, 151.16054 ,
        100.700874],
       ...,
       [ 30.95    , 595.      , 150.      , ...,  55.352417, 151.16054 ,
        100.700874],
       [117.      , 481.      , 150.      , ...,  55.352417, 151.16054 ,
        100.700874],
       [279.95    , 170.      , 150.      , ...,  55.352417, 151.16054 ,
        100.700874]], dtype=float32)

In [10]:
# Force a garbage-collection pass; the value displayed is gc.collect()'s return
# value, the number of unreachable objects it found.
gc.collect()

1155