In [13]:
import pandas as pd 

In [64]:
def reduce_df_size(df):
    '''
    Reduce memory footprint of a Pandas DataFrame by downcasting numeric columns and 
    converting 'object' columns to categorical. Columns of dtype 'bool' or 'datetime64[ns]' 
    are passed through without change. 

    :param DataFrame df: DataFrame to reduce memory footprint of.

    :raises AssertionError: if the shape of reduced size df is not equal to original df.

    :return: The reduced size DataFrame.
    '''
    cols = df.columns
    dtypedict = {"float":['float64', 'float32','float16'],
                "integer":['int64', 'int32', 'int16','int8'],
                "unsigned": ['uint8','uint16', 'uint32'],
                "bool": [],
                "datetime64[ns]": [],
                "category": [],
                "object": ["category"]}

    dfs = {}
    for cast_type, dtypes in dtypedict.items():
        print(cast_type, dtypes)
        if cast_type in ["float", "integer", "unsigned"]:
            dfs[cast_type] = df.select_dtypes(include=dtypes).apply(pd.to_numeric,downcast=cast_type)
        elif cast_type in ["object"]:
            df_obj = df.select_dtypes(include=['object'])
            for c in df_obj.columns:
                try:
                    df_obj[c] = df_obj[c].astype('category')
                except Exception as e:
                    print(e)
            dfs[cast_type] = df_obj
        else:
            if cast_type in df.dtypes.values:
                dfs[cast_type] = df.select_dtypes(include=dtypes)
    res = pd.concat(dfs.values(), axis=1)
    assert res.shape==df.shape
    return res.loc[:,cols]

In [57]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    print("{:03.2f} MB".format(usage_mb))

In [58]:
df = pd.read_csv("../Downloads/Train.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [59]:
mem_usage(df)

814.81 MB


In [60]:
df.dtypes

SalesID                       int64
SalePrice                     int64
MachineID                     int64
ModelID                       int64
datasource                    int64
auctioneerID                float64
YearMade                      int64
MachineHoursCurrentMeter    float64
UsageBand                    object
saledate                     object
fiModelDesc                  object
fiBaseModel                  object
fiSecondaryDesc              object
fiModelSeries                object
fiModelDescriptor            object
ProductSize                  object
fiProductClassDesc           object
state                        object
ProductGroup                 object
ProductGroupDesc             object
Drive_System                 object
Enclosure                    object
Forks                        object
Pad_Type                     object
Ride_Control                 object
Stick                        object
Transmission                 object
Turbocharged                

In [61]:
df2 = reduce_df_size(df)

(401125, 53)
float ['float64', 'float32', 'float16']
integer ['int64', 'int32', 'int16', 'int8']
unsigned ['uint8', 'uint16', 'uint32']
bool []
datetime64[ns] []
category []
object ['category']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(401125, 53)


In [62]:
df2.dtypes

SalesID                        int32
SalePrice                      int32
MachineID                      int32
ModelID                        int32
datasource                     int16
auctioneerID                 float32
YearMade                       int16
MachineHoursCurrentMeter     float32
UsageBand                   category
saledate                    category
fiModelDesc                 category
fiBaseModel                 category
fiSecondaryDesc             category
fiModelSeries               category
fiModelDescriptor           category
ProductSize                 category
fiProductClassDesc          category
state                       category
ProductGroup                category
ProductGroupDesc            category
Drive_System                category
Enclosure                   category
Forks                       category
Pad_Type                    category
Ride_Control                category
Stick                       category
Transmission                category
T

In [63]:
mem_usage(df2)

31.36 MB


In [55]:
df2.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,11/16/2006 0:00,...,,,,,,,,,Standard,Conventional
1,1139248,57000,117657,77,121,3.0,1996,4640.0,Low,3/26/2004 0:00,...,,,,,,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,3.0,2001,2838.0,High,2/26/2004 0:00,...,,,,,,,,,,
3,1139251,38500,1026470,332,121,3.0,2001,3486.0,High,5/19/2011 0:00,...,,,,,,,,,,
4,1139253,11000,1057373,17311,121,3.0,2007,722.0,Medium,7/23/2009 0:00,...,,,,,,,,,,
