In [72]:
import pandas as pd 

In [70]:
def reduce_df_size(df):
    '''
    Reduce memory footprint of a Pandas DataFrame by downcasting numeric columns and 
    converting 'object' columns to categorical. Columns of dtype 'bool' or 'datetime64[ns]' 
    are passed through without change. 

    :param DataFrame df: DataFrame to reduce memory footprint of.

    :raises AssertionError: if the shape of reduced size df is not equal to original df.

    :return: The reduced size DataFrame.
    '''
    cols = df.columns
    dtypedict = {"float":['float64', 'float32','float16'],
                "integer":['int64', 'int32', 'int16','int8'],
                "unsigned": ['uint8','uint16', 'uint32'],
                "bool": [],
                "datetime64[ns]": [],
                "category": [],
                "object": ["category"]}

    dfs = {}
    for cast_type, dtypes in dtypedict.items():
        if cast_type in ["float", "integer", "unsigned"]:
            dfs[cast_type] = df.select_dtypes(include=dtypes).apply(pd.to_numeric,downcast=cast_type)
        elif cast_type in ["object"]:
            df_obj = df.select_dtypes(include=['object'])
            for c in df_obj.columns:
                try:
                    df_obj.loc[:,c] = df_obj[c].astype('category')
                except Exception as e:
                    print(e)
            dfs[cast_type] = df_obj
        else:
            if cast_type in df.dtypes.values:
                dfs[cast_type] = df.select_dtypes(include=dtypes)
    res = pd.concat(dfs.values(), axis=1)
    assert res.shape==df.shape
    return res.loc[:,cols]

In [57]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    print("{:03.2f} MB".format(usage_mb))

In [58]:
df = pd.read_csv("Train (1).zip")

  interactivity=interactivity, compiler=compiler, result=result)


In [59]:
mem_usage(df)

814.81 MB


In [76]:
df.head().T

Unnamed: 0,0,1,2,3,4
SalesID,1139246,1139248,1139249,1139251,1139253
SalePrice,66000,57000,10000,38500,11000
MachineID,999089,117657,434808,1026470,1057373
ModelID,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneerID,3,3,3,3,3
YearMade,2004,1996,2001,2001,2007
MachineHoursCurrentMeter,68,4640,2838,3486,722
UsageBand,Low,Low,High,High,Medium
saledate,11/16/2006 0:00,3/26/2004 0:00,2/26/2004 0:00,5/19/2011 0:00,7/23/2009 0:00


In [71]:
df2 = reduce_df_size(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [75]:
df2.head().T

Unnamed: 0,0,1,2,3,4
SalesID,1139246,1139248,1139249,1139251,1139253
SalePrice,66000,57000,10000,38500,11000
MachineID,999089,117657,434808,1026470,1057373
ModelID,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneerID,3,3,3,3,3
YearMade,2004,1996,2001,2001,2007
MachineHoursCurrentMeter,68,4640,2838,3486,722
UsageBand,Low,Low,High,High,Medium
saledate,11/16/2006 0:00,3/26/2004 0:00,2/26/2004 0:00,5/19/2011 0:00,7/23/2009 0:00


In [63]:
mem_usage(df2)

31.36 MB
