## Advannced Pandas Optimization

In [5]:
# import california dataset from sklearn
from sklearn.datasets import fetch_california_housing
import pandas as pd
 
housing_dataset = fetch_california_housing()

# Data Set contains three key elements data, feature_names and 

housing_dataframe = pd.DataFrame(data=housing_dataset.data, columns=housing_dataset.feature_names)

 
housing_dataframe['MedInc'] = housing_dataframe['MedInc'].astype('category')

housing_dataframe['MedInc'].head()


0    8.3252
1    8.3014
2    7.2574
3    5.6431
4    3.8462
Name: MedInc, dtype: category
Categories (12928, float64): [0.4999, 0.5360, 0.5495, 0.6433, ..., 14.5833, 14.9009, 15.0000, 15.0001]

In [7]:
housing_dataframe.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')

In [9]:
# downcast the data type of 'AvgBedrms' column to 'float'ArithmeticError
housing_dataframe['AveBedrms'] = pd.to_numeric(housing_dataframe['AveBedrms'], downcast='float')

print(housing_dataframe['AveBedrms'].head())

housing_dataframe['Population'] = housing_dataframe['Population'].astype('int32')
print(housing_dataframe['Population'].head())


0    1.023810
1    0.971880
2    1.073446
3    1.073059
4    1.081081
Name: AveBedrms, dtype: float32
0     322
1    2401
2     496
3     558
4     565
Name: Population, dtype: int32


In [12]:
import numpy as np

df = pd.DataFrame(data=np.c_[housing_dataset['data'], housing_dataset['target']], columns=housing_dataset['feature_names'] + ['target'])

def memory_usage(df):
    return df.memory_usage(deep=True).sum() / 1024 ** 2 # convert bytes to MB

original_memory = memory_usage(df)

# Optimize memory usage in Pandas using downcast parameter
df['AveBedrms'] = pd.to_numeric(df['AveBedrms'], downcast='float')
df['AveRooms'] = pd.to_numeric(df['AveRooms'], downcast='float')
optimized_memory = memory_usage(df)

print(f"Memory usage of the dataframe: {original_memory} MB")
print(f"Memory usage of the optimized dataframe: {optimized_memory} MB")
print(f"Memory saved: {original_memory - optimized_memory} MB")

Memory usage of the dataframe: 1.4173622131347656 MB
Memory usage of the optimized dataframe: 1.2598915100097656 MB
Memory saved: 0.157470703125 MB


In [15]:
import time

# Measure the memory usage of the dataframe
start_memory = memory_usage(housing_dataframe)

# Optimize DataFrame memory usage by downcasting the data types
start_time = time.time()

housing_dataframe['MedInc'] = pd.to_numeric(housing_dataframe['MedInc'], downcast='float')
housing_dataframe['HouseAge'] = pd.to_numeric(housing_dataframe['HouseAge'], downcast='integer')
housing_dataframe['AveRooms'] = pd.to_numeric(housing_dataframe['AveRooms'], downcast='float')
housing_dataframe['AveBedrms'] = pd.to_numeric(housing_dataframe['AveBedrms'], downcast='float')
housing_dataframe['Population'] = pd.to_numeric(housing_dataframe['Population'], downcast='integer')
housing_dataframe['AveOccup'] = pd.to_numeric(housing_dataframe['AveOccup'], downcast='float')
housing_dataframe['Latitude'] = pd.to_numeric(housing_dataframe['Latitude'], downcast='float')
housing_dataframe['Longitude'] = pd.to_numeric(housing_dataframe['Longitude'], downcast='float')

end_memory = memory_usage(housing_dataframe)
end_time = time.time()

print(f"Memory usage before: {start_memory} MB")
print(f"Memory usage after: {end_memory} MB")
print(f"Memory saved: {start_memory - end_memory} MB")
print(f"Time taken: {end_time - start_time} seconds")

# Calculate percentage of memory saved
percentage_memory_saved = ((start_memory - end_memory) / start_memory) * 100
print(f"Percentage of memory saved: {percentage_memory_saved:.2f}%")

Memory usage before: 0.5709571838378906 MB
Memory usage after: 0.5709571838378906 MB
Memory saved: 0.0 MB
Time taken: 0.0066280364990234375 seconds
Percentage of memory saved: 0.00%


In [17]:
# House Age to category type using pd.cut() function   =, The 'HouseAge' feature needs to be split into five equal-width bins labeled 0 to 4.
housing_dataframe['HouseAge'] = pd.cut(housing_dataframe['HouseAge'], bins=5, labels=[0, 1, 2, 3, 4])

housing_dataframe['HouseAge'].head()
 

0    3
1    1
2    4
3    4
4    4
Name: HouseAge, dtype: category
Categories (5, int64): [0 < 1 < 2 < 3 < 4]