In [2]:
# Reduce data size GB --> MB
import pandas as pd
import numpy as np


# Synthetic dataset
train = pd.read_csv('./data/train.csv', index_col=0)
test = pd.read_csv('./data/test.csv', index_col=0)

  from pandas.core import (


In [3]:
train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 11504798 entries, 0 to 11504797
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Gender                object 
 1   Age                   int64  
 2   Driving_License       int64  
 3   Region_Code           float64
 4   Previously_Insured    int64  
 5   Vehicle_Age           object 
 6   Vehicle_Damage        object 
 7   Annual_Premium        float64
 8   Policy_Sales_Channel  float64
 9   Vintage               int64  
 10  Response              int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 2.8 GB


In [4]:
test.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 7669866 entries, 11504798 to 19174663
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Gender                object 
 1   Age                   int64  
 2   Driving_License       int64  
 3   Region_Code           float64
 4   Previously_Insured    int64  
 5   Vehicle_Age           object 
 6   Vehicle_Damage        object 
 7   Annual_Premium        float64
 8   Policy_Sales_Channel  float64
 9   Vintage               int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 1.8 GB


In [5]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,11504798.0,38.383563,14.993459,20.0,24.0,36.0,49.0,85.0
Driving_License,11504798.0,0.998022,0.044431,0.0,1.0,1.0,1.0,1.0
Region_Code,11504798.0,26.41869,12.99159,0.0,15.0,28.0,35.0,52.0
Previously_Insured,11504798.0,0.462997,0.498629,0.0,0.0,0.0,1.0,1.0
Annual_Premium,11504798.0,30461.370411,16454.745205,2630.0,25277.0,31824.0,39451.0,540165.0
Policy_Sales_Channel,11504798.0,112.425442,54.035708,1.0,29.0,151.0,152.0,163.0
Vintage,11504798.0,163.897744,79.979531,10.0,99.0,166.0,232.0,299.0
Response,11504798.0,0.122997,0.328434,0.0,0.0,0.0,0.0,1.0


# Memory Optimization Strategy

We implement a function to convert data types to more memory-efficient alternatives. We are going to follow the following list about datatypes with Pandas in order to avoid data loss:

- `int8`: Ranges from -128 to 127.
- `int16`: Ranges from -32,768 to 32,767.
- `int32`: Ranges from -2,147,483,648 to 2,147,483,647.
- `int64`: Ranges from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807.
- `float16` (Half-precision): Approximate decimal precision of 3 to 4 decimal digits.
- `float32` (Single-precision): Approximate decimal precision of 7 to 9 decimal digits.
- `float64` (Double-precision): Approximate decimal precision of 15 to 17 deciml digits.
l digits.

In [6]:
def converting_datatypes(df):
    df = df.copy()
    try:
        # Converting data types
        df['Gender'] = df['Gender'].astype('category')
        df['Vehicle_Age'] = df['Vehicle_Age'].astype('category')
        df['Vehicle_Damage'] = df['Vehicle_Damage'].astype('category')
        df['Age'] = df['Age'].astype('int8')
        df['Driving_License'] = df['Driving_License'].astype('int8')
        df['Region_Code'] = df['Region_Code'].astype('int8')
        df['Previously_Insured'] = df['Previously_Insured'].astype('int8')
        df['Annual_Premium'] = df['Annual_Premium'].astype('int32')
        df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype('int16')
        df['Vintage'] = df['Vintage'].astype('int16')
        df['Response'] = df['Response'].astype('int8')
        print(df.info(memory_usage='deep'))
    except KeyError as e:
        print(f"Error: {e} not found in DataFrame")
    except Exception as e:
        print(f"An error occurred: {e}")
    return df

In [7]:
train = converting_datatypes(train)

<class 'pandas.core.frame.DataFrame'>
Index: 11504798 entries, 0 to 11504797
Data columns (total 11 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   int8    
 2   Driving_License       int8    
 3   Region_Code           int8    
 4   Previously_Insured    int8    
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        int32   
 8   Policy_Sales_Channel  int16   
 9   Vintage               int16   
 10  Response              int8    
dtypes: category(3), int16(2), int32(1), int8(5)
memory usage: 263.3 MB
None


In [8]:
test = converting_datatypes(test)

Error: 'Response' not found in DataFrame


In [9]:
test.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 7669866 entries, 11504798 to 19174663
Data columns (total 10 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   int8    
 2   Driving_License       int8    
 3   Region_Code           int8    
 4   Previously_Insured    int8    
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        int32   
 8   Policy_Sales_Channel  int16   
 9   Vintage               int16   
dtypes: category(3), int16(2), int32(1), int8(4)
memory usage: 168.2 MB


In [10]:
# Save files as parquet for faster loading
train.to_parquet('./data/train.parquet')
test.to_parquet('./data/test.parquet')

KeyboardInterrupt: 

In [11]:
# convert other files
df = pd.read_csv('./data/sub_train.csv', index_col=0)

df = converting_datatypes(df)
df.to_parquet('./data/sub_train.parquet')

<class 'pandas.core.frame.DataFrame'>
Index: 2830118 entries, 6926847 to 11504796
Data columns (total 11 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   int8    
 2   Driving_License       int8    
 3   Region_Code           int8    
 4   Previously_Insured    int8    
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        int32   
 8   Policy_Sales_Channel  int16   
 9   Vintage               int16   
 10  Response              int8    
dtypes: category(3), int16(2), int32(1), int8(5)
memory usage: 64.8 MB
None
