In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def get_dataset(size, seed=None):
    """Create a fake car-listing dataset with randomly generated columns.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, all columns are drawn from a private
        ``np.random.RandomState(seed)``, so the same seed always yields the
        same frame. When omitted (the default), values are drawn from NumPy's
        global random state, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``model``, ``fuel``, ``production_date``,
        ``transmission``, ``engine_power``, ``price``, ``count``.
    """
    # The np.random module functions and RandomState share the same method
    # names, so either object can serve as the random source below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Columns are drawn in a fixed order so a seeded run is reproducible.
    # randint() excludes its upper bound: years 1990..2023, power 129..608,
    # count 1..29.
    return pd.DataFrame({
        'model': rng.choice(['m1', 'm2', 'm3', 'm4'], size),
        'fuel': rng.choice(['petrol', 'diesel', 'gas'], size),
        'production_date': rng.randint(1990, 2024, size),
        'transmission': rng.choice(['mechanical', 'automatic', 'robotic'], size),
        'engine_power': rng.randint(129, 609, size),
        'price': rng.uniform(60000., 12460000, size),
        'count': rng.randint(1, 30, size),
    })

**Изучаем эффективное выделение памяти в Pandas**
1. int8: -128 to 127
2. int16: -32768 to 32767
3. int32: -2147483648 to 2147483647
4. int64: -9223372036854775808 to 9223372036854775807

In [3]:
SIZE = 1_000_000
SEED = 42  # fixed so that Restart & Run All regenerates the same fake dataset

# get_dataset() draws from NumPy's global random state when called this way,
# so seeding that state here makes the generated frame reproducible.
np.random.seed(SEED)
df = get_dataset(SIZE)

In [4]:
# Preview the first five generated rows.
df.head(n=5)

Unnamed: 0,model,fuel,production_date,transmission,engine_power,price,count
0,m2,petrol,2015,automatic,535,4295965.0,3
1,m3,gas,2006,mechanical,336,5368480.0,13
2,m2,diesel,1997,automatic,275,4274000.0,1
3,m2,gas,2014,robotic,550,10445410.0,19
4,m1,petrol,2015,mechanical,302,2084916.0,23


In [5]:
# Keep an independent snapshot of the untouched frame as the "before" baseline;
# the cells below change dtypes on df itself.
df_start = df.copy(deep=True)

In [11]:
# Column dtypes plus a memory estimate. A trailing "+" on the reported size
# means object columns are counted shallowly (8-byte pointers only), so the
# real footprint is larger; df.info(memory_usage='deep') reports the full figure.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   model            1000000 non-null  object 
 1   fuel             1000000 non-null  object 
 2   production_date  1000000 non-null  int64  
 3   transmission     1000000 non-null  object 
 4   engine_power     1000000 non-null  int64  
 5   price            1000000 non-null  float64
 6   count            1000000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 53.4+ MB


In [12]:
# Per-column bytes for the original dtypes (shallow: object columns count
# 8 bytes per row for the pointer, not the strings themselves).
df.memory_usage(index=True, deep=False)

Index                  132
model              8000000
fuel               8000000
production_date    8000000
transmission       8000000
engine_power       8000000
price              8000000
count              8000000
dtype: int64

In [13]:
# Total shallow footprint in bytes, index included - the starting point.
per_column_bytes = df.memory_usage()
per_column_bytes.sum()

56000132

**production_date, engine_power**

**Изучаем эффективное выделение памяти в Pandas**
1. int8: -128 to 127
2. int16: -32768 to 32767
3. int32: -2147483648 to 2147483647
4. int64: -9223372036854775808 to 9223372036854775807

In [14]:
# Observed value range decides the narrowest integer dtype that is safe.
production_years = df['production_date']
print(production_years.min(), production_years.max())

1990 2023


In [15]:
# Same range check for engine power.
engine_powers = df['engine_power']
print(engine_powers.min(), engine_powers.max())

129 608


In [16]:
int_large_cols = ['production_date', 'engine_power']

# astype() to a narrower integer type wraps around silently on overflow, so
# verify that every value fits in int16 before converting.
int16_bounds = np.iinfo('int16')
observed_min = df[int_large_cols].min().min()
observed_max = df[int_large_cols].max().max()
assert int16_bounds.min <= observed_min and observed_max <= int16_bounds.max, (
    f"values in {int_large_cols} span [{observed_min}, {observed_max}] "
    "and do not fit in int16"
)

df[int_large_cols] = df[int_large_cols].astype('int16')

In [18]:
# Per-column bytes after the int16 conversion (shallow measurement).
df.memory_usage(index=True, deep=False)

Index                  132
model              8000000
fuel               8000000
production_date    2000000
transmission       8000000
engine_power       2000000
price              8000000
count              8000000
dtype: int64

In [17]:
# Total shallow footprint in bytes after the int16 conversion.
per_column_bytes = df.memory_usage()
per_column_bytes.sum()

44000132

**count**

In [19]:
# Range check for count before picking its integer width.
counts = df['count']
print(counts.min(), counts.max())

1 29


In [20]:
# astype() to a narrower integer type wraps around silently on overflow, so
# verify that every count fits in int8 before converting.
int8_bounds = np.iinfo('int8')
count_min, count_max = df['count'].min(), df['count'].max()
assert int8_bounds.min <= count_min and count_max <= int8_bounds.max, (
    f"'count' spans [{count_min}, {count_max}] and does not fit in int8"
)

df['count'] = df['count'].astype('int8')

In [21]:
# Per-column bytes after the int8 conversion of count (shallow measurement).
df.memory_usage(index=True, deep=False)

Index                  132
model              8000000
fuel               8000000
production_date    2000000
transmission       8000000
engine_power       2000000
price              8000000
count              1000000
dtype: int64

In [22]:
# Total shallow footprint in bytes after the int8 conversion.
per_column_bytes = df.memory_usage()
per_column_bytes.sum()

37000132

**category**

In [26]:
# Re-encode every remaining object column as a pandas categorical,
# one column at a time.
cat_cols = df.select_dtypes('object').columns
for cat_col in cat_cols:
    df[cat_col] = df[cat_col].astype('category')

In [27]:
# Per-column bytes after the categorical conversion (shallow measurement).
df.memory_usage(index=True, deep=False)

Index                  132
model              1000204
fuel               1000132
production_date    2000000
transmission       1000132
engine_power       2000000
price              8000000
count              1000000
dtype: int64

In [28]:
# Total shallow footprint in bytes after all dtype changes.
per_column_bytes = df.memory_usage()
per_column_bytes.sum()

16000600

**result**

In [29]:
# Independent snapshot of the optimized frame as the "after" state.
df_end = df.copy(deep=True)

In [30]:
# Byte totals before and after the dtype changes. memory_usage() is shallow
# by default (deep=False): object columns in df_start are counted as 8-byte
# pointers only, so the "before" figure is a lower bound on the real footprint.
bytes_before = df_start.memory_usage().sum()
bytes_after = df_end.memory_usage().sum()
print(bytes_before, bytes_after, sep="   ")

56000132   16000600
