In [11]:
import pandas as pd
import gc

df = pd.read_csv('../fines.csv')

df['Calculated'] = df['Fines'] / df['Refund'] * df['Year']


In [12]:
%%timeit
def calculate_with_loop(df):
    results = []
    for i in range(len(df)):
        result = df.iloc[i]['Fines'] / df.iloc[i]['Refund'] * df.iloc[i]['Year']
        results.append(result)
    return results

df['calculated_loop'] = calculate_with_loop(df)

77 ms ± 750 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
%%timeit
def calculate_with_iterrows(df):
    results = []
    for index, row in df.iterrows():
        result = row['Fines'] / row['Refund'] * row['Year']
        results.append(result)
    return results

df['calculated_iterrows'] = calculate_with_iterrows(df)

25.4 ms ± 95.8 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
%%timeit
df['calculated_apply'] = df.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)

5.87 ms ± 55.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit
df['calculated_series'] = (df['Fines'] / df['Refund']) * df['Year']

140 μs ± 1.84 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [16]:
%%timeit
def calculate_with_values(df):
    results = []
    fines = df['Fines'].values
    refund = df['Refund'].values
    year = df['Year'].values
    for i in range(len(df)):
        result = fines[i] / refund[i] * year[i]
        results.append(result)
    return results

df['calculated_values'] = calculate_with_values(df)

638 μs ± 32 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [17]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculated,calculated_loop,calculated_iterrows,calculated_apply,calculated_series,calculated_values
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989,3182400.0,3182400.0,3182400.0,3182400.0,3182400.0,3182400.0
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995,12967500.0,12967500.0,12967500.0,12967500.0,12967500.0,12967500.0
2,7184TT36RUS,1,2100.0,Ford,Focus,1984,4166400.0,4166400.0,4166400.0,4166400.0,4166400.0,4166400.0
3,X582HE161RUS,2,2000.0,Ford,Focus,2015,2015000.0,2015000.0,2015000.0,2015000.0,2015000.0,2015000.0
4,92918M178RUS,1,5700.0,Ford,Focus,2014,11479800.0,11479800.0,11479800.0,11479800.0,11479800.0,11479800.0
...,...,...,...,...,...,...,...,...,...,...,...,...
920,8182XX154RUS,1,200.0,Ford,Focus,1981,396200.0,396200.0,396200.0,396200.0,396200.0,396200.0
921,X796TH96RUS,1,500.0,Ford,Focus,1992,996000.0,996000.0,996000.0,996000.0,996000.0,996000.0
922,T011MY163RUS,2,4000.0,Ford,Focus,2007,4014000.0,4014000.0,4014000.0,4014000.0,4014000.0,4014000.0
923,T341CC96RUS,2,1000.0,Volkswagen,Passat,2005,1002500.0,1002500.0,1002500.0,1002500.0,1002500.0,1002500.0


In [18]:

df.set_index('CarNumber', inplace=True)
specific_row = df.loc['O136HO197RUS']

In [19]:
df.info(memory_usage='deep')

df['Fines'] = pd.to_numeric(df['Fines'], downcast='float')
df['Refund'] = pd.to_numeric(df['Refund'], downcast='float')
df['Year'] = pd.to_numeric(df['Year'], downcast='integer')

optimized_df = df.copy()

<class 'pandas.core.frame.DataFrame'>
Index: 925 entries, Y163O8161RUS to T119CT96RUS
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Refund               925 non-null    int64  
 1   Fines                925 non-null    float64
 2   Make                 925 non-null    object 
 3   Model                914 non-null    object 
 4   Year                 925 non-null    int64  
 5   Calculated           925 non-null    float64
 6   calculated_loop      925 non-null    float64
 7   calculated_iterrows  925 non-null    float64
 8   calculated_apply     925 non-null    float64
 9   calculated_series    925 non-null    float64
 10  calculated_values    925 non-null    float64
dtypes: float64(7), int64(2), object(2)
memory usage: 249.4 KB


In [20]:
initial_memory = df.memory_usage(deep=True).sum()
print(f"Initial memory usage: {initial_memory} bytes")

Initial memory usage: 242466 bytes


In [21]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')

In [22]:
new_memory = df.memory_usage(deep=True).sum()
print(f"New memory usage: {new_memory} bytes")
print(f"Memory usage decreased by: {initial_memory - new_memory} bytes")

New memory usage: 146309 bytes
Memory usage decreased by: 96157 bytes


In [23]:
%reset_selective -f df

gc.collect()

930