# Enhancing performance with Python

## Libraries and settings

In [None]:
# Libraries
import os
import csv
import random
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

## Create data set with simulated apartment data

In [None]:
def apmt_sim(n_records):
    """Build a random apartment data frame with ``n_records`` rows.

    The frame has the columns ``id`` (1..n_records), ``price``, ``area``
    and ``rooms``. The area is the number of rooms times a random factor
    from 25 to 44, the price is the area times a random factor from
    25 to 34.
    """

    # The draws happen in this order: rooms, area factor, price factor.
    n_rooms = np.random.randint(1, 8, size=n_records).astype(int)

    m2_per_room = np.random.randint(25, 45, size=n_records)
    living_area = (n_rooms * m2_per_room).astype(int)

    price_per_m2 = np.random.randint(25, 35, size=n_records)
    total_price = (living_area * price_per_m2).astype(int)

    columns = {
        "id": list(range(1, n_records + 1)),
        "price": total_price,
        "area": living_area,
        "rooms": n_rooms,
    }
    return pd.DataFrame(columns)

# Create data frame with 100'000 records
df = apmt_sim(n_records=10**5)

# Save data to file. index=False keeps the row index out of the file: the
# 'id' column already identifies each row, and a written index would come
# back as an extra 'Unnamed: 0' column when the file is read again.
df.to_csv('apartment_data_simulated.csv', sep=';', encoding='utf-8', index=False)

# Summary statistics as a quick sanity check of the simulated values
print(df.describe().round(2))

# Show data frame
df

## Function to calculate the price per m2

In [None]:
# Function
def ppm2(price, area):
    """Return the price per m2, i.e. ``price`` divided by ``area``."""

    price_per_m2 = price / area
    return price_per_m2

## Compare the performance
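Note that, in the code below, the magic command <b style="color:blue">%%timeit</b> measures the execution time of a cell. The option -r 1 limits the measurement to a single repeat; the number of loops within that repeat is still chosen automatically, and the reported time is the mean per loop.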

### Using a for loop

In [None]:
%%timeit -r 1

# For loop with df.iterrows(): every iteration yields (index, row) and the
# price per m2 is calculated for one row at a time with ppm2()
price_m2 = []
for index, row in df.iterrows():
    price_m2.append(ppm2(row['price'], row['area']))

# Write new variable to df
df['price_per_m2'] = price_m2

### Using column division

In [None]:
%%timeit -r 1

# Divide the 'price' column by the 'area' column element-wise and store the
# result as a new column
df['price_per_m2'] = df['price'] / df['area']

### Using .apply() and lambda

In [None]:
%%timeit -r 1

# Call ppm2() once per row (axis=1 hands each row to the lambda)
df['price_per_m2'] = df.apply(lambda row: ppm2(row['price'], row['area']), axis=1)

### Using the built-in map() function

In [None]:
%%timeit -r 1

# map() pairs up the values of both columns and calls ppm2() on each pair;
# list() collects the results so that they can be assigned as a column
df['price_per_m2'] = list(map(ppm2, df['price'], df['area']))

## Enhancing performance when reading and writing data from/to a file

### Reading data frame from a file using pd.read_csv()

In [None]:
%%timeit -r 1

# Parse the semicolon-separated file into a data frame
df_in = pd.read_csv('apartment_data_simulated.csv', sep=';', encoding='utf-8')

### Reading data frame from a file using open() and csv.reader()

In [None]:
%%timeit -r 1

# Open the file for reading. newline='' is what the csv module expects for
# file objects, and the encoding matches the one used when writing the file.
with open("apartment_data_simulated.csv", 'r', newline='', encoding='utf-8') as file:
    csv_reader = csv.reader(file, delimiter=";")

    # csv.reader is lazy: creating it reads nothing, rows are only parsed
    # while iterating. Consume the whole file here so that the timing covers
    # the actual read and is comparable to pd.read_csv().
    header = next(csv_reader, None)  # column names (None if the file is empty)
    rows = list(csv_reader)          # remaining lines as lists of strings


### Writing the data frame to a .csv file using df.to_csv()

In [None]:
%%timeit -r 1

# Export the data columns only (index=False): the row index carries no
# information, and the csv.writer() export of the same data has no index
# column either, so both variants write the same content.
df.to_csv('apartment_data_exported.csv', sep=';', encoding='utf-8', index=False)

### Writing data to a .csv file using open() and csv.writer()

In [None]:
# Convert data frame to a list of rows (one sub-list per row).
# df.values would first build one NumPy array with a single dtype, which
# turns the integer columns into floats as soon as one column is float
# (e.g. 1 -> 1.0); itertuples() keeps each column's own type.
lst = [list(row) for row in df.itertuples(index=False, name=None)]
lst[:5]

In [None]:
%%timeit -r 1

# Column names
column_names = ['id', 'price', 'area', 'rooms', 'price_per_m2']

# Open a file for writing (newline='' lets the csv module control the line
# endings; utf-8 matches the encoding of the pandas export)
with open('apartment_data_exported.csv', 'w', newline='', encoding='utf-8') as file:

    # Create a CSV writer object with a semicolon delimiter
    writer = csv.writer(file, delimiter=';')

    # Write the column names as the first row in the CSV file
    writer.writerow(column_names)

    # Write all sub-lists in one call, one row per sub-list
    writer.writerows(lst)

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Environment summary: OS family, platform and release, time of the run and
# the Python version. Each tuple below is printed as one line.
footer_rule = '-----------------------------------'
footer_lines = (
    (footer_rule,),
    (os.name.upper(),),
    (platform.system(), '|', platform.release()),
    ('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
    ('Python Version:', python_version()),
    (footer_rule,),
)
for parts in footer_lines:
    print(*parts)