In [24]:
import pandas as pd
import numpy as np

Generates a single random data entry based on the statistics provided.

Parameters:

    location_stats: Mean and standard deviation of latitude and longitude, indexed by (neighbourhood_group, neighbourhood).
        
    price_stats: Price statistics by neighbourhood group.

Returns:

    dict: A dictionary representing a single data row.



In [25]:
def _std_or_zero(std):
    """Return ``std`` as a float, mapping NaN (spread undefined) to 0.0."""
    return 0.0 if np.isnan(std) else float(std)


def generate_random_entry(location_stats, price_stats, rng=None):
    """Generate one synthetic listing from per-neighbourhood statistics.

    Parameters:
        location_stats: DataFrame indexed by (neighbourhood_group,
            neighbourhood) with ('latitude' | 'longitude', 'mean' | 'std')
            columns.
        price_stats: DataFrame indexed by neighbourhood group with at least
            'mean' and 'std' columns.
        rng: Optional random source exposing ``choice`` and ``normal`` (for
            example ``np.random.default_rng(seed)``). Defaults to the global
            ``np.random`` module, which keeps the previous behaviour.

    Returns:
        dict: keys 'neighbourhood_group', 'neighbourhood', 'latitude',
        'longitude' and 'price' (price is rounded and clamped at 0).
    """
    # The np.random module and np.random.Generator share the two methods used
    # here, so either can act as the random source.
    rng = np.random if rng is None else rng

    # Choose a random neighbourhood group (first index level)
    ng = rng.choice(location_stats.index.get_level_values(0).unique())

    # Choose a neighbourhood within the group
    nh = rng.choice(location_stats.xs(ng, level=0).index)

    # Look the (group, neighbourhood) row up once instead of once per statistic
    coords = location_stats.loc[(ng, nh)]

    # A sample standard deviation is NaN when only one listing backs the
    # statistic; passing NaN on would yield NaN coordinates and make
    # round(price) raise. Treat an undefined spread as 0 so the draw
    # collapses to the mean.
    lat = rng.normal(
        coords[('latitude', 'mean')],
        _std_or_zero(coords[('latitude', 'std')]),
    )
    long = rng.normal(
        coords[('longitude', 'mean')],
        _std_or_zero(coords[('longitude', 'std')]),
    )

    # Generate price based on neighbourhood group stats
    price = rng.normal(
        price_stats.loc[ng, 'mean'],
        _std_or_zero(price_stats.loc[ng, 'std']),
    )

    return {
        'neighbourhood_group': ng,
        'neighbourhood': nh,
        'latitude': lat,
        'longitude': long,
        # A normal draw can be negative; prices cannot.
        'price': max(0, round(price))
    }


Appends a single new data entry to the existing dataset.

Parameters:

    file_path: Path to the existing CSV dataset.

In [26]:
def append_new_data(file_path):
    """Append one randomly generated row to the CSV at ``file_path``.

    The file is read in full, summary statistics are derived from it, a single
    entry is produced by ``generate_random_entry`` and the whole dataset plus
    that entry is written back to the same path (without the index column).

    Parameters:
        file_path: Path to the existing CSV dataset; it is overwritten.
    """
    listings = pd.read_csv(file_path)

    # Mean/std of the coordinates for every (group, neighbourhood) pair
    coord_aggregations = {
        'latitude': ['mean', 'std'],
        'longitude': ['mean', 'std'],
    }
    location_stats = (
        listings
        .groupby(['neighbourhood_group', 'neighbourhood'])
        .agg(coord_aggregations)
    )

    # Descriptive price statistics for every neighbourhood group
    price_stats = listings.groupby('neighbourhood_group')['price'].describe()

    # Build a one-row frame from the generated entry
    generated_row = pd.DataFrame(
        [generate_random_entry(location_stats, price_stats)]
    )

    # Stack the new row under the existing rows and overwrite the file
    combined = pd.concat([listings, generated_row], ignore_index=True)
    combined.to_csv(file_path, index=False)


In [29]:
# Entry point: append one generated row to the dataset file.
if __name__ == "__main__":
    # append_new_data overwrites the CSV in place, so every execution of this
    # cell adds one more row to 'AB_NYC_2019.csv' (path is relative).
    append_new_data('AB_NYC_2019.csv')