# A London Housing Analysis
#### Group Coursework for DS105A: Data for Data Science

**Importing Necessary Packages, Libraries, and Data**

In [None]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.preprocessing import MinMaxScaler

#!pip install folium
import folium
from folium.plugins import HeatMap

#!pip install plotly
import plotly.graph_objects as go
import plotly.express as px

#!pip install plotnine
from plotnine import *

#!pip install dash
from dash import Dash, dcc, html, dash_table
from dash.dependencies import Input, Output

In [207]:
df_expanded = pd.read_csv('Data/secondary/df_expanded.csv')
all_vars_data = pd.read_csv('Data/primary/all_postcode_data.csv')
listing_data = pd.read_csv('Data/primary/all_listing_data.csv')

1. House Prive vs. No. Bedrooms

In [None]:
df_expanded['date_sold'] = pd.to_datetime(df_expanded['date_sold'])
plot = (
    ggplot(df_expanded, aes(x='date_sold', y='display_price', color='num_bedrooms')) +
    geom_point() +
    geom_smooth(method='lm', color='blue', se=False) +
    # xlim('1995-01-01', '2023-12-01') +  
    ylim(0, 2000000) +
    labs(title='Display Prices Under 2 Million Since 1995', x='Date Sold', y='Display Price (£)') +
    theme_minimal()+
    theme(figure_size=(8, 6))
)

plot.save("Data/display/scatter_graph.png")
display(plot)

2. Inflation Heat Map

In [None]:
# creating inflation data frame
inflation = df_expanded[['address', 'display_price', 'date_sold', 'latitude', 'longitude']]

# cleaning and sorting
inflation = inflation.drop_duplicates()
inflation = inflation[inflation['address'].duplicated(keep=False)]
inflation = inflation.sort_values(by=['address', 'date_sold'])
inflation = inflation.groupby('address').filter(lambda x: len(x) <= 2)
inflation = inflation.assign(**inflation.groupby('address')[['display_price', 'date_sold']].diff())
inflation = inflation.dropna()

# applying simple inflation formula
inflation['inflation'] = ((inflation['display_price'] / (inflation['date_sold'].dt.days).astype(int)) / 365 * 100).round(2)



# creating a heat map showing inflation
map_london = folium.Map(location=[51.509865, -0.118092], zoom_start=11)
heat_data = list(zip(inflation['latitude'], inflation['longitude'], inflation['inflation']))


# adding heat map weighted by price
HeatMap(heat_data, 
        min_opacity=0.15,            
        radius=19,                  
        blur=1,                    
        max_zoom=1).add_to(map_london)

display(map_london)
map_london.save("Data/display/inflation_map.html")


3. Heat Map by Price

In [None]:
listing_data_outliers_removed = listing_data[listing_data['display_price'] <= 4000000]

# generate a map centered around London
heat_data = list(zip(listing_data_outliers_removed['latitude'], listing_data_outliers_removed['longitude'], listing_data_outliers_removed['display_price']))

# creating a map of London
map_london = folium.Map(location=[51.509865, -0.118092], zoom_start=11)

# adding heat map weighted by price
HeatMap(heat_data, 
        min_opacity=0.25,            
        radius=16,                  
        blur=15,                    
        max_zoom=1).add_to(map_london)

display(map_london)
map_london.save("Data/display/price_map.html")

4. Heat Map by Crime

In [211]:
# Generate a map centered around London
heat_data = list(zip(listing_data['latitude'], listing_data['longitude'], listing_data['crime rate']))

# Creating a map of London
map_london = folium.Map(location=[51.509865, -0.118092], zoom_start=11)

# Adding heat map to the map, using the 'display_price' column as weights
HeatMap(heat_data, min_opacity=0.2, radius=16, blur=16, max_zoom=1).add_to(map_london)

# Display the map
display(map_london)
map_london.save("Data/display/crime_map.html")

5. Heat Map by AQI

In [None]:
# generate a map centered around London
heat_data = list(zip(listing_data['latitude'], listing_data['longitude'], listing_data['AQI']))

# creating a map of London
map_london = folium.Map(location=[51.509865, -0.118092], zoom_start=11)

# adding heat map to the map, using the 'display_price' column as weights
HeatMap(heat_data, min_opacity=0.2, radius=20, blur=15, max_zoom=1, reverse_gradient=True).add_to(map_london)

display(map_london)
map_london.save("Data/display/AQI_map.html")

6. Crime to Price Ratio

In [None]:
map_heat_crime = folium.Map(location=[51.5074, -0.1278], zoom_start=11)

# calculate price-to-crime ratio
listing_data['price_to_crime_ratio'] = listing_data['display_price'] / listing_data['crime rate']

# ensure data is in the format expected by HeatMap
heat_data_ratio = [[row['latitude'], row['longitude'], row['price_to_crime_ratio']] for index, row in listing_data.iterrows()]

# add HeatMap layer based on price-to-crime ratio
HeatMap(heat_data_ratio, min_opacity=0.15, radius=16, blur=17, max_zoom=1).add_to(map_heat_crime)

del listing_data['price_to_crime_ratio']

map_heat_crime.save("Data/display/price_crime_map.html")
map_heat_crime

#### Correlation

In [None]:
# columns to calculate
columns_of_interest = ['num_bedrooms', 'crime rate', 'AQI', 'central dist', 'rating']
selected_columns = listing_data[columns_of_interest + ['display_price']]

# calculate the correlation matrix
correlation_matrix = selected_columns.corr()

# calculate the variance for each variable
variance = selected_columns.var()

# create a DataFrame for correlation and variance
correlation_and_variance = pd.DataFrame({
    'Correlation with Price': correlation_matrix['display_price'],
    'Variance': variance
})

# making table neater
correlation_and_variance = correlation_and_variance.drop('display_price')
correlation_and_variance = correlation_and_variance.rename({
    'num_bedrooms': 'Number of Bedrooms',
    'central dist': 'Distance from Center of London',
    'rating': 'Overall Rating'
})

correlation_and_variance.to_csv('Data/display/correlation_table')
correlation_and_variance

Unnamed: 0,Correlation with Price,Variance
Number of Bedrooms,0.345345,1.69343
crime rate,-0.170161,74.836127
AQI,0.02204,39.962825
Distance from Center of London,-0.28054,36.859949
Overall Rating,0.304088,153.363383


#### Heat Map Rating

In [None]:
# generate a map centered around London
heat_data = list(zip(listing_data['latitude'], listing_data['longitude'], listing_data['rating']))

# creating a map of London
map_london = folium.Map(location=[51.509865, -0.118092], zoom_start=11)

# adding heat map to the map, using the 'Rating' column as weights
HeatMap(heat_data, min_opacity=0.2, radius=16, blur=16, max_zoom=1).add_to(map_london)

map_london.save("Data/display/rating_map.html")
display(map_london)

#### Line Graph on Variable Impact

In [None]:
# group by data and calculate the mean price for each category
avg_price_by_bedrooms = df_normalise.groupby('num_bedrooms')['price_not_normalised'].mean().reset_index()
avg_price_by_crime = df_normalise.groupby('crime rate')['price_not_normalised'].mean().reset_index()
avg_price_by_AQI = df_normalise.groupby('AQI')['price_not_normalised'].mean().reset_index()
avg_price_by_dist = df_normalise.groupby('central dist')['price_not_normalised'].mean().reset_index()

# saving so it can be called
df_normalise.to_csv('Data/display/normalise.csv')

# create tge line graph
fig = px.line()

# add the variable, price line to the graph
fig.add_scatter(x=avg_price_by_bedrooms['num_bedrooms'], y=avg_price_by_bedrooms['price_not_normalised'], mode='lines', name='Bedrooms')
fig.add_scatter(x=avg_price_by_crime['crime rate'], y=avg_price_by_crime['price_not_normalised'], mode='lines', name='Crime Rate')
fig.add_scatter(x=avg_price_by_AQI['AQI'], y=avg_price_by_AQI['price_not_normalised'], mode='lines', name='AQI')

# creating a line of best fit for distance from center 
slope, intercept = np.polyfit(avg_price_by_dist['central dist'], avg_price_by_dist['price_not_normalised'], 1)
line_trace = go.Scatter(x=avg_price_by_dist['central dist'], y=slope * avg_price_by_dist['central dist'] + intercept + 200000, mode='lines', name='Distance from City Centre (best fit)', line=dict(color='blue'))
fig.add_trace(line_trace)

# finish layout
fig.update_layout(title='Variable Correlation with Price', xaxis_title='(Relative) Lowest to Highest Value', yaxis_title='Price (£)')
fig.update_yaxes(range=[0, 8000000])

fig.show()
fig.write_html("Data/display/line_graph.html")

#### Interactive Table

In [None]:
# deleting columns from table that make it look messy when visualizing
del listing_data['property_type']
del listing_data['latitude']
del listing_data['longitude']
del listing_data['Unnamed: 0']
del listing_data['has_floor_plan']
del listing_data['date_sold']
del listing_data['url']
del listing_data['postcode']

# making remaining data look more clean
listing_data['crime rate'] = listing_data['crime rate'].round(2)
listing_data['rating'] = listing_data['rating'].round(2)

# create a Dash web application
app = Dash(__name__)

# define the layout of the web application
app.layout = html.Div([
    html.H1("Interactive House Listing Table"),
    
    # dropdown for selecting number of bedrooms
    dcc.Dropdown(
        id='bedroom-dropdown',
        options=[{'label': str(i), 'value': i} for i in range(1, 10)],
        value=None,
        placeholder="Select number of bedrooms"
    ),
    
    # dropdown for maximum price
    dcc.Dropdown(
        id='price-dropdown',
        options=[{'label': str(i), 'value': i} for i in range(0, 8000000, 100000)],
        value=None,
        placeholder="Select maximum price"
    ),
    
    # dropdown for selecting minimum AQI
    dcc.Dropdown(
    id='aqi-dropdown',
    options=[{'label': str(i), 'value': i} for i in range(60, 101)], 
    value=None,
    placeholder="Select minimum AQI"
    ),
    
    # dropdown for maximum crime rate
    dcc.Dropdown(
        id='crime-dropdown',
        options=[{'label': str(i), 'value': i} for i in range(0, 100, 10)],
        value=None,
        placeholder="Select maximum crime rate (per 1000)"
    ),
    
    # dropdown for maximum distance from center of London
    dcc.Dropdown(
        id='distance-dropdown',
        options=[{'label': str(i), 'value': i} for i in range(0, 50, 1)],
        value=None,
        placeholder="Select maximum distance from center (km)"
    ),
    
    # creating output table
    dash_table.DataTable(id='listing-table'),
])

# define callback to update the table based on user input
@app.callback(
    Output('listing-table', 'data'),
    [Input('bedroom-dropdown', 'value'),
     Input('price-dropdown', 'value'),
     Input('aqi-dropdown', 'value'),
     Input('crime-dropdown', 'value'),
     Input('distance-dropdown', 'value')]
)
# registering User input
def update_table(selected_bedrooms, max_price, min_aqi, max_crime, max_distance):
    # filter the data based on user input
    filtered_data = listing_data[
        (listing_data['num_bedrooms'] == selected_bedrooms if selected_bedrooms else True) &
        (listing_data['display_price'] <= max_price if max_price else True) &
        (listing_data['AQI'] >= min_aqi if min_aqi else True) &
        (listing_data['crime rate'] <= max_crime if max_crime else True) &
        (listing_data['central dist'] <= max_distance if max_distance else True)
    ]

    # convert filtered data to dictionary format for DataTable
    table_data = filtered_data.to_dict('records')

    return table_data

# run web application
app.run_server(debug=True)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(
    self=Index(['address', 'num_bedrooms', 'display_price... 'central dist', 'rating'],
      dtype='object'),
    key=True
)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
        casted_key = True
        self = Index(['address', 'num_bedrooms', 'display_price', 'crime rate', 'AQI',
       'central dist', 'rating'],
      dtype='object')
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: Tru