In [1]:
# Decorations
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import gmplot
import scipy.stats as stats
from config import api_key

In [2]:
# Load the source tables: df1 is the 1990 housing data (rows with any missing
# value are discarded right away), df2 is the 2018 listing data.
df1 = pd.read_csv('Data/housing.csv').dropna()
df2 = pd.read_csv('Data/Total.csv')

### df1(1990) Analysis

Sort the data by median house value and slice out the top and bottom 5%.

In [3]:
# Sort by house value, most expensive first, and renumber the rows 0..n-1
# so that positional slices below map directly onto value rank.
df1 = (
    df1
    .sort_values(by='median_house_value', ascending=False)
    .reset_index(drop=True)
)
df1.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.42,34.08,48,2413,261.0,770,248,15.0001,500001,<1H OCEAN
1,-117.66,33.48,22,809,180.0,334,157,2.3846,500001,<1H OCEAN
2,-118.5,33.97,29,2737,808.0,1157,696,5.128,500001,<1H OCEAN
3,-122.47,37.73,50,1653,252.0,641,224,10.6605,500001,NEAR OCEAN
4,-122.47,37.73,52,2151,280.0,762,274,10.7309,500001,NEAR OCEAN


In [None]:
# Slice the top 5%: take the first 5% of rows of df1 (by position) and order
# them from highest to lowest house value.
n_top = round(len(df1) * 0.05)
top5 = df1.iloc[:n_top].sort_values('median_house_value', ascending=False)
top5.head()

In [None]:
# Slice the bottom 5%: take the last 5% of rows of df1 (by position) and order
# them from lowest to highest house value.
bottom_start = round(len(df1) * 0.95)
bottom5 = df1.iloc[bottom_start:].sort_values('median_house_value', ascending=True)
bottom5.head()

Plot the top 5% and bottom 5% on a Google map (using the gmplot package)

In [None]:
# Center the map on the median latitude/longitude of the top-5% districts.
# Google Maps zoom levels are small integers (0 shows the whole world), so a
# value of 6 is used to open the map on a roughly state-wide view.
gmap = gmplot.GoogleMapPlotter(top5['latitude'].median(),
                               top5['longitude'].median(), 6)
# Plot scatter points based on LatLng: top 5% in red, bottom 5% in dark blue.
# marker=False draws each point as a circle of the given size instead of a pin.
gmap.scatter(top5['latitude'], top5['longitude'], '#FF0000',
             size=2000, marker=False)
gmap.scatter(bottom5['latitude'], bottom5['longitude'], '#110870',
             size=2000, marker=False)
# Draw out to 'Plot' folder in html format
gmap.draw("Plot/gmap.html")

Based on the plot, we can assume that the median house value is heavily influenced by location.

Now we clean our data and create a new dataframe with the information we need.

First, we look at the relationship between income and house value

In [None]:
# Each row is an independent district, so draw the observations as points.
# A line plot would join them in row order and suggest a sequence that does
# not exist in the data.
plt.scatter(top5['median_house_value'], top5['median_income'])
plt.xlabel('House Value')
plt.ylabel('Income')
plt.title('House Value vs. Income')
plt.show()

The plot above shows no visible correlation between income and house value: there is no specific trend (neither increasing nor decreasing) in house value as income increases. Rather, we can see that people with lower income (for example, income = 2) live in more expensive houses.

It is also possible that number of rooms/bedrooms are related to house value

In [None]:
# NOTE(review): disabled experiment, kept for reference only. It bins
# median_income into five groups and scatters the groups against house value.
# Before re-enabling: the y label and title below say "Population" although
# the plotted column is the income group, and a median_income above 15
# (e.g. 15.0001) falls outside the last bin edge and would come out as NaN.
# bins = [0, 3, 6, 9, 12, 15]
# group_names = ["<3", "3$-6$", "6$-9$", "9$-12$", ">12$"]
# top5['median_income_groups'] = pd.cut(top5['median_income'], bins, labels=group_names)

# plt.scatter(top5['median_house_value'], top5['median_income_groups'])
# plt.xlabel('House Value')
# plt.ylabel('Population')
# plt.title('Population vs House Value')

## Whole 1990 data Total Rooms vs Population

In [None]:
# One red point per district. The rows are unordered observations, so a
# scatter plot is used rather than a line joining them in row order.
plt.scatter(df1['population'], df1['total_rooms'], c='r')
plt.title('Population per District vs. Total Rooms per District')
plt.xlabel('Population')
# The Text object returned by ylabel is kept under its historical name.
pop_vs_rooms = plt.ylabel('Total Rooms')
plt.grid()
plt.savefig("Plot/pop_vs_rooms.png", dpi=300)
plt.show()

We split total rooms into a few subgroups to see their effect on population:

In [None]:
# Bucket districts by total room count. The last edge is open-ended so every
# district above 35k rooms lands in ">35k" instead of becoming NaN.
bins = [0, 5000, 10000, 15000, 20000, 25000, 30000, 35000, np.inf]
group_names = ["<5k", "5k-10k", "10k-15k", "15k-20k", "20k-25k", "25k-30k", "30k-35k", ">35k"]
df1["total_rooms_groups"] = pd.cut(df1['total_rooms'], bins, labels=group_names)

# Plot against the ordered category codes (0 = first bin) and label the ticks
# explicitly, so the x axis follows bin order rather than the order in which
# groups happen to appear in the data. Rows outside every bin are left out.
rooms_grouped = df1.dropna(subset=["total_rooms_groups"])
plt.scatter(rooms_grouped['total_rooms_groups'].cat.codes, rooms_grouped['population'])
plt.xticks(range(len(group_names)), group_names)
plt.xlabel('Total Rooms')
plt.ylabel('Population')
plt.title('Total Rooms per District vs Population per District')
plt.grid()
plt.savefig("Plot/pop_vs_rooms_sub_groups.png", dpi=300)
plt.show()

## Top 5 House Values vs Population

In [None]:
# One point per top-5% district; a scatter avoids joining unrelated districts
# with line segments.
plt.scatter(top5['median_house_value'], top5['population'])
plt.xlabel('House Value')
plt.ylabel('Population')
plt.title('Population vs House Value')
plt.savefig("Plot/Top5_PHV_nosubgroup.png")
plt.show()

Because the plot above is hard to read, especially at high prices, we divided the population into sub-groups:

In [None]:
# Bucket the top-5% districts by population. The last edge is open-ended so
# every district above 7k people lands in ">7k" instead of becoming NaN.
bins = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, np.inf]
group_names = ["<1k", "1k-2k", "2k-3k", "3k-4k", "4k-5k", "5k-6k", "6k-7k", ">7k"]
top5["population_groups"] = pd.cut(top5['population'], bins, labels=group_names)

# Use the ordered category codes on the y axis and label the ticks explicitly
# so groups are stacked in bin order. Rows outside every bin are left out.
top5_grouped = top5.dropna(subset=["population_groups"])
plt.scatter(top5_grouped['median_house_value'], top5_grouped['population_groups'].cat.codes)
plt.yticks(range(len(group_names)), group_names)
plt.xlabel('House Value')
plt.ylabel('Population')
plt.title('Population vs House Value')
plt.savefig("Plot/Top5_PHV.png")
plt.show()

For the top 5 percent, we can see that low-population districts appear across the whole price range, while high-population districts appear only at the highest prices (the bigger the block, the more expensive it is).

## Bottom 5 House Values vs Population

In [None]:
# Bucket the bottom-5% districts by population. The last edge is open-ended so
# every district above 15k people lands in ">15k" instead of becoming NaN.
bins = [0, 3000, 6000, 9000, 12000, 15000, np.inf]
group_names = ["<3k", "3k-6k", "6k-9k", "9k-12k", "12k-15k", ">15k"]
bottom5["population_groups"] = pd.cut(bottom5['population'], bins, labels=group_names)

# Use the ordered category codes on the y axis and label the ticks explicitly
# so groups are stacked in bin order. Rows outside every bin are left out.
bottom5_grouped = bottom5.dropna(subset=["population_groups"])
plt.scatter(bottom5_grouped['median_house_value'], bottom5_grouped['population_groups'].cat.codes)
plt.yticks(range(len(group_names)), group_names)
plt.xlabel('House Value')
plt.ylabel('Population')
plt.title('Population vs House Value')
plt.savefig("Plot/Bottom5_PHV.png")
plt.show()

We can see that low-population districts appear across the whole price range and, interestingly, the same holds for high-population districts among the cheapest houses.

## Ocean Proximity vs Count of Houses

In [None]:
# Count rows per ocean-proximity category for the whole 1990 data set.
y_axis_op = df1.groupby("ocean_proximity")["longitude"].count().rename("count_of_houses")
# Take the bar labels from the grouped result itself so labels and heights
# always line up, whichever categories are present.
x_axis_op = list(y_axis_op.index)
plt.bar(x_axis_op, y_axis_op, color='b', alpha=0.5, align='center')
plt.xlabel("Ocean Proximity")
plt.ylabel("Count of Houses")
# This chart covers every district, not only the top 5%.
op_bar_chart = plt.title("All Districts")
plt.savefig("Plot/all_NOP.png")
plt.show()

The chart shows that, overall, people in 1990 preferred to live near the ocean.

## Top 5 Percent Ocean Proximity vs Count of Houses

In [None]:
# Count top-5% rows per ocean-proximity category.
y_axis_op = top5.groupby("ocean_proximity")["longitude"].count().rename("count_of_houses")
# Take the bar labels from the grouped result itself: a hard-coded label list
# breaks (length mismatch) as soon as a category appears or disappears.
x_axis_op = list(y_axis_op.index)
plt.bar(x_axis_op, y_axis_op, color='b', alpha=0.5, align='center')
plt.xlabel("Ocean Proximity")
plt.ylabel("Count of Houses")
op_bar_chart = plt.title("Top Five Percent")
plt.savefig("Plot/Top5_NOP.png")
plt.show()

## Bottom 5 Percent Ocean Proximity vs Count of Houses

In [None]:
# Count bottom-5% rows per ocean-proximity category.
y_axis_op = bottom5.groupby("ocean_proximity")["longitude"].count().rename("count_of_houses")
# Take the bar labels from the grouped result itself: a hard-coded label list
# breaks (length mismatch) as soon as a category appears or disappears.
x_axis_op = list(y_axis_op.index)
plt.bar(x_axis_op, y_axis_op, color='g', alpha=0.5, align='center')
plt.xlabel("Ocean Proximity")
plt.ylabel("Count of Houses")
op_bar_chart = plt.title("Bottom Five Percent")
plt.savefig("Plot/Bottom5_NOP.png")
plt.show()

Comparing the two bar charts shows a tendency to live near the ocean/bay rather than inland.

## Top Five Ocean Proximity Average Median Price

In [None]:
# Mean of median_house_value per ocean-proximity category within the top 5%,
# rounded to two decimals and shown as a one-column table.
op_mean_house_value_top = (
    top5.groupby("ocean_proximity")["median_house_value"]
    .mean()
    .round(2)
)
op_mean_df_top = op_mean_house_value_top.to_frame()
op_mean_df_top.rename(columns={"median_house_value": "Average Median Price"})

## Bottom Five Ocean Proximity Average Median Price

In [None]:
# Mean of median_house_value per ocean-proximity category within the bottom 5%,
# rounded to two decimals and shown as a one-column table.
op_mean_house_value_bottom = (
    bottom5.groupby("ocean_proximity")["median_house_value"]
    .mean()
    .round(2)
)
op_mean_df_bottom = op_mean_house_value_bottom.to_frame()
op_mean_df_bottom.rename(columns={"median_house_value": "Average Median Price"})

## Top Five Ocean Proximity Max Median Price

In [None]:
# Highest median_house_value per ocean-proximity category within the top 5%.
op_max_house_value_top = top5.groupby("ocean_proximity")["median_house_value"].max().round(2)
op_max_df_top = pd.DataFrame(op_max_house_value_top)
# Label the displayed column as a maximum; the values are not averages.
op_max_df_top.rename(columns={"median_house_value": "Max Median Price"})

## Top Five Ocean Proximity Min Median Price

In [None]:
# Lowest median_house_value per ocean-proximity category within the top 5%.
op_min_house_value_top = top5.groupby("ocean_proximity")["median_house_value"].min().round(2)
op_min_df_top = pd.DataFrame(op_min_house_value_top)
# Label the displayed column as a minimum; the values are not averages.
op_min_df_top.rename(columns={"median_house_value": "Min Median Price"})

## Bottom Five Ocean Proximity Max Median Price

In [None]:
# Highest median_house_value per ocean-proximity category within the bottom 5%.
op_max_house_value_bot = bottom5.groupby("ocean_proximity")["median_house_value"].max().round(2)
op_max_df_bot = pd.DataFrame(op_max_house_value_bot)
# Label the displayed column as a maximum; the values are not averages.
op_max_df_bot.rename(columns={"median_house_value": "Max Median Price"})

## Bottom Five Ocean Proximity Min Median Price

In [None]:
# Lowest median_house_value per ocean-proximity category within the bottom 5%.
op_min_house_value_bot = bottom5.groupby("ocean_proximity")["median_house_value"].min().round(2)
# Store the bottom-5% table under its own name so it does not overwrite the
# top-5% minimum table (op_min_df_top).
op_min_df_bot = pd.DataFrame(op_min_house_value_bot)
# Label the displayed column as a minimum; the values are not averages.
op_min_df_bot.rename(columns={"median_house_value": "Min Median Price"})

The tables above show that although inland districts are less common in the top 5%, their prices were higher in 1990.

### df2(2018) Analysis 

In [None]:
# Preview the first five rows of the 2018 data as loaded.
df2.head(n=5)

In [None]:
# Narrow df2 down to the columns used in the rest of the analysis.
useful_columns = ['Sub Type', 'St#', 'St Name', 'City', 'L/C Price', 'Br/Ba', 'YrBuilt']
df2 = df2[useful_columns]
df2.head()

In [None]:
# Keep the part of YrBuilt before the first "/" as a one-column frame
# (column label 0); entries with no value are filled with 0.
yr_blt = (
    df2['YrBuilt']
    .str.split('/', expand=True)[0]
    .to_frame()
    .fillna(0)
)

In [None]:
# Age of each listing in 2018, computed for the whole column at once and
# aligned on the row index. This replaces a per-row chained assignment
# (df2['Age'][i] = ...), which pandas does not guarantee writes back to df2.
year_built = pd.to_numeric(yr_blt[0], errors='coerce')
# A build year of 0 is the placeholder for "unknown"; keep those ages missing
# rather than reporting a 2018-year-old house.
year_built = year_built.where(year_built > 0)
df2['Age'] = 2018 - year_built
df2.head()

In [None]:
# Split the price text on "$" and keep the piece after the first "$".
# The result is still text, not a number.
price_parts = df2['L/C Price'].str.split('$', expand=True)
df2['house_price'] = price_parts[1]

In [None]:
# Preview df2 with the derived columns added so far.
df2.head(n=5)

In [None]:
# Round-trip "L/C Price" through the index: this moves it to the first column
# and gives the rows a fresh 0..n-1 index.
df2_by_value = df2.set_index("L/C Price").reset_index()
df2_by_value.head()

### Plotting on Gmap

In [None]:
# Build the address sent to the geocoder. The city is included because a
# street number and name alone can match streets in many different places.
df2_by_value["Address"] = (
    df2_by_value["St#"].astype(str)
    + " " + df2_by_value["St Name"].astype(str)
    + ", " + df2_by_value["City"].astype(str)
)
# Coordinates start out missing (NaN) and are filled in by the geocoding step.
df2_by_value['Lat'] = np.nan
df2_by_value['Lng'] = np.nan

In [None]:
# Geocode every address and store the coordinates on the matching row.
base_url = "https://maps.googleapis.com/maps/api/geocode/json"
params = {"key": api_key}
for index, row in df2_by_value.iterrows():
    params['address'] = row['Address']
    # The timeout keeps one stalled request from hanging the whole loop.
    geo_data = requests.get(base_url, params=params, timeout=10).json()
    try:
        location = geo_data["results"][0]["geometry"]["location"]
    except (IndexError, KeyError):
        # No usable match in the response: report the row and move on.
        print(f"Row {index} cannot be found on gmap.")
        continue
    # .loc[row, column] writes a single cell; df[index, "Lat"] = ... would
    # instead create a new column labelled with the tuple (index, "Lat").
    df2_by_value.loc[index, "Lat"] = location["lat"]
    df2_by_value.loc[index, "Lng"] = location["lng"]

In [None]:
# Save the geocoded table, then preview only the first rows instead of
# rendering the entire frame as cell output.
df2_by_value.to_csv("Data/df2_by_value_with_latlng")
df2_by_value.head()

In [None]:
# Write the geocoded table to its CSV file (row index included).
latlng_csv_path = "Data/df2_latlng.csv"
df2_by_value.to_csv(latlng_csv_path)