In [None]:
"""
@author Dylan Nguyen
@email me@dylanhnguyen.com
@website https://dylanhnguyen.com

This workbook describes the preliminary EDA for the Trulia real estate housing price dataset
""";

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import numpy as np

from scipy import stats

: 

## Importing and Formatting Data Set

In [None]:
# Open the Trulia housing SQLite database (.db file), load the "cleaned data"
# table into a pandas DataFrame, and always release the connection afterwards.
conn = sqlite3.connect(r'house-data.db')
try:
    # The whole table is read at once; the cells below work on `df`.
    df = pd.read_sql_query("SELECT * FROM trulia_house_SB_data_cleaned", conn)
finally:
    # Close even when the query raises so the database handle is not leaked.
    conn.close()

: 

## Basic EDA

In [None]:
# Inspect the basic features of the dataset.
# Only the last expression of a cell is rendered, so the column names are
# printed explicitly; a bare `df.columns` here would be evaluated and dropped.
print(df.columns.tolist())
df.head()


: 

In [None]:
# Count how many rows fall under each value of the 'zip' column.
df['zip'].value_counts()


## Exploratory Data Analysis

In [None]:
# Columns fed to the correlation heatmap: the price, the size/age features,
# and the has_* / *_was_missing flag columns.
numeric_cols = [
    'price',
    'num_bedrooms',
    'num_baths',
    'building_sqft',
    'lot_area',
    'house_age',
    'has_garage',
    'has_fireplace',
    'has_ocean_views',
    'has_mountain_views',
    'has_hope_ranch',
    'has_montecito',
    'building_sqft_was_missing',
]

In [None]:
# Subset of columns drawn against each other in the pair plot.
pair_cols = [
    'price',
    'num_bedrooms',
    'num_baths',
    'building_sqft',
    'lot_area',
    'house_age',
]


### Histogram of house price
* *Conclusions:*

In [None]:
# Histogram (with KDE overlay) of the listing price.
#
# The seaborn style is applied FIRST: sns.set_style() resets the style-level
# rcParams (including "axes.edgecolor"), so any override made before the call
# would be overwritten and the black frame requested below would be lost.
sns.set_style("white")
sns.set_color_codes(palette='deep')
plt.rcParams.update({
    'font.size': 15,
    'axes.edgecolor': 'black',
    'axes.linewidth': 1.5,
})

f, ax = plt.subplots(figsize=(8, 7))

sns.histplot(df['price'], ax=ax, kde=True, legend=False)
ax.xaxis.grid(False)
ax.set(xlabel="price", ylabel="Frequency")

plt.show()

: 

In [None]:
# Side-by-side view of the raw price distribution and its Box-Cox transform.
# stats.boxcox returns a tuple; element 0 holds the transformed values.
normalized_data = stats.boxcox(df['price'])

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))
left_panel, right_panel = ax

# Left: untouched prices.
sns.histplot(df['price'], kde=True, legend=False, ax=left_panel)
left_panel.set_title("original Data")

# Right: Box-Cox transformed prices.
sns.histplot(normalized_data[0], kde=True, legend=False, ax=right_panel)
right_panel.set_title("Normalized data")

plt.show()


### Pairplots

In [None]:
# Pairwise scatter plots / marginal distributions for the pair_cols columns.
sns.pairplot(data=df[pair_cols])

### Heatmap

In [None]:
# Heatmap of the pairwise correlations between the numeric_cols columns.
f, ax = plt.subplots(figsize=(8, 7))
corr_matrix = df[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=False, ax=ax)

### In-Depth Histograms

In [None]:
# Histograms of the price and the main numeric features on a 2x3 grid.
# Each entry is (column, draw_kde); the KDE overlay is disabled for
# num_bedrooms and num_baths.
hist_specs = [
    ('price', True),
    ('num_bedrooms', False),
    ('num_baths', False),
    ('building_sqft', True),
    ('lot_area', True),
    ('house_age', True),
]

fig, axes = plt.subplots(2, 3, figsize=(15, 7))

# axes.flat walks the grid row by row, which matches the order of hist_specs.
for panel, (column, draw_kde) in zip(axes.flat, hist_specs):
    sns.histplot(df[column], ax=panel, kde=draw_kde, legend=False)

# Lay out after plotting so the axis labels added by histplot are accounted for.
fig.tight_layout(pad=4.0)
plt.show()


: 

In [None]:
# Normalizing data: each row of the 4x2 grid shows one column as-is (left)
# and after a Box-Cox or log1p transform (right). The transformed values are
# also stored on df under '<column>_norm'.


def _boxcox_values(series):
    """Return only the transformed values from stats.boxcox (element 0)."""
    return stats.boxcox(series)[0]


# (column, transform, draw_kde) -- one entry per grid row, top to bottom.
norm_specs = (
    ('num_baths', _boxcox_values, False),
    ('lot_area', _boxcox_values, True),
    ('building_sqft', np.log1p, True),
    ('price', np.log1p, True),
)

fig, ax = plt.subplots(4, 2, figsize=(10, 15))
fig.tight_layout(pad=5.0)

for row, (column, transform, draw_kde) in enumerate(norm_specs):
    norm_column = column + '_norm'

    # Left panel: original values.
    sns.histplot(df[column], ax=ax[row, 0], kde=draw_kde, legend=False)

    # Right panel: transformed values, persisted as a new df column.
    df[norm_column] = transform(df[column])
    sns.histplot(df[norm_column], ax=ax[row, 1], kde=draw_kde, legend=False)

plt.show()

### Boxplot

In [None]:
# Visualize the distribution and outliers of the price and the main numeric
# features, one box plot per panel on a 2x3 grid.
box_columns = ['price', 'num_bedrooms', 'num_baths',
               'building_sqft', 'lot_area', 'house_age']

fig, axes = plt.subplots(2, 3, figsize=(15, 7), frameon=True)

# axes.flat walks the grid row by row, which matches the order of box_columns.
for panel, column in zip(axes.flat, box_columns):
    df.boxplot(column=column, ax=panel)

# Lay out after drawing so tick labels of the finished plots are accounted for.
fig.tight_layout(pad=3.0)
plt.show()

### Strip Plot

In [None]:
# Strip plot of price, one strip per value of the 'zip' column.
fig, ax = plt.subplots(figsize=(10, 5))
sns.stripplot(data=df, x='zip', y='price', ax=ax)

### Boxplot of zip code prices

In [None]:
# Box plot of price grouped by the 'zip' column.
f, ax = plt.subplots(figsize=(8, 6))

# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes". sns.boxplot returns an Axes (not a Figure), so the
# result is addressed through `ax`.
sns.boxplot(x='zip', y='price', data=df, ax=ax)

# NOTE(review): the 0-10 y-range presumably assumes price is stored in
# millions -- confirm against the table; raw dollar values would fall outside.
ax.set_ylim(0, 10)
plt.show()
