# EDA Example
- References
    - https://github.com/ageron/handson-ml2

## Data Loading

In [None]:
import pandas as pd

In [None]:
# Load the housing dataset from the local CSV file into a DataFrame.
df = pd.read_csv("data/housing.csv")

## 요약 통계

In [None]:
# Preview the first few rows to see the columns and typical values.
df.head()

In [None]:
# Print a concise summary: column names, dtypes and non-null counts.
df.info()

In [None]:
# Count how many rows fall into each distinct "ocean_proximity" value.
df["ocean_proximity"].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()


## 속성별 분포 확인

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
df.hist(bins=50, figsize=(20,15))
plt.show()

## 추가적인 분석 전에 미리 데이터셋 나누기

In [None]:
from sklearn.model_selection import train_test_split

# This course uses simple random sampling: 20% of the rows are held out as the
# test set, and the fixed random_state keeps the split reproducible.
train_set, test_set = train_test_split(df, random_state=42, test_size=0.2)

## 위치정보를 기반으로 시각화

### 단순 Scatter Plot

In [None]:
# Plot every training row at its (longitude, latitude) position.
train_set.plot(kind="scatter", x="longitude", y="latitude")

### 투명도를 활용한 Scatter Plot

In [None]:
# Same plot with mostly transparent markers, so overlapping points build up
# and high-density areas stand out.
train_set.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

### 주택의 중위 가격을 포함하는 Scatter Plot

In [None]:
# Geographic scatter plot where marker size encodes population (scaled down by
# 100) and marker colour encodes the median house value on the "jet" colormap.
train_set.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    figsize=(10, 7),
    alpha=0.4,
    s=train_set["population"] / 100,
    label="population",
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    sharex=False,
)
plt.legend()

### 지도 이미지와 함께 Plot 해보기

In [None]:
import matplotlib.image as mpimg
import numpy as np

# Background map image; the extent passed to imshow() below places it in the
# same data coordinates as the scatter (x = longitude, y = latitude).
california_img = mpimg.imread("data/california.png")

# Marker size encodes population (scaled down by 100); marker colour encodes
# the median house value. The automatic colorbar is disabled because a custom
# one with dollar labels is built further down.
ax = train_set.plot(
    kind="scatter", x="longitude", y="latitude", figsize=(10, 7),
    s=train_set["population"] / 100, label="Population",
    c="median_house_value", cmap=plt.get_cmap("jet"),
    colorbar=False, alpha=0.4,
)
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

# Attach the colorbar to the scatter's own collection rather than to the
# "current image" (the map, whose colour scale runs 0..1). That way the ticks
# are expressed directly in price units and line up with the colours actually
# used for the points.
prices = train_set["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar(ax.collections[0], ax=ax, ticks=tick_values)
cbar.ax.set_yticklabels(["$%dk" % round(v / 1000) for v in tick_values], fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

plt.legend(fontsize=16)
plt.show()

## 속성들 간의 상관관계 구하기

In [None]:
# Pairwise correlations between the numeric attributes. Non-numeric columns
# (e.g. "ocean_proximity") are filtered out first: DataFrame.corr() raises on
# them in pandas >= 2.0, whereas older versions dropped them silently.
corr_matrix = train_set.select_dtypes(include="number").corr()

In [None]:
# Rank attributes by their correlation with the target, strongest positive first.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the target and three attributes picked for a
# closer look.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(train_set[attributes], figsize=(12, 8))


In [None]:
# Zoom in on median income vs. median house value; transparent markers make
# dense regions visible.
train_set.plot(
    kind="scatter",
    x="median_income",
    y="median_house_value",
    alpha=0.1,
)
# Fix the viewport to [xmin, xmax, ymin, ymax].
plt.axis([0, 16, 0, 550000])


## 피처 생성
- 세대당 방의 개수가 중요하지 않을까?
- 전체 방 개수 대비 침실 개수의 비율이 중요한 게 아닐까?
- 1인 가구인지 다인 가구인지(세대당 인구수)에 따라 다르지 않을까?

In [None]:
# Derive three ratio features from the raw counts. assign() returns a new
# DataFrame instead of writing columns into train_set in place; in-place writes
# on a frame derived from df can raise SettingWithCopyWarning on some
# pandas/scikit-learn versions. Rebinding train_set keeps later cells unchanged.
train_set = train_set.assign(
    rooms_per_household=train_set["total_rooms"] / train_set["households"],
    bedrooms_per_room=train_set["total_bedrooms"] / train_set["total_rooms"],
    population_per_household=train_set["population"] / train_set["households"],
)

In [None]:
# Recompute the correlations so the derived ratio features are included, then
# rank every attribute against the target. Non-numeric columns are filtered
# out first because DataFrame.corr() raises on them in pandas >= 2.0.
corr_matrix = train_set.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Scatter of the derived rooms_per_household feature against the target.
train_set.plot(
    kind="scatter",
    x="rooms_per_household",
    y="median_house_value",
    alpha=0.2,
)
# Fix the viewport to [xmin, xmax, ymin, ymax].
plt.axis([0, 5, 0, 520000])
plt.show()

In [None]:
# Summary statistics of the training set, including the derived ratio columns.
train_set.describe()
