In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Load the dataset.
# Set the display option first so the preview below is rendered with it.
# (Rows can be limited the same way with 'display.max_rows', N.)
pd.set_option('display.max_columns', 10)
housing = pd.read_csv('/content/housing.csv')
# The preview must be the last expression of the cell, otherwise it is not displayed.
housing.head()

In [4]:
housing.shape # (row count, column count)

(20640, 10)

In [6]:
# List the column labels of the frame.
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [None]:
housing.info() # missing values are present; the column at index 9 (ocean_proximity) has object dtype

In [8]:
# Distinct labels found in the ocean_proximity column.
housing['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [None]:
# Number of rows for each ocean_proximity label.
housing['ocean_proximity'].value_counts()

In [None]:
# Longitude/latitude scatter of every row; points are drawn semi-transparent.
housing.plot.scatter(
    x='longitude',
    y='latitude',
    alpha=0.3,
    grid=True,
    figsize=(8, 6),
)
plt.show()

In [None]:
# Marker size is proportional to the row's population;
# marker colour encodes median_house_value through the 'turbo' colormap.
housing.plot.scatter(
    x='longitude',
    y='latitude',
    s=housing['population'] / 100,
    c='median_house_value',
    cmap='turbo',
    colorbar=True,
    label='population',
    grid=True,
    figsize=(8, 6),
)

In [None]:
# Population/price scatter (size = population / 100, colour = median_house_value).
housing.plot.scatter(
    x='longitude',
    y='latitude',
    s=housing['population'] / 100,
    c='median_house_value',
    cmap='turbo',
    colorbar=True,
    label='population',
    grid=True,
    figsize=(8, 6),
)

# Draw the background image behind the points, stretched to the same axis limits.
IMAGE_NAME = '/content/california.png'
cal_img = plt.imread(IMAGE_NAME)
axis = (-124.55, -113.8, 32.45, 42.3)  # x-min, x-max, y-min, y-max
plt.axis(axis)
plt.imshow(cal_img, extent=axis)

In [None]:
pd.factorize(housing['ocean_proximity']) # convert categorical data -> numeric codes

In [16]:
# Keep only the integer codes (element 0 of the factorize result) as a new column.
housing['ocean_p'] = pd.factorize(housing['ocean_proximity'])[0]

In [None]:
# NOTE(review): this cell repeats the earlier map-overlay cell unchanged;
# the ocean_p column is not referenced here.
housing.plot.scatter(
    x='longitude', y='latitude',
    s=housing['population'] / 100,  # marker size proportional to population
    c='median_house_value',         # colour by median house value
    cmap='turbo',
    colorbar=True,
    label='population',
    grid=True,
    figsize=(8, 6),
)

IMAGE_NAME = '/content/california.png'
cal_img = plt.imread(IMAGE_NAME)
axis = (-124.55, -113.8, 32.45, 42.3)
plt.axis(axis)
plt.imshow(cal_img, extent=axis)

In [None]:
# One 50-bin histogram per column that DataFrame.hist can plot.
housing.hist(bins=50, figsize=(12, 11))

In [None]:
# Bucket median_income into five labelled categories (1-5).
# The final edge is infinity, so the last bucket has no upper bound.
bins = [0.0, 1.5, 3.0, 4.5, 6.0, np.inf]
housing_income = pd.cut(housing['median_income'], bins=bins, labels=list(range(1, 6)))
housing_income

In [None]:
# Rows per income category, ordered by category label rather than by frequency.
housing_income_sort = (
    housing_income
    .value_counts()
    .sort_index()
)
housing_income_sort

In [None]:
# Bar chart of the per-category counts with horizontal tick labels.
housing_income_sort.plot.bar(rot=0, grid=True)
plt.xlabel('Income category')
plt.ylabel('Number of districts')
plt.show()

In [None]:
# Pairwise correlation between features.
# numeric_only=True leaves out non-numeric columns such as ocean_proximity.
# NOTE(review): once the ocean_p cell has run, its integer category codes are
# included in this matrix too — confirm that is intended.
corr = housing.corr(numeric_only=True)
corr

In [None]:
# NOTE(review): seaborn is imported here rather than in the import cell at the top of the notebook.
import seaborn as sns
# Heatmap of the correlation matrix with each cell annotated by its value.
ax = sns.heatmap(corr, annot=True, cmap='coolwarm')

In [None]:
# Correlation of every feature with median_house_value, largest value first.
corr['median_house_value'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for three selected columns.
attr = ['median_house_value', 'median_income', 'housing_median_age']
scatter_matrix(housing.loc[:, attr], figsize=(10, 6))
plt.show()

In [None]:
# Preprocessing
# Scatter of median_income against median_house_value.
housing.plot.scatter(
    x='median_income',
    y='median_house_value',
    alpha=0.5,
    grid=True,
)
plt.show()

In [None]:
# Frequency of each distinct median_house_value (rows equal to 500001 are dropped in the following cell).
housing.median_house_value.value_counts()

In [33]:
# Keep only the rows whose median_house_value is not 500001.
# NOTE(review): 500001 looks like a cap/sentinel value — confirm against the data description.
mask = housing['median_house_value'].ne(500001)
# .copy() gives an independent frame, so later column assignments do not touch a slice.
housing = housing.loc[mask].copy()

In [None]:
# Summary statistics of median_house_value after the filter above.
housing.median_house_value.describe()

In [None]:
# Same income-vs-value scatter, now drawn from the filtered frame.
housing.plot.scatter(x='median_income', y='median_house_value', grid=True, alpha=0.5)
plt.show()

In [None]:
# Number of missing values in each column.
housing.isnull().sum()

In [None]:
# Flag every row that has at least one missing value, then show only those rows.
mask = housing.isna().any(axis='columns')
housing.loc[mask]

In [None]:
# (row count, column count) of the rows selected by mask.
housing[mask].shape

In [43]:
# Fill missing total_bedrooms values with the column median.
median = housing['total_bedrooms'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the inplace form is chained assignment, which warns on
# pandas 2.2 and leaves `housing` unchanged under Copy-on-Write.
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(median)

In [None]:
# Re-display the rows selected by mask to inspect them after the fill.
housing[mask]

In [None]:
# Per-column count of missing values, re-checked after the fill.
housing.isnull().sum()

In [None]:
# Data scaling (standardization, normalization, log transform)
# Standardization: make each column have mean 0 and standard deviation 1 by
# subtracting the column mean and dividing by the column standard deviation.
# The categorical column and its integer-coded copy are dropped first because they are not numeric data.
housing_nume = housing.drop(columns=['ocean_proximity', 'ocean_p'])
housing_nume

In [None]:
# Per-column mean; despite the _df suffix this is a Series indexed by column name.
mean_df = housing_nume.mean()
mean_df

In [None]:
# Per-column standard deviation (pandas default: sample std, ddof=1).
std_df = housing_nume.std()
std_df

In [None]:
# Standardization: (value - column mean) / column std.
# The Series operands align on column labels.
housing_standard = (housing_nume - mean_df) / std_df
housing_standard

In [None]:
# Min-max normalization: rescale every column so its minimum is 0 and its maximum is 1.
min_df = housing_nume.min()
max_df = housing_nume.max()
housing_normal = (housing_nume - min_df) / (max_df - min_df)
housing_normal

In [None]:
# Summary statistics of the normalized frame (min and max per column can be checked here).
housing_normal.describe()

In [None]:
# Log transform
# Histograms of the selected columns before the transform.
housing_nume.hist(
    column=['total_rooms', 'population', 'households', 'median_income', 'median_house_value'],
    bins=50,
    figsize=(10, 9),
)
plt.show()

In [None]:
# Natural log of the selected columns, element-wise.
housing_log = np.log(
    housing_nume[['total_rooms', 'population', 'households', 'median_income', 'median_house_value']]
)
housing_log

In [None]:
# Histograms of the log-transformed columns, with the same bins and figure size as the pre-transform histograms.
housing_log.hist(bins=50, figsize=(10,9))
plt.show()