# 캘리포니아 집값 예측(회귀)

## 데이터 불러오기

In [4]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

from plotly.subplots import make_subplots

In [5]:
df = pd.read_csv("../data/housing.csv") # Load the housing CSV; the result is expected to be a pandas DataFrame.

In [6]:
# Show the (rows, columns) size of the loaded frame.
df.shape

(20640, 10)

In [7]:
# Print each column's non-null count and dtype, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [9]:
# Count the missing values in each column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## 데이터 확인
- 데이터 전체를 출력하는 이미지 필요
    1. 3 by 3 레이아웃
    2. 히스토그램 생성
<br /><br />
- longitude
- latitude
- housing_median_age
- total_rooms
- total_bedrooms
- population
- households
- median_income
- median_house_value
<br /><br />
- ocean_proximity

In [10]:
# Numeric columns to plot, listed in row-major order of the 3x3 grid.
hist_columns = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
]
grid_cols = 3

fig = make_subplots(
    rows=3,
    cols=grid_cols,
    subplot_titles=tuple(f"<i>{column}</i>" for column in hist_columns),
)

# One histogram per column; divmod turns the flat position into a
# zero-based (row, col) pair, which is shifted to plotly's 1-based grid.
for position, column in enumerate(hist_columns):
    grid_row, grid_col = divmod(position, grid_cols)
    fig.add_trace(
        go.Histogram(x=df[column], name=column),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(template="plotly_dark", title_text="<b>Distribution")

## 데이터 전처리

In [11]:
# Feature matrix: the eight numeric predictor columns.
X = df[
    [
        "longitude",
        "latitude",
        "housing_median_age",
        "total_rooms",
        "total_bedrooms",
        "population",
        "households",
        "median_income",
    ]
]
# Regression target.
y = df["median_house_value"]

# Shuffle and hold out 30% of the rows for testing; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [1]:
class DataPreprocessing:
    """Replace IQR outliers and fill missing values in the numeric columns.

    ``fit`` learns per-column statistics (first/third quartile, median and
    mean) from a DataFrame. ``transform`` then applies them to any DataFrame
    that has the same columns:

    * values above ``Q3 + 1.5 * IQR`` are replaced with ``Q3``;
    * values below ``Q1 - 1.5 * IQR`` are replaced with ``Q1``;
    * remaining missing values are filled with the fitted column mean.
    """

    # Numeric columns handled by this preprocessor. The attribute name
    # (spelled "quantitave") is kept unchanged so existing references work.
    quantitave = ["longitude", "latitude", "housing_median_age",
                  "total_rooms", "total_bedrooms", "population",
                  "households", "median_income"]

    def __init__(self):
        # Per-column statistics (pandas Series indexed by column name).
        # All of them stay None until fit() has been called.
        self.q_25 = None
        self.q_75 = None
        self.means = None
        self.medians = None

    def fit(self, X, y=None):
        """Learn the quartiles, medians and means of the numeric columns.

        Args:
            X: DataFrame containing every column listed in ``quantitave``.
            y: Unused; accepted so the signature matches the usual
                ``fit(X, y)`` call shape.

        Returns:
            The fitted instance itself, so calls can be chained.
        """
        numeric = X[DataPreprocessing.quantitave]
        self.q_25 = numeric.quantile(q=0.25)
        self.medians = numeric.quantile(q=0.5)
        self.q_75 = numeric.quantile(q=0.75)
        self.means = numeric.mean()
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X`` using the statistics from ``fit``.

        Args:
            X: DataFrame containing every column listed in ``quantitave``.
                It is not modified.

        Returns:
            A new DataFrame with outliers replaced and missing values filled
            in the numeric columns; other columns are copied unchanged.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if self.q_25 is None or self.q_75 is None or self.means is None:
            raise RuntimeError(
                "DataPreprocessing.transform() called before fit()"
            )

        # Work on a copy so the caller's frame (often a slice of a larger
        # frame) is never mutated.
        result = X.copy()

        for col in DataPreprocessing.quantitave:
            q_1 = self.q_25[col]
            q_3 = self.q_75[col]
            iqr = q_3 - q_1
            upper_bound = q_3 + 1.5 * iqr
            lower_bound = q_1 - 1.5 * iqr

            values = result[col]
            # The two conditions cannot overlap, so both are evaluated
            # against the untouched column values. NaN compares False on
            # both sides and is left for the fill step below.
            values = values.mask(values > upper_bound, q_3)
            values = values.mask(result[col] < lower_bound, q_1)

            result[col] = values.fillna(self.means[col])

        return result

In [2]:
# Fit the preprocessor on the feature frame and print the learned first quartiles.
# NOTE(review): this cell needs `X` to exist in the session. The recorded
# NameError suggests it was run before the cell that defines `X` — TODO confirm and re-run.
ds = DataPreprocessing()
ds.fit(X)
print(ds.q_25)

NameError: name 'X' is not defined

## 데이터 분리 for 머신러닝

In [37]:
%%time
# Time building a 5000-element list with an explicit loop and append().
lst = []
for i in range(5000):
    lst.append(i)

CPU times: total: 0 ns
Wall time: 1.02 ms


In [38]:
%%time
# Time building a list with a comprehension.
# NOTE(review): this uses range(50000) while the loop cell above uses range(5000),
# so the two timings are not a like-for-like comparison — confirm the intended size.
[i for i in range(50000)]

CPU times: total: 0 ns
Wall time: 997 µs


[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


## 데이터 학습 및 검증