# BIG DATA ANALYTICS PROGRAMMING : Regression Task
### Regression(회귀) 문제를 처음부터 끝까지 다뤄 봅니다
---
References
- https://github.com/rickiepark/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb

## 1. Load Dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the housing data from a CSV path that is relative to the working directory.
df = pd.read_csv("data/housing.csv")

## 2. Data에 대한 기본적인 정보 확인

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['ocean_proximity'].value_counts()

In [None]:
df.describe()


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Plot 50-bin histograms of the DataFrame's columns on a 20x15 figure.
df.hist(bins=50, figsize=(20,15))
plt.show()

## 3. 미리 훈련/테스트 데이터셋 나누기

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split


In [None]:

# Bucket median_income into five labelled categories (1-5); this column is
# what the stratified split below stratifies on.
df["income_cat"] = pd.cut(
    df["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, float("inf")],
    labels=list(range(1, 6)),
)

In [None]:
df.head()

In [None]:
df["income_cat"].value_counts()

In [None]:
df["income_cat"].hist()

In [None]:
# Purely random 80/20 split with a fixed seed; its test part is compared
# against the stratified test set further down.
train_set_random, test_set_random = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Single stratified 80/20 split on income_cat.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the only (train, test) pair directly instead of looping.
train_index, test_index = next(split.split(df, df["income_cat"]))
# The splitter yields *positional* row numbers, so index with .iloc; .loc only
# gives the same rows while df still has its default 0..n-1 index.
# Copy so later in-place edits of the subsets stay independent of df.
strat_train_set = df.iloc[train_index].copy()
strat_test_set = df.iloc[test_index].copy()

In [None]:
strat_test_set["income_cat"].value_counts() / len(strat_test_set)


In [None]:
test_set_random['income_cat'].value_counts() / len(test_set_random)


In [None]:
df["income_cat"].value_counts() / len(df)


In [None]:
def income_cat_proportions(data):
    """Return each income category's share of the rows in ``data``.

    The counts come from ``value_counts`` on the ``income_cat`` column and are
    divided by the total row count ``len(data)``.
    """
    category_counts = data["income_cat"].value_counts()
    total_rows = len(data)
    return category_counts / total_rows

# Category shares for the full data, the stratified test set and the random
# test set, side by side and ordered by category.
compare_props = pd.DataFrame(
    {
        "Overall": income_cat_proportions(df),
        "Stratified": income_cat_proportions(strat_test_set),
        "Random": income_cat_proportions(test_set_random),
    }
).sort_index()
# Relative deviation, in percent, of each test set's share from the overall share.
for source_col, error_col in (("Random", "Rand. %error"), ("Stratified", "Strat. %error")):
    compare_props[error_col] = 100 * compare_props[source_col] / compare_props["Overall"] - 100


In [None]:
compare_props

In [None]:
# income_cat was used to stratify the split; remove it in place from both subsets.
strat_train_set.drop(columns="income_cat", inplace=True)
strat_test_set.drop(columns="income_cat", inplace=True)


In [None]:
# From here on df is a copy of the stratified training set, so columns added
# during exploration do not end up in strat_train_set.
df = strat_train_set.copy()

## 4. 탐색적 데이터 분석

In [None]:
# Scatter plot of the rows by position: longitude on x, latitude on y.
df.plot(kind="scatter", x="longitude", y="latitude")


In [None]:
# Same plot with mostly transparent markers (alpha=0.1) so overlapping points
# appear darker.
df.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)


In [None]:
# Marker size follows population (divided by 100) and marker colour follows
# median_house_value on the "jet" colormap.
df.plot.scatter(
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=df["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    sharex=False,
)
plt.legend()


In [None]:

import matplotlib.image as mpimg
# Background image, read from a path relative to the working directory.
california_img=mpimg.imread("data/california.png")
# Same population/price scatter as before, but without the automatic colorbar:
# a custom one is built below.
ax = df.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7),
                       s=df['population']/100, label="Population",
                       c="median_house_value", cmap=plt.get_cmap("jet"),
                       colorbar=False, alpha=0.4,
                      )
# Draw the image over a fixed longitude/latitude box, half transparent.
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

# Eleven evenly spaced prices between the minimum and maximum house value.
prices = df["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
# Ticks are placed at the prices divided by the maximum price and labelled
# with the price in thousands of dollars.
cbar = plt.colorbar(ticks=tick_values/prices.max())
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

plt.legend(fontsize=16)
plt.show()

In [None]:
# Correlate numeric columns only: since pandas 2.0 DataFrame.corr() raises on
# string columns such as ocean_proximity (older releases silently skipped
# them). select_dtypes works on both old and new pandas.
corr_matrix = df.select_dtypes(include="number").corr()


In [None]:
corr_matrix["median_house_value"].sort_values(ascending=False)


In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter-plot matrix of the target and three selected attributes.
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(df[attributes], figsize=(12, 8))


In [None]:

# median_income against median_house_value, with the axes fixed to
# x in [0, 16] and y in [0, 550000].
df.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1)
plt.axis([0, 16, 0, 550000])

## 5. 추가 속성 생성

In [None]:
# Add three ratio features computed from existing columns.
df["rooms_per_household"] = df["total_rooms"].div(df["households"])
df["bedrooms_per_room"] = df["total_bedrooms"].div(df["total_rooms"])
df["population_per_household"] = df["population"].div(df["households"])

In [None]:
# Recompute the correlations now that the ratio features exist. Numeric
# columns only: since pandas 2.0 DataFrame.corr() raises on string columns
# such as ocean_proximity.
corr_matrix = df.select_dtypes(include="number").corr()
# Correlation of every numeric column with the target, strongest positive first.
corr_matrix["median_house_value"].sort_values(ascending=False)


In [None]:
# Scatter of the derived bedrooms_per_room ratio against the target.
df.plot(kind="scatter", x="bedrooms_per_room", y="median_house_value",
             alpha=0.2)
plt.show()

In [None]:
df.describe()


## 6. 데이터 전처리

### 6-1. Label 분리 및 결측값 핸들링

In [None]:
# Separate predictors from the target: df keeps the feature columns and
# df_labels holds a copy of the median_house_value column.
df = strat_train_set.drop(columns="median_house_value")
df_labels = strat_train_set["median_house_value"].copy()


In [None]:
# Keep the first few rows that have at least one missing value, as a small
# sample for comparing the missing-value options below.
sample_incomplete_rows = df[df.isnull().any(axis=1)].head()
sample_incomplete_rows

In [None]:
sample_incomplete_rows.dropna(subset=["total_bedrooms"])    # option 1: drop rows missing total_bedrooms


In [None]:
sample_incomplete_rows.drop("total_bedrooms", axis=1)       # option 2: drop the whole column


In [None]:
# Option 3: fill missing total_bedrooms with the training-set median.
median = df["total_bedrooms"].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# leaves df unchanged once copy-on-write is active.
df["total_bedrooms"] = df["total_bedrooms"].fillna(median)


In [None]:
df.info()

### 6-2. Categorical 데이터 인코딩

In [None]:
# Double brackets keep the single categorical column as a DataFrame.
df_cat = df[["ocean_proximity"]]
df_cat.head(10)


In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Fit an OrdinalEncoder on the categorical column and show the first ten
# encoded rows.
ordinal_encoder = OrdinalEncoder()
df_cat_encoded = ordinal_encoder.fit_transform(df_cat)
df_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit a OneHotEncoder on the same column; later cells convert the result with
# .toarray() and reuse cat_encoder for the test data.
cat_encoder = OneHotEncoder()
df_cat_1hot = cat_encoder.fit_transform(df_cat)
df_cat_1hot


In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and fall back to the old method only on releases
# that predate it. The `or` keeps the legacy attribute from being looked up
# when it no longer exists.
(getattr(cat_encoder, "get_feature_names_out", None) or cat_encoder.get_feature_names)()

In [None]:
df_cat_1hot.toarray()


In [None]:
# Append one indicator column per one-hot category to df.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and fall back only on releases that predate it.
# The column names created here must be produced the same way wherever these
# columns are rebuilt for other data.
feature_names = (getattr(cat_encoder, "get_feature_names_out", None)
                 or cat_encoder.get_feature_names)()
# Convert to a dense array once up front rather than once per category.
dense_1hot = df_cat_1hot.toarray()
for index, category in enumerate(feature_names):
    print(index)
    print(category)
    df[category] = dense_1hot[:, index]

In [None]:
df.head()

In [None]:
# Drop the raw text column now that its indicator columns exist; the bare
# name on the last line displays the resulting frame.
organized_df = df.drop(columns="ocean_proximity")
organized_df

### 6-3. Numerical 데이터 정규화

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training features and scale them in one step;
# y is the label column as a plain array.
X = scaler.fit_transform(organized_df)
y = df_labels.values

In [None]:
X

In [None]:
y

## 7. 정리된 데이터셋을 확인하기 위한 간단한 모델 적용

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


In [None]:
from sklearn.linear_model import LinearRegression


In [None]:
# Baseline model: fit a LinearRegression on the scaled training features X
# and the labels y.
reg = LinearRegression()
reg.fit(X, y)


### 7-1. 테스트 데이터셋에 전처리 적용

In [None]:
def organizing(encoder, scaler, data):
    """Apply an already fitted one-hot encoder and scaler to a feature frame.

    Args:
        encoder: Fitted encoder. Must provide ``transform`` (whose result has
            ``toarray()``) and either ``get_feature_names_out()`` or the
            legacy ``get_feature_names()``.
        scaler: Fitted scaler providing ``transform``.
        data: DataFrame with an ``ocean_proximity`` column. It is left
            unmodified; all changes are made on a copy.

    Returns:
        The result of ``scaler.transform`` on the frame in which
        ``ocean_proximity`` has been replaced by one column per encoder
        feature name.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use the new method
    # and fall back only on releases that predate it.
    names_of = getattr(encoder, "get_feature_names_out", None) or encoder.get_feature_names
    # Work on a copy so the caller's frame keeps its original columns.
    frame = data.copy()
    # Encode once up front rather than once per category.
    onehot = encoder.transform(frame[["ocean_proximity"]]).toarray()
    for index, category in enumerate(names_of()):
        frame[category] = onehot[:, index]
    frame = frame.drop("ocean_proximity", axis=1)
    return scaler.transform(frame)

    

In [None]:
# Split the held-out test set into labels (test_y) and features (test_X).
test_y = strat_test_set["median_house_value"]
test_X = strat_test_set.drop(columns="median_house_value")


In [None]:
test_X.info()

In [None]:
# Fill the test set with the median computed on the training data.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# leaves test_X unchanged once copy-on-write is active.
test_X["total_bedrooms"] = test_X["total_bedrooms"].fillna(median)
test_X.info()

In [None]:
# Encode and scale the test features with the encoder and scaler that were
# fitted on the training data; test_X is rebound to the returned result.
test_X = organizing(cat_encoder, scaler, test_X)
print(test_X)

### 7-2. 예측

In [None]:
pred_y = reg.predict(test_X)

In [None]:
# Test-set RMSE: square root of the mean squared error.
mse = mean_squared_error(test_y, pred_y)
rmse = np.sqrt(mse)
print(rmse)


In [None]:
# Test-set mean absolute error for the same predictions.
mae = mean_absolute_error(test_y, pred_y)
print(mae)

## 8. 최적의 모델 찾기

In [None]:
from sklearn.utils import all_estimators

# Collect one default-constructed instance of every regressor that
# scikit-learn lists.
estimators = all_estimators(type_filter='regressor')

all_regs = []
for name, RegressorClass in estimators:
    try:
        # Named `candidate`, not `reg`, so the fitted baseline model bound to
        # `reg` is not overwritten by an unfitted estimator.
        candidate = RegressorClass()
    except Exception as exc:
        # Some classes cannot be built without arguments; report and move on
        # instead of hiding the failure behind a bare `except: pass`.
        print('Skipping', name, '-', exc)
        continue
    all_regs.append(candidate)
    print('Appending', name)

In [None]:
results = []

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# 10-fold cross-validated RMSE of a random forest on the training data.
rfr = RandomForestRegressor(random_state=42)
# The scorer returns negated MSE values, so flip the sign before the root.
scores = np.sqrt(
    -cross_val_score(rfr, X, y, scoring="neg_mean_squared_error", cv=10)
)

# Per-fold scores, their mean and their standard deviation.
for caption, value in (
    ("점수:", scores),
    ("평균:", scores.mean()),
    ("표준 편차:", scores.std()),
):
    print(caption, value)

In [None]:
# Regressors left out of the sweep because they are flagged as very slow.
SUPER_SLOW_REGRESSION = ["GaussianProcessRegressor","KernelRidge"]
# Loop variable is `candidate`, not `reg`, so the sweep does not overwrite
# whatever model `reg` currently refers to.
for candidate in all_regs:
    reg_name = candidate.__class__.__name__
    if reg_name in SUPER_SLOW_REGRESSION:
        continue
    try:
        # 5-fold CV; the scorer returns negated MSE, hence the sign flip.
        scores = cross_val_score(candidate, X, y, scoring="neg_mean_squared_error", cv=5)
        scores = np.sqrt(-scores)
    except Exception as exc:
        # Catch Exception rather than using a bare `except`, so interrupting
        # the kernel still stops the sweep, and report what failed.
        print("{}: skipped ({})".format(reg_name, exc))
        continue

    mean_rmse = scores.mean()
    # Skip unusable results (zero or NaN/inf mean) but keep evaluating the
    # remaining models; a `break` here would silently end the whole sweep.
    if not mean_rmse or not np.isfinite(mean_rmse):
        continue
    print("{}: RMSE {}".format(reg_name, mean_rmse))
    results.append({"Name": reg_name, "RMSE": mean_rmse})
    
    

In [None]:
# Turn the collected results into a table with the columns Name and RMSE.
result_df = pd.DataFrame(results)
result_df

In [None]:
result_df.sort_values(by="RMSE")

## 9. 모델 세부 튜닝

In [None]:

from sklearn.model_selection import GridSearchCV

# 5 x 4 grid over n_estimators and max_features.
param_grid = [
    {
        'n_estimators': [50, 70, 100, 120, 150],
        'max_features': [2, 4, 6, 8],
    },
]

forest_reg = RandomForestRegressor(random_state=42)

# 5-fold search scored on negated MSE; train scores are kept as well.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    return_train_score=True,
)
grid_search.fit(X, y)

In [None]:
grid_search.best_params_

In [None]:
# Build the final model from the grid search winner instead of hard-coding
# hyper-parameters, which go stale whenever the grid or the data changes.
reg = RandomForestRegressor(random_state=42, **grid_search.best_params_)

In [None]:
reg.fit(X,y)

In [None]:
pred_y = reg.predict(test_X)

In [None]:
# Test-set RMSE and MAE for the current predictions in pred_y.
mse = mean_squared_error(test_y, pred_y)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_y, pred_y)
print("RMSE {}, MAE {}".format(rmse,mae))

## Q. 중요하지 않은 속성 제거 뒤 다시 해보기!

In [None]:
# Importances follow the column order of the matrix the forest was fitted on,
# which was built from organized_df (df without ocean_proximity). Pairing them
# with df.columns would shift every name after ocean_proximity by one position
# and drop the last feature.
feature_importances = grid_search.best_estimator_.feature_importances_
print(feature_importances)
features_with_importance = zip(organized_df.columns, feature_importances)
# Most important feature first.
sorted(features_with_importance, key=lambda f: f[1], reverse=True)