In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# ch2. exercise 1

In [14]:
# read csv
housing = pd.read_csv('housing.csv')

In [15]:
# 결측값 있는지 확인
housing.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [16]:
# 결측값 제거
housing = housing.dropna(axis=0)

In [17]:
# categorical variable의 범주 확인
housing.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [18]:
# categorical variable -> dummy variable로 만들기
enc = OneHotEncoder()

ocean_proximity = enc.fit_transform(np.array(housing.ocean_proximity).reshape(-1,1)).toarray()
ocean_proximity = pd.DataFrame(ocean_proximity)
ocean_proximity.columns = enc.categories_

# 만들어준 variable을 기존 데이터와 concat
housing = pd.concat([housing.reset_index(drop=True), ocean_proximity], axis=1)
housing = housing.drop('ocean_proximity', axis=1)

In [19]:
# 변수 추가
housing['rooms_per_households'] = housing.total_rooms/housing.households
housing['population_per_households'] = housing.population/housing.households

In [20]:
# train test split
train, test = train_test_split(housing, test_size=0.2, random_state=79)
# split 잘 되었는지 dimension 확인
print(train.shape, test.shape)

(16346, 16) (4087, 16)


In [21]:
# 독립변수와 종속변수(target)으로 나누기
train_x = train.drop('median_house_value', axis=1)
train_y = train.median_house_value

test_x = test.drop('median_house_value', axis=1)
test_y = test.median_house_value

In [22]:
# scaling
scaling = StandardScaler()
train_x_scaled = scaling.fit_transform(train_x)
test_x_scaled = scaling.transform(test_x)

In [23]:
# 하이퍼파라미터를 다양하게 조정한 여섯 가지 모델
svr_linear_10 = SVR(kernel='linear', C=10)
svr_linear_1 = SVR(kernel='linear')
svr_poly_10 = SVR(kernel='poly', C=10)
svr_poly_1 = SVR(kernel='poly')
svr_rbf_10 = SVR(kernel='rbf', C=10)
svr_rbf_1 = SVR(kernel='rbf')

models = [svr_linear_10, svr_linear_1, svr_poly_10, svr_poly_1, svr_rbf_10, svr_rbf_1]

# 모델을 넣으면 fit->pred->rmse를 계산해주는 함수
def fitting_svr(model) :
    fitted_model = model.fit(train_x_scaled, train_y)
    pred_y = model.predict(test_x_scaled)
    rmse = np.sqrt(mean_squared_error(test_y, pred_y))
    return rmse

# 여섯 가지 모델에 대한 rmse 구하기
for i in range(len(models)) :
    rmse = fitting_svr(models[i])
    print ('rmse of',i+1,'th model :',rmse)

rmse of 1 th model : 80975.45267658445
rmse of 2 th model : 110778.91954075838
rmse of 3 th model : 119620.85398652202
rmse of 4 th model : 120885.09660917179
rmse of 5 th model : 117136.81257662126
rmse of 6 th model : 120682.04730071164


# ch2. exercise 2

In [24]:
# 위에서 가장 좋았던 linear SVR에 대한 하이퍼 파라미터를 조정
param_grid = {'C':[0.1,0.3,0.5,0.7,0.9,1,3,5,7,9,11,13,15,17,19,25,30]}
randomized_cv = RandomizedSearchCV(SVR(kernel='linear'), param_grid, n_iter=5, n_jobs=-1, cv=3, random_state=79)

svr_linear_cv = randomized_cv.fit(train_x_scaled, train_y)
svr_linear_cv.best_params_

{'C': 19}

In [25]:
# 위에서 나온 best parameter로 fit->pred->rmse 구하기
fitting_svr(SVR(kernel='linear', C=list(svr_linear_cv.best_params_.values())[0]))

77347.73424206937

# ch3. exercise 1

In [26]:
# read data -> train test split
mnist = fetch_openml('mnist_784', version=1)
x, y = mnist["data"], mnist["target"]

train_x, test_x, train_y, test_y = x[:60000], x[60000:], y[:60000], y[60000:]

In [None]:
# knn classifier의 하이퍼 파라미터 : n_neighbors 조정
param_grid = {'n_neighbors':list(range(1,100))}
randomized_cv = RandomizedSearchCV(KNeighborsClassifier(), param_grid, n_iter=5, n_jobs=-1, cv=3, random_state=79)

knn_cv = randomized_cv.fit(train_x, train_y)
knn_cv.best_params_

In [None]:
# 위에서 나온 best parameter로 fit->pred->accuracy 구하기
knn_clf = KNeighborsClassifier(n_neighbors=list(knn_cv.best_params_.values())[0])
knn_fit = knn_clf.fit(train_x, train_y)
pred_y = knn_clf.predict(test_x)

accuracy_score(test_y, pred_y)

# ch3. exercise 3

In [2]:
# read csv
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [3]:
# set index
train.index = train['PassengerId']
train = train.drop('PassengerId', axis=1)

test.index = test['PassengerId']
test = test.drop('PassengerId', axis=1)

In [4]:
# train set 결측값 확인
train.isnull().sum()

Survived       0
Sex            0
Age            0
Fare           0
Pclass_1       0
Pclass_2       0
Pclass_3       0
Family_size    0
Title_1        0
Title_2        0
Title_3        0
Title_4        0
Emb_1          0
Emb_2          0
Emb_3          0
dtype: int64

In [5]:
# test set 결측값 확인
test.isnull().sum()

Survived       0
Sex            0
Age            0
Fare           0
Pclass_1       0
Pclass_2       0
Pclass_3       0
Family_size    0
Title_1        0
Title_2        0
Title_3        0
Title_4        0
Emb_1          0
Emb_2          0
Emb_3          0
dtype: int64

In [6]:
# 독립변수, 종속변수(target)로 나누기
train_x = train.drop('Survived', axis=1)
train_y = train.Survived

test_x = test.drop('Survived', axis=1)
test_y = test.Survived

In [7]:
# scaling
scaling = StandardScaler()
train_x_scaled = scaling.fit_transform(train_x)
test_x_scaled = scaling.transform(test_x)

In [11]:
# random forest classifier hyperparameter tuning
param_grid = {'n_estimators':list(range(1,200)),
             'max_depth':list(range(1,10)),
             'max_features':['auto','sqrt','log2']}
randomized_cv = RandomizedSearchCV(RandomForestClassifier(), param_grid, n_iter=5, n_jobs=-1, cv=3, random_state=79)

rf_cv = randomized_cv.fit(train_x_scaled, train_y)
rf_cv.best_params_

{'n_estimators': 136, 'max_features': 'log2', 'max_depth': 9}

In [12]:
# 위에서 나온 best parameter로 fit->pred->accuracy 구하기
rf_clf = RandomForestClassifier(n_estimators=list(rf_cv.best_params_.values())[0],
                               max_features=list(rf_cv.best_params_.values())[1],
                               max_depth=list(rf_cv.best_params_.values())[2])
rf_fit = rf_clf.fit(train_x_scaled, train_y)
pred_y = rf_clf.predict(test_x_scaled)

accuracy_score(test_y, pred_y)

0.83