<a href="https://colab.research.google.com/github/ttogle918/ds-sa-problemsolving/blob/main/refactoring_optimize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

참고 : 
https://deep-diver.github.io/pocket-ml-reference-korean/chapter3/

### 데이터의 수집

In [1]:
import pandas as pd
url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
df = pd.read_excel(url)
orig_df = df
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [2]:
mask = df.isnull().any(axis=1)
mask.head()

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [3]:
df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [4]:
df.sex.value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [5]:
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

### 특징 생성

In [6]:
name = df.name
name.head(3)

0     Allen, Miss. Elisabeth Walton
1    Allison, Master. Hudson Trevor
2      Allison, Miss. Helen Loraine
Name: name, dtype: object

In [7]:
df = df.drop(
    columns=["name",
             "ticket",
             "home.dest",
             "boat",
             "body",
             "cabin"]
    )
df.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked'],
      dtype='object')

In [8]:
y = df.survived
X = df.drop(columns="survived")
print(X, y)

      pclass     sex      age  sibsp  parch      fare embarked
0          1  female  29.0000      0      0  211.3375        S
1          1    male   0.9167      1      2  151.5500        S
2          1  female   2.0000      1      2  151.5500        S
3          1    male  30.0000      1      2  151.5500        S
4          1  female  25.0000      1      2  151.5500        S
...      ...     ...      ...    ...    ...       ...      ...
1304       3  female  14.5000      1      0   14.4542        C
1305       3  female      NaN      1      0   14.4542        C
1306       3    male  26.5000      0      0    7.2250        C
1307       3    male  27.0000      0      0    7.2250        C
1308       3    male  29.0000      0      0    7.8750        S

[1309 rows x 7 columns] 0       1
1       1
2       0
3       0
4       0
       ..
1304    0
1305    0
1306    0
1307    0
1308    0
Name: survived, Length: 1309, dtype: int64


### 샘플 데이터

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### 리팩터링

In [21]:
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
def tweak_titanic(df):
  df = df.drop(
    columns=[
      "name",
      "ticket",
      "home.dest",
      "boat",
      "body",
      "cabin",
    ]
  ).pipe(pd.get_dummies, drop_first=True)
  return df

def get_train_test_X_y(df, y_col, size=0.3, std_cols=None):
  y = df[y_col]
  X = df.drop(columns=y_col)
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=size, random_state=42
  )
  cols = X.columns
  num_cols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
  ]

  fi = IterativeImputer()

  X_train.loc[:, num_cols] = fi.fit_transform(X_train[num_cols])
  X_test.loc[:, num_cols] = fi.transform(X_test[num_cols])

  if std_cols:
    std = preprocessing.StandardScaler()
    X_train.loc[:, std_cols] = std.fit_transform(X_train[std_cols])
    X_test.loc[:, std_cols] = std.transform(X_test[std_cols])

  return X_train, X_test, y_train, y_test  

In [22]:
ti_df = tweak_titanic(orig_df)
std_cols = "pclass,age,sibsp,fare".split(",")

X_train, X_test, y_train, y_test = get_train_test_X_y(ti_df, "survived", std_cols=std_cols)

### 모델 만들기

In [23]:
from sklearn import ensemble

rf = ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

### 모델의 평가

In [24]:
rf.score(X_test, y_test)

0.7837150127226463

In [25]:
from sklearn import metrics
metrics.precision_score(y_test, rf.predict(X_test))

0.7916666666666666

In [26]:
for col, val in sorted(zip(X_train.columns, 
                           rf.feature_importances_), 
                       key=lambda x: x[1],
                       reverse=True)[:5]:
  print(f"{col:10}{val:10.3f}")

age            0.285
fare           0.262
sex_male       0.241
pclass         0.089
sibsp          0.050


### 모델의 최적화

In [27]:
from sklearn import model_selection

rf4 = ensemble.RandomForestClassifier()
params = {
  "max_features": [0.4, "auto"],
  "n_estimators": [15, 200],
  "min_samples_leaf": [1, 0.1],
  "random_state": [42],
}

cv = model_selection.GridSearchCV(rf4, params, n_jobs=-1).fit(X_train, y_train)
print(cv.best_params_)

{'max_features': 0.4, 'min_samples_leaf': 1, 'n_estimators': 200, 'random_state': 42}


In [28]:
rf5 = ensemble.RandomForestClassifier(
  **{
  "max_features": "auto",
  "min_samples_leaf": 0.1,
  "n_estimators": 200,
  "random_state": 42,
  }
)

rf5.fit(X_train, y_train)
rf5.score(X_test, y_test)

0.7073791348600509