In [1]:
import seaborn as sns
# Load seaborn's "planets" example dataset into df.
df = sns.load_dataset('planets')

In [2]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [3]:
# First quartile (25th percentile) of orbital_period.
df.orbital_period.quantile(q=0.25)

5.4425405

In [4]:
# Lower and upper quartiles of orbital_period, taken from a single quantile call.
Q25, Q75 = df['orbital_period'].quantile([0.25, 0.75])

In [5]:
# Outlier fences: 1.5 * IQR below the lower quartile and above the upper quartile.
IQR = Q75 - Q25
fence = 1.5 * IQR
min_val, max_val = Q25 - fence, Q75 + fence

In [6]:
# Display the lower and upper outlier fences.
min_val, max_val

(-775.4011487500002, 1306.84868925)

In [14]:
# Keep only the rows whose orbital_period lies on or beyond either fence.
period = df['orbital_period']
below_fence = period.le(min_val)
above_fence = period.ge(max_val)
df_outlier = df.loc[below_fence | above_fence]

In [15]:
# Row counts: full frame vs. rows flagged as outliers.
df.shape[0], df_outlier.shape[0]

(1035, 126)

In [17]:
# Inspect the orbital_period values of the flagged rows.
df_outlier['orbital_period']

6        1773.4
14       2391.0
15      14002.0
19       4909.0
32      10220.0
         ...   
920      3500.0
921      1825.0
922      5100.0
937    730000.0
944     36525.0
Name: orbital_period, Length: 126, dtype: float64

### 3. 결측치 처리 (대체)

In [18]:
# Load seaborn's "penguins" example dataset; this rebinds df, replacing the planets frame.
df = sns.load_dataset('penguins')

In [19]:
# Count missing values per column.
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [20]:
# Impute missing values so every later step sees a complete frame.
# NOTE(review): the fill statistics are computed on the full dataset, i.e. before the
# train/test split further down; impute after the split if that leakage matters.
missing = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
# Numeric measurements: fill each column with its own median in one vectorised call.
df[missing] = df[missing].fillna(df[missing].median())
# Categorical sex: fill with the most frequent observed value instead of a
# hard-coded label, so the choice follows the data.
df['sex'] = df['sex'].fillna(df['sex'].mode().iloc[0])

In [21]:
# Re-count missing values per column to confirm none remain.
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [24]:
# Preview the first five rows after imputation.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,44.45,17.3,197.0,4050.0,Male
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [25]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the string columns. One fitted encoder is kept per column so the
# codes (and later predictions of `species`) can be mapped back to the original
# labels with encoders[col].inverse_transform(...).
label = ['species', 'island', 'sex']
encoders = {}
for col in label:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

In [26]:
# Preview the first five rows after label encoding.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
3,0,2,44.45,17.3,197.0,4050.0,1
4,0,2,36.7,19.3,193.0,3450.0,0


In [29]:
import pandas as pd

# Cast the integer-coded nominal features to the pandas category dtype.
category = ['island', 'sex']
df = df.astype(dict.fromkeys(category, 'category'))

In [30]:
# One-hot encode the categorical columns. The dummy dtype is pinned because the
# pandas default changed from uint8 to bool in 2.0; uint8 keeps the 0/1 columns
# identical across pandas versions.
df = pd.get_dummies(df, dtype='uint8')

In [31]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,0,44.45,17.3,197.0,4050.0,0,0,1,0,1
4,0,36.7,19.3,193.0,3450.0,0,0,1,1,0


In [34]:
# Derived feature: quantile bin of body mass (five bins, integer bin codes instead of interval labels).
df['body_mass_g_qcut'] = pd.qcut(df['body_mass_g'], q=5, labels=False)
df.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1,body_mass_g_qcut
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0,1
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0,0
3,0,44.45,17.3,197.0,4050.0,0,0,1,0,1,2
4,0,36.7,19.3,193.0,3450.0,0,0,1,1,0,0


In [35]:
# Number of rows that fell into each quantile bin.
df.body_mass_g_qcut.value_counts()

0    71
1    70
2    68
4    68
3    67
Name: body_mass_g_qcut, dtype: int64

In [37]:
# Scaling: min-max scale the continuous measurement columns.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test split
# further down, so test rows influence the min/max — consider fitting on training rows only.
from sklearn.preprocessing import MinMaxScaler

scaler_list = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
scaler = MinMaxScaler()
# Fit and apply in one step; `scaler` stays fitted for later reuse.
df[scaler_list] = scaler.fit_transform(df[scaler_list])

In [38]:
# Preview the first five rows after scaling.
df.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1,body_mass_g_qcut
0,0,0.254545,0.666667,0.152542,0.291667,0,0,1,0,1,1
1,0,0.269091,0.511905,0.237288,0.305556,0,0,1,1,0,1
2,0,0.298182,0.583333,0.389831,0.152778,0,0,1,1,0,0
3,0,0.449091,0.5,0.423729,0.375,0,0,1,0,1,2
4,0,0.167273,0.738095,0.355932,0.208333,0,0,1,1,0,0


In [41]:
from sklearn.model_selection import train_test_split

# Select the target and the features by column name rather than by position, so the
# split stays correct even if the column order changes upstream.
target = df['species']
features = df.drop(columns='species')
# Stratify on the target so each class keeps its share in both splits;
# random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=1
)

In [42]:
# Display the (rows, columns) shape of the training features.
x_train.shape

(275, 10)

In [43]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters. The seed is fixed so that re-running
# the notebook yields the same fitted model and the same predictions.
model1 = RandomForestClassifier(random_state=1)
model1.fit(x_train, y_train)
pred1 = model1.predict(x_test)

In [44]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default hyperparameters. The seed is fixed so that re-running
# the notebook yields the same fitted model and the same predictions.
model2 = AdaBoostClassifier(random_state=1)
model2.fit(x_train, y_train)
pred2 = model2.predict(x_test)

In [45]:
# Display the test-set predictions of both models side by side.
pred1, pred2

(array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
        0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
        2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
        2, 0, 1]),
 array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
        0, 1, 1, 0, 0, 0, 1, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
        2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
        2, 0, 1]))

In [46]:
# Ensemble
from sklearn.ensemble import VotingClassifier

# With only two voters, hard (majority) voting ties whenever the models disagree, and
# scikit-learn then falls back to the lowest class label. Soft voting averages the
# predicted class probabilities instead, so a disagreement is settled by confidence.
clf = VotingClassifier(estimators=[('rf', model1), ('ad', model2)], voting='soft')
clf.fit(x_train, y_train)
pred3 = clf.predict(x_test)

In [47]:
# Display the ensemble's test-set predictions.
pred3

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [48]:
from sklearn.metrics import accuracy_score

# Test-set accuracy for pred1, pred2 and pred3, in that order.
tuple(accuracy_score(y_test, pred) for pred in (pred1, pred2, pred3))

(1.0, 0.9855072463768116, 1.0)

In [52]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [50, 100], 'max_depth': [4, 6]}
# Fix the forest's seed so the cross-validated scores, and therefore best_params_,
# are repeatable between runs.
model4 = RandomForestClassifier(random_state=1)
# NOTE: this rebinds `clf`, which previously held the voting ensemble.
clf = GridSearchCV(estimator=model4, param_grid=parameters, cv=3)
clf.fit(x_train, y_train)
clf.best_params_

{'max_depth': 4, 'n_estimators': 100}

In [55]:
# Save the ensemble predictions keyed by the test rows' original index labels.
submission = pd.DataFrame({'id': y_test.index, 'pred': pred3})
submission.to_csv('test.csv', index=False)

In [56]:
# Read the saved file back to check what was written.
check = pd.read_csv('test.csv')

In [57]:
# Preview the first five rows of the file that was read back.
check.head(n=5)

Unnamed: 0,id,pred
0,57,0
1,173,1
2,213,1
3,50,0
4,25,0
