In [2]:
# 1 Data import
import pandas as pd
import seaborn as sns

# Load seaborn's "penguins" example dataset and preview the first rows.
df = sns.load_dataset('penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
# 2 Missing values: count the nulls in every column
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [4]:
# Numeric measurement columns with gaps: fill each one with its own median.
missing = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
df[missing] = df[missing].fillna(df[missing].median())

# Rows with unknown sex are imputed with the fixed label 'Female'.
df['sex'] = df['sex'].fillna(value='Female')



In [5]:
# Re-count nulls per column to confirm the imputation left no gaps.
df.isnull().sum()


species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [6]:
# 3 label encoding
from sklearn.preprocessing import LabelEncoder

label = ['species', 'island', 'sex']

# Keep one fitted encoder per column instead of a single throwaway instance,
# so the integer codes (notably the `species` target) can later be mapped back
# to the original strings with encoders[col].inverse_transform(...).
encoders = {}
for col in label:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

In [7]:
# Preview the frame now that species, island and sex hold integer codes.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
3,0,2,44.45,17.3,197.0,4050.0,0
4,0,2,36.7,19.3,193.0,3450.0,0


In [8]:
# Inspect the column dtypes after label encoding.
df.dtypes

species                int64
island                 int64
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                    int64
dtype: object

In [9]:
# 4 data dummy
import pandas as pd

# Mark the integer-coded island and sex columns as categoricals so that
# get_dummies expands them; the other columns keep their numeric dtypes.
category = ['island', 'sex']
df = df.astype({col: 'category' for col in category})

# dtype=int requests 0/1 integer indicator columns.
df = pd.get_dummies(df, dtype=int)

In [10]:
# Inspect the column dtypes after the island/sex columns were expanded into indicators.
df.dtypes

species                int64
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
island_0               int64
island_1               int64
island_2               int64
sex_0                  int64
sex_1                  int64
dtype: object

In [11]:
# Preview the frame with the island_* and sex_* indicator columns.
df.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,0,44.45,17.3,197.0,4050.0,0,0,1,1,0
4,0,36.7,19.3,193.0,3450.0,0,0,1,1,0


In [12]:
# Derived feature: bin body mass into 5 quantile buckets, stored as integer bin codes.
mass_bins = pd.qcut(df['body_mass_g'], q=5, labels=False)
df['body_mass_g_qcut'] = mass_bins
df.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1,body_mass_g_qcut
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0,1
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0,0
3,0,44.45,17.3,197.0,4050.0,0,0,1,1,0,2
4,0,36.7,19.3,193.0,3450.0,0,0,1,1,0,0


In [13]:
# Frequency of each distinct body-mass value.
df['body_mass_g'].value_counts()

body_mass_g
3800.0    12
3700.0    11
3900.0    10
3950.0    10
3550.0     9
          ..
4475.0     1
3975.0     1
3575.0     1
3850.0     1
5750.0     1
Name: count, Length: 94, dtype: int64

In [14]:
# 5 scale
from sklearn.preprocessing import MinMaxScaler

# Min-max rescale the four continuous measurement columns in place.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows influence the fitted min/max; consider fitting on the
# training rows only.
col_scale = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
min_scaler = MinMaxScaler().fit(df[col_scale])
df[col_scale] = min_scaler.transform(df[col_scale])


In [15]:
# 6 data split
from sklearn.model_selection import train_test_split

# Select the target by name rather than by column position, so the split
# stays correct even if the column order changes upstream.
features = df.drop(columns='species')
target = df['species']

# Hold out 20% of the rows, stratified on species; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=1
)

In [16]:
# Report the shape of every split, one line per array.
shape_report = [
    f"X_train, {X_train.shape}",
    f"X_test, {X_test.shape}",
    f"y_train, {y_train.shape}",
    f"y_test, {y_test.shape}",
]
print("\n".join(shape_report))


X_train, (275, 10)
X_test, (69, 10)
y_train, (275,)
y_test, (69,)


In [17]:
# 7 model train 1
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's internal randomness so the fitted model and
# its predictions are reproducible across runs.
model1 = RandomForestClassifier(max_depth=6, n_estimators=100, random_state=1)
model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)
pred1


array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [18]:
# 9 model train 2
from sklearn.ensemble import AdaBoostClassifier

# random_state makes the boosted model reproducible across runs.
model2 = AdaBoostClassifier(random_state=1)
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)
pred2

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [19]:
# 10 ensemble
from sklearn.ensemble import VotingClassifier

# Combine the random forest and AdaBoost models by majority (hard) vote.
# NOTE(review): with only two voters any disagreement is a tie; confirm how
# VotingClassifier resolves ties before relying on this ensemble.
base_models = [('rf', model1), ('ad', model2)]
clf = VotingClassifier(estimators=base_models, voting='hard').fit(X_train, y_train)
pred3 = clf.predict(X_test)
pred3

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [None]:
# 11 evaluation
from sklearn.metrics import accuracy_score

# Print test-set accuracy for each set of predictions.
for model_name, predictions in (
    ("randomforest", pred1),
    ("ada", pred2),
    ("ensemble", pred3),
):
    print(f"{model_name}, {accuracy_score(y_test, predictions)}")

randomforest, 1.0
ada, 1.0
ensemble, 1.0


In [21]:
# 12 hyper parameter tuning
from sklearn.model_selection import GridSearchCV

# 2 x 2 grid of candidate settings, each scored with 3-fold cross-validation.
parameters = {'n_estimators': [50, 100], 'max_depth': [4, 6]}

# Seed the forest so the cross-validation scores, and therefore the selected
# best_params_, do not change from one run to the next.
model4 = RandomForestClassifier(random_state=1)

# NOTE: this rebinds `clf`, which held the voting ensemble in the ensemble cell.
clf = GridSearchCV(estimator=model4, param_grid=parameters, cv=3)

clf.fit(X_train, y_train)
print('best parameter', clf.best_params_)

best parameter {'max_depth': 6, 'n_estimators': 50}


In [None]:
# 13 prediction
# Save the ensemble's test-set predictions (label-encoded species codes) keyed
# by the original row index, then read the file back to confirm its contents.
submission = pd.DataFrame({'id': y_test.index, 'pred': pred3})
submission.to_csv('00300.csv', index=False)
check = pd.read_csv('00300.csv')
check.head(n=5)

Unnamed: 0,id,pred
0,57,0
1,173,1
2,213,1
3,50,0
4,25,0
