In [1]:
# Load seaborn's 'penguins' example dataset into a DataFrame and preview it
import pandas as pd
import seaborn as sns

df = sns.load_dataset('penguins')

# Show only the first five rows rather than the whole frame
preview = df.head()
print(preview)

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [2]:
# Summarise the frame: row count, per-column non-null counts, and dtypes.
# DataFrame.info() writes this report to stdout itself and returns None, so it
# is called bare -- wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB
None


In [3]:
# Count the missing entries in each column
missing_counts = df.isnull().sum()
print(missing_counts)

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64


In [6]:
# Imputing missing numerical data
from sklearn.impute import SimpleImputer
import numpy as np

# Numeric measurement columns whose NaNs will be filled in
columns = ['bill_depth_mm', 'bill_length_mm', 'flipper_length_mm', 'body_mass_g']

# Mean imputation. Use np.nan: the np.NaN alias was removed in NumPy 2.0 and
# raises AttributeError there.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the per-column means from the selected columns
# NOTE(review): the means are computed on every row, before the train/test
# split performed later in the notebook, so test rows influence the fill values.
imputer = imputer.fit(df[columns])

# Replace the NaNs in those columns with the learned means
df[columns] = imputer.transform(df[columns])

In [7]:
# Dropping missing records in the sex column
# Take an explicit copy of the filtered frame: new columns are assigned to df
# in later cells, and writing to a row-filtered result can trigger pandas'
# SettingWithCopyWarning on some pandas versions.
df = df.dropna(subset=['sex']).copy()

In [9]:
# Re-count missing entries per column after imputing and dropping rows
remaining_nulls = df.isnull().sum()
print(remaining_nulls)

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [10]:
# Encode sex as an integer column: Male -> 0, Female -> 1
sex_to_int = {'Male': 0, 'Female': 1}
df['sex int'] = df['sex'].map(sex_to_int)

In [11]:
# List the distinct values that appear in the island column
island_values = df['island'].unique()
print(island_values)

['Torgersen' 'Biscoe' 'Dream']


In [12]:
# One-hot encode the island feature into one indicator column per island
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()

# The double brackets pass a one-column DataFrame (2-D input) to the encoder
encoded = one_hot.fit_transform(df[['island']])

# categories_[0] holds the labels found for the single encoded column; they
# are used as the names of the new indicator columns
island_labels = one_hot.categories_[0]
df[island_labels] = encoded.toarray()

In [13]:
# Remove the original text columns now that encoded versions have been added
encoded_sources = ['island', 'sex']
df = df.drop(encoded_sources, axis=1)

In [14]:
# Splitting the data and creating a model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# The target is the species label; the features are every other column.
# Selecting by name instead of df.iloc[:, 1:] keeps this correct even if
# 'species' is not the first column of the frame.
y = df['species']
X = df.drop(columns=['species'])

# Hold out 30% of the rows for testing, stratified on the target, with a
# fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=100
)

# 100-tree random forest, seeded for repeatable results
forest = RandomForestClassifier(n_estimators=100, random_state=100)

In [15]:
# Train the forest on the training split, then predict labels for the test split.
# fit() returns the fitted estimator, so the two steps can be chained.
predictions = forest.fit(X_train, y_train).predict(X_test)

In [16]:
# Score the predictions against the true test labels
from sklearn import metrics

accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.97
