In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [5]:
# Load the 'penguins' example dataset (one of the names returned by
# sns.get_dataset_names()) and preview its first five rows.
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [6]:
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [7]:
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [9]:
df['species'].value_counts()

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

In [10]:
df.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

In [11]:
df.shape

(344, 7)

In [12]:
df['bill_length_mm'].median()

np.float64(44.45)

In [13]:
# Impute missing bill lengths with the column median. The median is computed
# from the data rather than pasted in as a literal (44.45), so the fill value
# cannot silently go stale if the dataset changes.
df['bill_length_mm'] = df['bill_length_mm'].fillna(df['bill_length_mm'].median())

In [14]:
df['bill_depth_mm'].median()

np.float64(17.3)

In [15]:
# Impute missing bill depths with the column median. The median is computed
# from the data rather than pasted in as a literal (17.3), so the fill value
# cannot silently go stale if the dataset changes.
df['bill_depth_mm'] = df['bill_depth_mm'].fillna(df['bill_depth_mm'].median())

In [16]:
df['sex'].value_counts()

sex
Male      168
Female    165
Name: count, dtype: int64

In [17]:
df['sex'].mode()

0    Male
Name: sex, dtype: object

In [18]:
# Impute missing sex values with the most frequent category. mode() returns a
# Series (ties are possible), so take its first entry instead of hard-coding
# the label.
df['sex'] = df['sex'].fillna(df['sex'].mode().iloc[0])


In [19]:
# Drop every row that still contains a missing value. Only bill_length_mm,
# bill_depth_mm and sex were filled earlier, so rows missing
# flipper_length_mm or body_mass_g are removed here.
df.dropna(inplace=True) # inplace=True mutates df directly instead of returning a new frame

In [20]:
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [21]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [22]:
# These two measurements hold whole numbers only, so store them as integers.
int_cols = ['flipper_length_mm', 'body_mass_g']
df[int_cols] = df[int_cols].astype(int)

In [23]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance used below to turn string categories into integer codes.
label = LabelEncoder()

In [24]:
# Encode the target with its own encoder so the integer -> species-name mapping
# (species_encoder.classes_ / inverse_transform) stays available for reading
# predictions and reports. Previously `label` was fitted on species and then
# immediately refitted on sex, which discarded the species mapping.
species_encoder = LabelEncoder()
df['species'] = species_encoder.fit_transform(df['species'])
# `label` ends up fitted on sex, exactly as before.
df['sex'] = label.fit_transform(df['sex'])

In [25]:
# One-hot encode 'island' into integer 0/1 indicator columns, one per island.
df1 = pd.get_dummies(df['island'], dtype=int)

In [26]:
# Append the island indicator columns to the right of the existing columns.
df = pd.concat(objs=[df, df1], axis='columns')

In [27]:
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Biscoe,Dream,Torgersen
0,0,Torgersen,39.1,18.7,181,3750,1,0,0,1
1,0,Torgersen,39.5,17.4,186,3800,0,0,0,1
2,0,Torgersen,40.3,18.0,195,3250,0,0,0,1
4,0,Torgersen,36.7,19.3,193,3450,0,0,0,1
5,0,Torgersen,39.3,20.6,190,3650,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
338,2,Biscoe,47.2,13.7,214,4925,0,1,0,0
340,2,Biscoe,46.8,14.3,215,4850,0,1,0,0
341,2,Biscoe,50.4,15.7,222,5750,1,1,0,0
342,2,Biscoe,45.2,14.8,212,5200,0,1,0,0


In [28]:
# Remove the original string column; its information now lives in the
# Biscoe / Dream / Torgersen indicator columns.
del df['island']

#### Model Building of SVM

##### Defining Input and Target Columns

In [29]:
# Feature matrix: every column except the target. Selecting by name instead of
# by position (iloc[:, 1:]) means a change in column order can never leak
# 'species' into the inputs or drop a real feature.
x = df.drop(columns='species')
x

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Biscoe,Dream,Torgersen
0,39.1,18.7,181,3750,1,0,0,1
1,39.5,17.4,186,3800,0,0,0,1
2,40.3,18.0,195,3250,0,0,0,1
4,36.7,19.3,193,3450,0,0,0,1
5,39.3,20.6,190,3650,1,0,0,1
...,...,...,...,...,...,...,...,...
338,47.2,13.7,214,4925,0,1,0,0
340,46.8,14.3,215,4850,0,1,0,0
341,50.4,15.7,222,5750,1,1,0,0
342,45.2,14.8,212,5200,0,1,0,0


In [30]:
# Target vector: the encoded species. Selected by name instead of by position
# (iloc[:, 0]) so it stays correct if the column order changes.
y = df['species']
y

0      0
1      0
2      0
4      0
5      0
      ..
338    2
340    2
341    2
342    2
343    2
Name: species, Length: 342, dtype: int64

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.75,
    random_state=25,
)

In [32]:
df.shape

(342, 9)

In [33]:
x_train.shape

(256, 8)

In [34]:
from sklearn.svm import SVC
# Baseline classifier with all default hyperparameters (RBF kernel, C=1.0),
# fitted below on the raw, unscaled features.
svc = SVC()

In [35]:
# Train the baseline model on the unscaled training split.
svc.fit(x_train, y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [36]:
# Predicted species codes for the held-out rows.
y_pred = svc.predict(x_test)

In [37]:
from sklearn.metrics import classification_report, recall_score, f1_score, precision_score

In [38]:
df['sex'].value_counts()

sex
1    177
0    165
Name: count, dtype: int64

In [39]:
# Support-weighted precision across the species classes (weighted averaging
# accounts for the classes having different sizes). zero_division=0 scores a
# class that is never predicted as an explicit 0 instead of emitting an
# UndefinedMetricWarning; the resulting value is the same as with the default.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
precision

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


0.608567021686811

In [40]:
# Support-weighted recall of the baseline predictions on the test split.
recall = recall_score(y_test, y_pred, average='weighted')
recall

0.7441860465116279

In [41]:
# Support-weighted F1 score of the baseline predictions on the test split.
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.6665641771019678

In [42]:
# Per-class precision / recall / F1 for the baseline model. zero_division=0
# reports a never-predicted class as 0.00 without the repeated
# UndefinedMetricWarning messages; the numbers in the table are unchanged.
clf = classification_report(y_test, y_pred, zero_division=0)
print(clf)

              precision    recall  f1-score   support

           0       0.66      0.92      0.77        38
           1       0.00      0.00      0.00        17
           2       0.88      0.94      0.91        31

    accuracy                           0.74        86
   macro avg       0.51      0.62      0.56        86
weighted avg       0.61      0.74      0.67        86



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply those same statistics to the test split. Calling
# fit_transform on x_test would refit the scaler on test data, so train and
# test would be standardized with different parameters and the evaluation
# would no longer reflect how the model sees genuinely unseen data.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
# Rebind `svc` to a fresh classifier whose class weights are balanced
# automatically; it replaces the baseline model created earlier.
svc = SVC(class_weight='balanced')

In [None]:
# Train the class-weighted model on the standardized training features.
svc.fit(x_train_scaled, y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [None]:
# Overwrite y_pred with predictions from the scaled, class-weighted model.
y_pred = svc.predict(x_test_scaled)

In [None]:
# Per-class metrics for the scaled, class-weighted model.
clf = classification_report(y_true=y_test, y_pred=y_pred)
print(clf)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        31

    accuracy                           1.00        86
   macro avg       1.00      1.00      1.00        86
weighted avg       1.00      1.00      1.00        86



#### The baseline model's performance was poor, so we do hyperparameter tuning