# Xây dựng mô hình từ giải thuật SVM trên dữ liệu các con thú trong rừng.
Dữ liệu lấy từ [Animal-Condition-Predict(SVM+KNN)](https://www.kaggle.com/code/kareemellithy/animal-condition-predict-svm-knn)

## Chuẩn bị

### Khai báo thư viện cần thiết

In [29]:
import numpy as np
import pandas as pd
from IPython import display
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import LabelEncoder

### Tải dữ liệu

In [30]:
animal_data = pd.read_csv('../data/animal_dataset.csv')

## Sơ lược về dữ liệu

### Head, Tail

In [31]:
print("Head 5")
display.display(animal_data.head())
print("Tail 5")
display.display(animal_data.tail())

Head 5


Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
0,Dog,Fever,Diarrhea,Vomiting,Weight loss,Dehydration,Yes
1,Dog,Fever,Diarrhea,Coughing,Tiredness,Pains,Yes
2,Dog,Fever,Diarrhea,Coughing,Vomiting,Anorexia,Yes
3,Dog,Fever,Difficulty breathing,Coughing,Lethargy,Sneezing,Yes
4,Dog,Fever,Diarrhea,Coughing,Lethargy,Blue Eye,Yes


Tail 5


Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
866,Buffaloes,Fever,Difficulty breathing,Poor Appetite,Eye and Skin change,Unable to exercise,Yes
867,Buffaloes,Fever,Loss of appetite,Lession on the skin,Lethargy,Joint Pain,Yes
868,Buffaloes,Lesions in the nasal cavity,Lesions on nose,Vomiting,Noisy Breathing,Lesions on nose,Yes
869,Buffaloes,Hair loss,Dandruff,Vomiting,Crusting of the skin,Ulcerated skin,Yes
870,Buffaloes,Greenish-yellow nasal discharge,Lack of pigmentation,Vomiting,Lethargy,Pain on face,Yes


### Shape, info

In [32]:
print("Shape:", animal_data.shape)
print("Info:")
animal_data.info()

Shape: (871, 7)
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   AnimalName  871 non-null    object
 1   symptoms1   871 non-null    object
 2   symptoms2   871 non-null    object
 3   symptoms3   871 non-null    object
 4   symptoms4   871 non-null    object
 5   symptoms5   871 non-null    object
 6   Dangerous   869 non-null    object
dtypes: object(7)
memory usage: 47.8+ KB


### Mean, std, min, max, count

In [33]:
animal_data.describe().T

Unnamed: 0,count,unique,top,freq
AnimalName,871,46,Buffaloes,129
symptoms1,871,232,Fever,257
symptoms2,871,230,Diarrhea,119
symptoms3,871,229,Coughing,95
symptoms4,871,217,Weight loss,117
symptoms5,871,203,Pains,99
Dangerous,869,2,Yes,849


## Làm sạch dữ liệu

### NaN, Null

In [34]:
animal_data.isnull().sum()

AnimalName    0
symptoms1     0
symptoms2     0
symptoms3     0
symptoms4     0
symptoms5     0
Dangerous     2
dtype: int64

In [35]:
animal_data.dropna(inplace=True)
print(animal_data.isna().sum())
print("Shape after dropping NaN:", animal_data.shape)

AnimalName    0
symptoms1     0
symptoms2     0
symptoms3     0
symptoms4     0
symptoms5     0
Dangerous     0
dtype: int64
Shape after dropping NaN: (869, 7)


Kiểm tra duplicate

In [36]:
animal_data[animal_data.duplicated()]

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
772,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes
773,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes
774,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes
775,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes
776,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes
777,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes
778,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes
779,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes
780,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes
781,Tiger,Fetopelvic dispropotion,Uteria inertia,Malpresentation,Death,Pains,Yes


In [37]:
animal_data = animal_data.drop_duplicates()
print("Duplicate rows after removing:", animal_data.duplicated().sum())
print("Duplicate removed. New shape:", animal_data.shape)

Duplicate rows after removing: 0
Duplicate removed. New shape: (841, 7)


### Label Encoding

In [38]:
cols = animal_data.select_dtypes(include=['object']).columns
la = LabelEncoder()
for col in cols:
    animal_data[col] = la.fit_transform(animal_data[col])
    
display.display(animal_data.sample(10))

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous
78,27,131,74,180,123,81,1
337,29,172,40,43,29,113,1
32,6,63,6,185,87,110,1
390,42,223,216,224,181,180,1
169,15,63,193,57,36,110,1
568,12,63,180,38,33,148,1
661,2,63,32,188,146,36,1
657,2,159,149,160,41,79,1
815,8,212,217,217,210,178,1
275,29,25,34,38,5,25,1


## Chia dữ liệu train & test

In [39]:
data = animal_data.drop(columns=['Dangerous'])
target = animal_data['Dangerous']
display.display(data)
display.display(target)

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5
0,6,63,30,179,181,32
1,6,63,30,31,164,113
2,6,63,30,31,172,8
3,6,63,33,31,87,142
4,6,63,30,31,87,21
...,...,...,...,...,...,...
866,2,63,33,115,53,160
867,2,63,94,88,87,78
868,2,97,89,179,107,82
869,2,77,25,179,27,159


0      1
1      1
2      1
3      1
4      1
      ..
866    1
867    1
868    1
869    1
870    1
Name: Dangerous, Length: 841, dtype: int64

In [40]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

## Huấn luyện mô hình SVM

In [41]:
clf = svm.SVC()
clf.fit(X_train, y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


## Đánh giá độ chính xác

In [42]:
train_acc = clf.score(X_train,y_train)
val_acc = clf.score(X_test,y_test)
print('Training accuracy: {}'.format(train_acc))
print('Validation accuracy: {}'.format(val_acc))

Training accuracy: 0.9747023809523809
Validation accuracy: 0.9822485207100592


## Tìm tham số tối ưu

In [43]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
best_svm = None
best_val_acc = -1
best_kernel = None

for i in range(4):
    clf = svm.SVC(kernel=kernels[i], probability=True)
    clf.fit(X_train, y_train)
    tmp_val_acc = clf.score(X_test, y_test)
    if (tmp_val_acc > best_val_acc):
        best_val_acc = tmp_val_acc
    best_svm = clf
    best_kernel = kernels[i]

print("Best validation accuracy : {} with kernel: {}".format(best_val_acc, best_kernel))

Best validation accuracy : 0.9822485207100592 with kernel: sigmoid


# Kết thúc