In [7]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Các model scikit learn cho bài toán phân loại
from sklearn.svm import SVC # Support vector machine
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

### 1. Các thông tin về dữ liệu

In [8]:
train_df = pd.read_csv("train_clean.csv")

In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1069 entries, 0 to 1068
Data columns (total 54 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   label               1069 non-null   object 
 1   is_flip             1069 non-null   bool   
 2   horizontal          1069 non-null   bool   
 3   nose_x              1069 non-null   float64
 4   nose_y              1069 non-null   float64
 5   nose_z              1069 non-null   float64
 6   left_shoulder_x     1069 non-null   float64
 7   left_shoulder_y     1069 non-null   float64
 8   left_shoulder_z     1069 non-null   float64
 9   right_shoulder_x    1069 non-null   float64
 10  right_shoulder_y    1069 non-null   float64
 11  right_shoulder_z    1069 non-null   float64
 12  left_elbow_x        1069 non-null   float64
 13  left_elbow_y        1069 non-null   float64
 14  left_elbow_z        1069 non-null   float64
 15  right_elbow_x       1069 non-null   float64
 16  right_

In [20]:
train_df.columns

Index(['label', 'is_flip', 'horizontal', 'nose_x', 'nose_y', 'nose_z',
       'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z',
       'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z',
       'left_elbow_x', 'left_elbow_y', 'left_elbow_z', 'right_elbow_x',
       'right_elbow_y', 'right_elbow_z', 'left_wrist_x', 'left_wrist_y',
       'left_wrist_z', 'right_wrist_x', 'right_wrist_y', 'right_wrist_z',
       'left_hip_x', 'left_hip_y', 'left_hip_z', 'right_hip_x', 'right_hip_y',
       'right_hip_z', 'left_knee_x', 'left_knee_y', 'left_knee_z',
       'right_knee_x', 'right_knee_y', 'right_knee_z', 'left_ankle_x',
       'left_ankle_y', 'left_ankle_z', 'right_ankle_x', 'right_ankle_y',
       'right_ankle_z', 'left_heel_x', 'left_heel_y', 'left_heel_z',
       'right_heel_x', 'right_heel_y', 'right_heel_z', 'left_foot_index_x',
       'left_foot_index_y', 'left_foot_index_z', 'right_foot_index_x',
       'right_foot_index_y', 'right_foot_index_z'],
      dtype='object')

In [10]:
train_df.head()

Unnamed: 0,label,is_flip,horizontal,nose_x,nose_y,nose_z,left_shoulder_x,left_shoulder_y,left_shoulder_z,right_shoulder_x,...,left_heel_z,right_heel_x,right_heel_y,right_heel_z,left_foot_index_x,left_foot_index_y,left_foot_index_z,right_foot_index_x,right_foot_index_y,right_foot_index_z
0,C,False,True,0.715581,0.50528,-0.053817,0.652975,0.46734,0.133374,0.654601,...,0.259811,0.260686,0.540411,0.058975,0.269104,0.620491,0.219716,0.2591,0.620381,-0.008508
1,C,False,True,0.721967,0.434911,-0.062868,0.648795,0.423715,0.109654,0.650268,...,0.269626,0.26763,0.50901,0.094375,0.274874,0.594534,0.237126,0.262166,0.595848,0.042041
2,C,False,True,0.718587,0.430805,-0.075426,0.646167,0.43933,0.105028,0.644745,...,0.239015,0.268436,0.511306,0.062695,0.265034,0.587411,0.197538,0.253162,0.58615,0.001218
3,C,False,True,0.71743,0.427609,-0.076077,0.642745,0.43797,0.10436,0.64308,...,0.241515,0.268167,0.512053,0.070121,0.263887,0.585149,0.201427,0.25419,0.585665,0.010792
4,C,False,True,0.719244,0.42946,-0.074835,0.6466,0.438147,0.101769,0.646526,...,0.241088,0.268044,0.510686,0.073515,0.268694,0.592094,0.206195,0.254927,0.58687,0.018408


### 2. Training model

In [35]:
# Chuyển các cột boolean về số
train_df["is_flip"] = train_df["is_flip"].astype(int)
train_df["horizontal"] = train_df["horizontal"].astype(int)

In [42]:
# Chuyển dữ liệu của label về dạng số
from sklearn.preprocessing import LabelEncoder

# Khởi tạo LabelEncoder
label_encoder = LabelEncoder()

train_df["label"] = label_encoder.fit_transform(train_df["label"])
# train_df['label_decoded'] = label_encoder.inverse_transform(train_df['label_encoded'])

In [43]:
# Extract features
X = train_df.drop(columns="label")
y = train_df["label"]

In [44]:
X.head()

Unnamed: 0,is_flip,horizontal,nose_x,nose_y,nose_z,left_shoulder_x,left_shoulder_y,left_shoulder_z,right_shoulder_x,right_shoulder_y,...,left_heel_z,right_heel_x,right_heel_y,right_heel_z,left_foot_index_x,left_foot_index_y,left_foot_index_z,right_foot_index_x,right_foot_index_y,right_foot_index_z
0,0,1,0.715581,0.50528,-0.053817,0.652975,0.46734,0.133374,0.654601,0.489673,...,0.259811,0.260686,0.540411,0.058975,0.269104,0.620491,0.219716,0.2591,0.620381,-0.008508
1,0,1,0.721967,0.434911,-0.062868,0.648795,0.423715,0.109654,0.650268,0.462463,...,0.269626,0.26763,0.50901,0.094375,0.274874,0.594534,0.237126,0.262166,0.595848,0.042041
2,0,1,0.718587,0.430805,-0.075426,0.646167,0.43933,0.105028,0.644745,0.462119,...,0.239015,0.268436,0.511306,0.062695,0.265034,0.587411,0.197538,0.253162,0.58615,0.001218
3,0,1,0.71743,0.427609,-0.076077,0.642745,0.43797,0.10436,0.64308,0.461373,...,0.241515,0.268167,0.512053,0.070121,0.263887,0.585149,0.201427,0.25419,0.585665,0.010792
4,0,1,0.719244,0.42946,-0.074835,0.6466,0.438147,0.101769,0.646526,0.463304,...,0.241088,0.268044,0.510686,0.073515,0.268694,0.592094,0.206195,0.254927,0.58687,0.018408


In [45]:
y

0       0
1       0
2       0
3       0
4       0
       ..
1064    1
1065    1
1066    1
1067    1
1068    1
Name: label, Length: 1069, dtype: int32

In [46]:
sc = StandardScaler()

# Chuẩn hoá tập dữ liệu
X = pd.DataFrame(sc.fit_transform(X))

In [47]:
# Tách dữ liệu ra tập train, test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)
y_test.head(3)

533    0
71     0
647    1
Name: label, dtype: int32

In [54]:
# X lúc này là một DataFrame mới với các cột mới được đánh số từ 0 đến n-1, trong đó n là số lượng cột trong dữ liệu ban đầu
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
0,-1.002810,0.0,1.143795,-0.244373,0.533166,1.086982,-0.526561,1.314017,1.215681,-0.128705,...,0.094110,-1.201363,0.406698,-0.844255,-1.250574,0.565927,0.082504,-1.217800,0.618381,-0.913525
1,-1.002810,0.0,1.177657,-1.183792,0.464983,1.056015,-1.364173,1.179692,1.183198,-0.673402,...,0.140654,-1.165028,-0.437558,-0.682369,-1.220344,-0.073853,0.161065,-1.201682,0.000272,-0.696312
2,-1.002810,0.0,1.159736,-1.238609,0.370369,1.036542,-1.064355,1.153494,1.141790,-0.680291,...,-0.004509,-1.160811,-0.375829,-0.827244,-1.271898,-0.249436,-0.017575,-1.249023,-0.244066,-0.871733
3,-1.002810,0.0,1.153601,-1.281273,0.365464,1.011192,-1.090477,1.149716,1.129306,-0.695240,...,0.007347,-1.162216,-0.355763,-0.793282,-1.277905,-0.305174,-0.000022,-1.243619,-0.256300,-0.830593
4,-1.002810,0.0,1.163220,-1.256557,0.374820,1.039753,-1.087074,1.135042,1.155145,-0.656577,...,0.005323,-1.162859,-0.392523,-0.777765,-1.252719,-0.134012,0.021492,-1.239740,-0.225936,-0.797866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1064,0.997198,0.0,-0.818149,1.182362,-0.350212,-0.776548,2.206899,-0.993165,-1.029871,0.539259,...,1.188789,0.332549,0.020929,1.661855,0.413519,-0.108854,1.186635,0.304250,-0.462464,1.745369
1065,0.997198,0.0,-0.729532,1.558653,-0.682986,-0.640695,2.447270,-1.121703,-0.971972,0.936090,...,0.898505,0.365439,1.041442,1.520222,0.502455,0.645640,0.924511,0.335908,0.633392,1.628799
1066,0.997198,0.0,-0.822235,1.644193,-0.296904,-0.691410,2.574788,-0.899252,-1.003308,0.914820,...,1.052859,0.428323,0.800055,1.489368,0.431644,0.478042,1.049095,0.404338,0.191048,1.538501
1067,0.997198,0.0,-0.845428,1.282853,-0.710797,-0.660771,2.354623,-1.124002,-0.987080,0.536010,...,1.218764,0.547418,0.731569,1.698000,0.590064,0.323156,1.175483,0.567986,0.309572,1.698619


In [55]:
from sklearn.metrics import (
    precision_score,
    accuracy_score,
    f1_score,
    recall_score,
    confusion_matrix,
)

import warnings

warnings.filterwarnings("ignore")

In [56]:
def round_up_metric_results(results) -> list:
    """Round up metrics results such as precision score, recall score, ..."""
    return list(map(lambda el: round(el, 3), results))

In [61]:
algorithms = [
    ("LR", LogisticRegression()),
    ("SVC", SVC(probability=True)),
    ("KNN", KNeighborsClassifier()),
    ("DTC", DecisionTreeClassifier()),
    ("NB", GaussianNB()),
    ("RF", RandomForestClassifier()),
]

models = {}
final_results = []

for name, model in algorithms:
    trained_model = model.fit(X_train, y_train)
    models[name] = trained_model

    # Evaluate model
    model_results = model.predict(X_test)

    p_score = precision_score(y_test, model_results, average=None, labels=[0, 1])
    a_score = accuracy_score(y_test, model_results)
    r_score = recall_score(y_test, model_results, average=None, labels=[0, 1])
    f1_score_result = f1_score(y_test, model_results, average=None, labels=[0, 1])
    cm = confusion_matrix(y_test, model_results, labels=[0, 1])
    final_results.append(
        (
            name,
            round_up_metric_results(p_score),
            a_score,
            round_up_metric_results(r_score),
            round_up_metric_results(f1_score_result),
            cm,
        )
    )

In [62]:
# Sort results by F1 score
final_results.sort(key=lambda k: sum(k[4]), reverse=True)

pd.DataFrame(
    final_results,
    columns=[
        "Model",
        "Precision Score",
        "Accuracy score",
        "Recall Score",
        "F1 score",
        "Confusion Matrix",
    ],
)

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,RF,"[0.882, 0.943]",0.906542,"[0.957, 0.845]","[0.918, 0.891]","[[112, 5], [15, 82]]"
1,KNN,"[0.835, 0.926]",0.869159,"[0.949, 0.773]","[0.888, 0.843]","[[111, 6], [22, 75]]"
2,SVC,"[0.803, 0.909]",0.841121,"[0.94, 0.722]","[0.866, 0.805]","[[110, 7], [27, 70]]"
3,DTC,"[0.847, 0.823]",0.836449,"[0.855, 0.814]","[0.851, 0.819]","[[100, 17], [18, 79]]"
4,LR,"[0.774, 0.767]",0.771028,"[0.821, 0.711]","[0.797, 0.738]","[[96, 21], [28, 69]]"
5,NB,"[0.622, 0.563]",0.598131,"[0.675, 0.505]","[0.648, 0.533]","[[79, 38], [48, 49]]"
