Name: Yerlan Negmetulla

## import needed libraries


In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## read the car.data

In [2]:
data = pd.read_table('car.data', delimiter = ',', header=None)
data

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good



## do the preprocessing

In [3]:
def preprocessing_labels(digits, labels_dict):
    for x in labels_dict: 
        digits = np.where(digits==x, labels_dict[x], digits)
    return digits

labels = np.unique(data)
labels_dict = {
    'low':1,    'med':2,  'high':3,  'vhigh':4,
    '2':2,      '3':3,    '4':4,     '5more':5,  'more':6,
    'small':1,  'big':3,
    'unacc':1,  'acc':2,  'good':3,  'vgood':4
}

In [4]:
def preprocessing_data(data, labels, labels_dict):
    preprocessed_labels = preprocessing_labels(labels, labels_dict)
    preprocessed_data = np.zeros(data.shape)
    for i in range(data.shape[1]):
        for j in range(data.shape[0]):
            preprocessed_data[j][i] = preprocessed_labels[ np.where(labels == data[i][j]) ]
    return preprocessed_data

preprocessed_data = preprocessing_data(data, labels, labels_dict)


## Apply Decision Tree Algorithm
You can use sklearn
As usual, divide the dataset for train and test

In [5]:
X = preprocessed_data[:, :-1]
y = preprocessed_data[:,  -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

def confusion_matrix(y_actu, y_pred):
    labels = np.unique(y_actu)
    labels_number = len(labels)
    model = np.zeros((labels_number, labels_number), dtype=int)
    for i in range(labels_number):
        for j in range(labels_number):
            model[i][j] = np.sum((y_actu==labels[i]) & (y_pred==labels[j]))
    return model, labels_number

In [6]:
def accuracy(y_actu, y_pred):
    model, labels_number = confusion_matrix(y_actu, y_pred)
    sum_ = 0
    for i in range(labels_number):
        sum_ += model[i][i]
    return sum_ / model.sum()

print(f"Accuracy is about  {accuracy(y_test, pred)} %")

Accuracy is about  0.9633911368015414 %


## Write your own function for f1-score

In [7]:
def f1_macro(y_actu, y_pred):
    model, labels_number = confusion_matrix(y_actu, y_pred)
    sum_axis_0 = model.sum(axis=0)
    sum_axis_1 = model.sum(axis=1)
    precision = 0
    recall = 0
    for i in range(labels_number):
        precision += model[i][i]/sum_axis_0[i]
        recall += model[i][i]/sum_axis_1[i]
    precision /= labels_number
    recall /= labels_number
    return 2 * precision * recall / (precision + recall)

print(f"F1 score is about:  {f1_macro(y_test, pred)} %")

F1 score is about:  0.8937547024493343 %
