In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix

In [2]:
file_url = "https://raw.githubusercontent.com/IvanSergeyevichZhuk/ML_KPI/main/Data/Lab2/Part1/MN/MN07-Davydenko.csv"
data = pd.read_csv(file_url, header=None)
data = data.set_axis([0, 1, 2, 3, 4, 5, 6, 7,'class_type' ] , axis=1)

In [3]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,class_type
0,usual,improper,complete,2,less_conv,convenient,nonprob,recommended,very_recom
1,usual,improper,complete,2,less_conv,convenient,nonprob,priority,priority
2,usual,improper,complete,2,less_conv,convenient,nonprob,not_recom,not_recom
3,usual,improper,complete,2,less_conv,convenient,slightly_prob,recommended,very_recom
4,usual,improper,complete,2,less_conv,convenient,slightly_prob,priority,priority
...,...,...,...,...,...,...,...,...,...
295,usual,improper,completed,3,critical,convenient,problematic,priority,priority
296,usual,improper,completed,3,critical,convenient,problematic,not_recom,not_recom
297,usual,improper,completed,3,critical,inconv,nonprob,recommended,priority
298,usual,improper,completed,3,critical,inconv,nonprob,priority,priority


In [4]:
file_url = "https://raw.githubusercontent.com/IvanSergeyevichZhuk/ML_KPI/main/Data/Lab2/Part1/MN/MN07-Davydenko.csv"
data = pd.read_csv(file_url, header=None)
data = data.set_axis([0, 1, 2, 3, 4, 5, 6, 7,'class_type' ] , axis=1)
dataset=pd.get_dummies(data=data, columns=[0, 1, 2, 3, 4, 5, 6, 7])

In [5]:
dataset

Unnamed: 0,class_type,0_usual,1_improper,2_complete,2_completed,3_1,3_2,3_3,3_more,4_convenient,4_critical,4_less_conv,5_convenient,5_inconv,6_nonprob,6_problematic,6_slightly_prob,7_not_recom,7_priority,7_recommended
0,very_recom,1,1,1,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,1
1,priority,1,1,1,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0
2,not_recom,1,1,1,0,0,1,0,0,0,0,1,1,0,1,0,0,1,0,0
3,very_recom,1,1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,1
4,priority,1,1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,priority,1,1,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0
296,not_recom,1,1,0,1,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0
297,priority,1,1,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,0,1
298,priority,1,1,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0


In [6]:
dataset = dataset.replace({'class_type':{'not_recom':0, 'priority':1, 'very_recom':2}})
dataset = dataset.reindex(columns=[ '2_complete', '2_completed','3_1','3_2','3_3','3_more','4_convenient','4_critical', '4_less_conv', '5_convenient', '5_inconv', '6_nonprob', '6_problematic', '6_slightly_prob', '7_not_recom', '7_priority', '7_recommended','class_type'])

##DTree

In [16]:
#Підрахунок ентропії
def entropy(target_col):
    elements,counts = np.unique(target_col,return_counts=True)
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy
  
#Інформаційний приріст

def InfoGain(data,split_attribute_name,target_name="class_type"):
    total_entropy = entropy(data[target_name])
    vals,counts = np.unique(data[split_attribute_name],return_counts=True)
    #підрахунок виваженої ентропії
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).
                                dropna()[target_name])for i in range(len(vals))])
    
    #формула Інформаційного приросту
    Information_Gain = total_entropy-Weighted_Entropy
    #print('Інформаційний приріст', Information_Gain)
    return Information_Gain

def Dtree(data,originaldata,features,target_attribute_name="class_type",
        parent_node_class=None):
    #якщо всі значення будуть однаковими
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    
    #якщо порожніми
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name],
                                                                           return_counts=True)[1])]
    
    #якщо буде порожнє
    elif len(features) == 0:
        return parent_node_class 

    #тоді створюємо дерево
    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name],
                                                                           return_counts=True)[1])]

    #Вибір найкращого поділу
    item_values = [InfoGain(data,feature,target_attribute_name)for feature in features] #повертає значення Інформаційного приросту
    best_feature_index = np.argmax(item_values)
    best_feature = features[best_feature_index]

    #створення структури дерева
    tree = {best_feature:{}}

    #Оновлення функції з найкращим отриманням інформації
    features = [i for i in features if i!= best_feature]


    #Одержання гілки під кореневим вузлом
    for value in np.unique(data[best_feature]):
        value = value
        sub_data = data.where(data[best_feature]==value).dropna()
        #виклик дерева
        subtree = Dtree(sub_data,dataset,features,target_attribute_name,parent_node_class)
        #створення гілки
        tree[best_feature][value] = subtree
    return(tree)

def predict(query,tree,default=1):
    for key in list(query.keys()):
        if key in list(tree.keys()):
            try:
               result = tree[key][query[key]]
            except:
               return default

            result = tree[key][query[key]]
            if isinstance(result,dict):
                return predict(query,result)
            else:
                return result

def train_test_split(dataset):
    training_data = dataset.iloc[:250].reset_index(drop=True)
    testing_data = dataset.iloc[-50:].reset_index(drop=True)
    return training_data,testing_data
training_data = train_test_split(dataset)[0]
testing_data = train_test_split(dataset)[1]

def test(data,tree):
   queries = data.iloc[:,:-1].to_dict(orient="records")
   predicted = pd.DataFrame(columns=["predicted"])

   for i in range(len(data)):
       predicted.loc[i,"predicted"] = predict(queries[i],tree,1.0)
   print("Prediction accuracy :",(np.sum(predicted["predicted"]==data["class_type"])/len(data))*100,'%')
  


In [17]:

tree = Dtree(training_data,training_data,training_data.columns[:-1])
#print(tree)
test(testing_data,tree)

Prediction accuracy : 88.0 %


In [9]:
from sklearn.metrics import confusion_matrix
def answer(data,tree):
   queries = data.iloc[:,:-1].to_dict(orient="records")
   predicted = pd.DataFrame(columns=["predicted"])

   for i in range(len(data)):
       predicted.loc[i,"predicted"] = predict(queries[i],tree,1.0)
   return predicted["predicted"] , data["class_type"]


Y_real= []
Y_predicting = []
for i in  answer(testing_data,tree)[0]:
  Y_real.append(i)
for i in  answer(testing_data,tree)[1]:
  Y_predicting.append(i)

dfs = pd.DataFrame()
dfs["actual"] = Y_real
dfs["predict_Dtree"] = Y_predicting
dfs

Unnamed: 0,actual,predict_Dtree
0,1.0,1
1,0.0,0
2,2.0,2
3,1.0,1
4,0.0,0
5,2.0,2
6,1.0,1
7,0.0,0
8,1.0,1
9,1.0,1


In [10]:
confusion_matrix(Y_real, Y_predicting)

array([[17,  0,  0],
       [ 0, 25,  0],
       [ 0,  6,  2]])

##KNN


In [11]:
#Метрика Евкліда
def eucledian(p1,p2):
    dist = np.sqrt(np.sum((p1-p2)**2))
    return dist
 
#Функція обчислення KNN
def predict(x_train, y , x_input, k):
    op_labels = []
     
    #Цикл за точками даних, що підлягають класифікації
    for item in x_input: 
         
        #Масив для зберігання відстаней
        point_dist = []
         
      
        for j in range(len(x_train)): 
            distances = eucledian(np.array(x_train[j,:]) , item) 
            #Обчислення відстані
            point_dist.append(distances) 
        point_dist = np.array(point_dist) 
         
        #Сортування масиву із збереженням індексу
        #Збереження перших K точок даних
        dist = np.argsort(point_dist)[:k] 
         
        labels = y[dist]
        
        lab = mode(labels) 
        lab = lab.mode[0]
        op_labels.append(lab)
 
    return op_labels

In [12]:
TRAIN = dataset.iloc[:200].reset_index(drop=True)
TEST = dataset.iloc[-50:].reset_index(drop=True)

X_train = TRAIN.iloc[:,:-1].reset_index(drop=True).to_numpy()
X_test = TEST.iloc[:,:-1].reset_index(drop=True).to_numpy()#.astype('int64')

y_train = TRAIN.iloc[:,-1:].reset_index(drop=True).to_numpy()
y_test = TEST.iloc[:,-1:].reset_index(drop=True).to_numpy()

In [13]:
y_pred = predict(X_train,y_train,X_test,3)
accuracy_score(y_test, y_pred)

0.92

In [14]:
for i in range(1,20):
  y_pred = predict(X_train,y_train,X_test,i)
  print(accuracy_score(y_test, y_pred))
y_pred = predict(X_train,y_train,X_test,2)

0.96
1.0
0.92
0.98
0.94
0.96
0.98
0.98
0.98
1.0
0.98
0.96
1.0
0.96
0.98
0.96
0.98
0.96
0.96


In [15]:
Y_pred_KNN = []
for i in y_pred:
  Y_pred_KNN.append(i[0])
dfs["predict_KNN"] = Y_pred_KNN
dfs

Unnamed: 0,actual,predict_Dtree,predict_KNN
0,1.0,1,1
1,0.0,0,0
2,2.0,2,2
3,1.0,1,1
4,0.0,0,0
5,2.0,2,2
6,1.0,1,1
7,0.0,0,0
8,1.0,1,1
9,1.0,1,1
