In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
from statistics import mode
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import math
from sklearn.mixture import GaussianMixture

### Building abstractions for the model's input and output data

In [2]:
class InputData:
    """Feature table of a dataset.

    `data` is the working copy; `original` keeps a reference to the frame the
    object was created with.
    """

    def __init__(self, data):
        # Both attributes start out bound to the same object; `normalise`
        # later rebinds only `self.data`.
        self.original = data
        self.data = data

    def row(self, x):
        """Return the row at integer position `x` of the working data."""
        frame = self.data
        return frame.iloc[x, :]

    def normalise(self):
        """Replace the working data with its column-wise min-max normalised form."""
        normalised = self.data.apply(MinMaxNormalization)
        self.data = normalised

        
class OutputData:
    """Class labels of a dataset, held as a plain Python list."""

    def __init__(self, data):
        # Materialise whatever iterable was passed in so it can be indexed by position.
        self.data = [label for label in data]

    def cls(self, x):
        """Return the label stored at position `x`."""
        labels = self.data
        return labels[x]
 
class Data:
    """A labelled dataset split into features (`x`) and labels (`y`).

    The frame passed in must contain a 'Class' column; every other column is
    treated as a feature.
    """

    def __init__(self, data):
        self.data = data
        self.x = None
        self.y = None
        self.get_input()
        self.get_output()

    def get_output(self):
        """Wrap the 'Class' column in an OutputData and store it as `y`."""
        self.y = OutputData(self.data['Class'])

    def get_input(self):
        """Wrap every column except 'Class' in an InputData and store it as `x`."""
        features = self.data.drop(columns='Class')
        self.x = InputData(features)

    def change_data(self, new_data):
        """Swap in a new frame and rebuild `x` and `y` from it."""
        self.data = new_data
        self.get_input()
        self.get_output()


## Defining some basic functions 

In [3]:
def EucledianDistance(ser1, ser2):
    """Return the Euclidean (L2) distance between two numeric vectors."""
    difference = ser1 - ser2
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)

def MinMaxNormalization(ser):
    """Rescale a numeric series linearly onto the interval [0, 1].

    Parameters
    ----------
    ser : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        Float series with the same index as `ser`: the smallest value maps to
        0 and the largest to 1. A constant series has no spread to rescale, so
        its entries are all mapped to 0.0 rather than dividing by zero.
    """
    # Series.min/max skip missing values, unlike the builtin min/max whose
    # result depends on where a NaN sits in the sequence.
    mini = ser.min()
    maxi = ser.max()
    spread = maxi - mini
    if spread == 0:
        # Every value equals `mini`; (x - min) / (max - min) would be 0 / 0.
        return (ser - mini).astype(float)
    return (ser - mini) / spread

def Standardization(ser):
    """Return `ser` centred on its mean and scaled by its standard deviation.

    The scale is whatever `Series.std()` returns with its default arguments.
    """
    centre = ser.mean()
    scale = ser.std()

    def z_score(value):
        return (value - centre) / scale

    return ser.apply(z_score)

def myround(x):
    """Round `x` to three decimal places with the built-in `round`."""
    decimals = 3
    return round(x, decimals)


In [4]:
# Load the two-class steel plate faults dataset (path is relative to the working directory)
data = pd.read_csv('SteelPlateFaults-2class.csv')

In [5]:
# Show the loaded frame as this cell's output
data

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Class
0,1325,1339,30207,30238,268,29,31,25809,79,124,...,0.4828,1.0000,1.0,2.4281,1.1461,1.4914,0.5484,-0.2476,0.7065,1
1,1,16,55572,55629,370,48,62,39293,27,119,...,0.3125,0.9194,1.0,2.5682,1.1761,1.7559,0.7368,-0.1703,0.9755,1
2,1323,1333,68445,68506,330,48,61,33449,90,119,...,0.2083,1.0000,1.0,2.5185,1.0000,1.7853,0.8361,-0.2081,0.8861,1
3,1324,1333,75642,75681,207,25,39,21354,93,124,...,0.3600,1.0000,1.0,2.3160,0.9542,1.5911,0.7692,-0.1941,0.5805,1
4,1324,1335,97132,97213,594,55,81,61608,93,125,...,0.2000,1.0000,1.0,2.7738,1.0414,1.9085,0.8642,-0.1897,0.9806,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,250,373,3629947,3630060,6114,320,197,633741,40,134,...,0.3844,0.5736,0.0,3.7863,2.0899,2.0531,-0.0813,-0.1902,1.0000,0
1115,243,370,3658370,3658511,7639,462,260,819375,40,135,...,0.2749,0.5423,1.0,3.8830,2.1038,2.1492,0.0993,-0.1620,1.0000,0
1116,241,360,3711661,3711800,7080,492,293,773694,43,140,...,0.2419,0.4744,1.0,3.8500,2.0756,2.1430,0.1439,-0.1463,1.0000,0
1117,836,878,2150529,2150756,5390,297,258,635678,9,143,...,0.1414,0.8798,1.0,3.7316,1.6233,2.3560,0.8150,-0.0786,1.0000,0


In [6]:
# Split the rows by class label
data_class1 = data[data.Class == 1]  # class 1 rows
data_class0 = data[data.Class == 0]  # class 0 rows


# Split each class 70/30 into training and testing rows, so both splits keep
# the class proportions of the full dataset
train1, test1 = train_test_split(data_class1, test_size=0.3, random_state=42)
train0, test0 = train_test_split(data_class0, test_size=0.3, random_state=42)

# Combine the two classes and shuffle the rows. The shuffle is seeded so the
# CSV files written below are identical on every run of the notebook.
train = pd.concat([train1, train0]).sample(frac=1, random_state=42).reset_index(drop=True)
test = pd.concat([test1, test0]).sample(frac=1, random_state=42).reset_index(drop=True)



# Store the training and testing data in CSV files. The index is written as the
# first column; later cells read these files back with index_col=0.
train.to_csv('SteelPlateFaults-train.csv')
test.to_csv('SteelPlateFaults-test.csv')


# Wrap the train and test frames in Data objects (features in .x, labels in .y)
train = Data(train)
test = Data(test)


## KNN Classifier function 

In [7]:
# # normalising data before applying KNN since it is a distance measure

# def KNN(test_sample, k):
#     """Will return the class to which test_sample belong"""
#     distances = []
#     n = train.x.data.shape[0]
#     for i in range(n):
#         row = train.x.row(i)
#         dis = [EucledianDistance(row, test_sample), i]
#         distances.append(dis)
#     distances.sort()
#     classes_observed = [train.y.cls(distances[i][1]) for i in range(k)]
#     return mode(classes_observed)

### Performing KNN on un-normalised data

In [8]:
# for k in [1,3, 5]:
#     test_y_predicted = [KNN(test.x.row(i), k) for i in range(test.x.data.shape[0])]
#     test_y_predicted = list(test_y_predicted)
#     cnf_mtrx = confusion_matrix(list(test.y.data), test_y_predicted)
#     acc = accuracy_score(test.y.data, test_y_predicted)
#     print("Confusion matrix for k:", k)
#     print("Accuracy score is:", acc)
#     print(cnf_mtrx)
#     print()
#     print()

## Normalising the data

In [9]:
# Min-max normalise the feature columns: KNN compares samples by distance, so
# every attribute should contribute on the same scale.
# NOTE(review): each split is scaled with its own min/max; confirm whether the
# test split should instead be scaled with the training split's min/max.
train.x.normalise()
test.x.normalise()

# `normalise` only rebinds `.x.data`; `.data` still holds the raw values.
# Build the frames to export from the normalised features plus the class labels
# so the "-Normalised" files really contain normalised data.
train_normalised = train.x.data.assign(Class=train.y.data)
test_normalised = test.x.data.assign(Class=test.y.data)

# Store the normalised training and testing data in CSV files
train_normalised.to_csv('SteelPlateFaults-train-Normalised.csv', index=False)
test_normalised.to_csv('SteelPlateFaults-test-Normalised.csv', index=False)

## Performing KNN on normalised data

In [10]:
# for k in [1,3, 5]:
#     test_y_predicted = [KNN(test.x.row(i), k) for i in range(test.x.data.shape[0])]
#     test_y_predicted = list(test_y_predicted)
#     cnf_mtrx = confusion_matrix(list(test.y.data), test_y_predicted)
#     acc = accuracy_score(test.y.data, test_y_predicted)
#     print("Confusion matrix for k:", k)
#     print("Accuracy score is:", acc)
#     print(cnf_mtrx)
#     print()
#     print()

# Bayes Classification 


In [11]:
# Reload the stored splits and discard the redundant columns from both of them
train_bayes = pd.read_csv('SteelPlateFaults-train.csv', index_col=0)
test_bayes = pd.read_csv('SteelPlateFaults-test.csv', index_col=0)

train = Data(train_bayes)
test = Data(test_bayes)

redundant_columns = ["X_Minimum", "Y_Minimum", "TypeOfSteel_A400", "TypeOfSteel_A300"]
for split in (test, train):
    # change_data rebuilds the feature/label views from the reduced frame
    split.change_data(split.data.drop(columns=redundant_columns))


### The Bayes classifier is built from scratch, and the GMM (Gaussian Mixture Model) classifier is built on top of it using scikit-learn's GaussianMixture

In [12]:
class ClassParameter:
    """Gaussian parameters of one class: its mean vector and its covariance matrix."""

    def __init__(self, mean, vec):
        # `vec` is stored as the covariance matrix, despite the parameter's name.
        self.cov_matrix = vec
        self.mean_vector = mean



class BayesClassifier:
    """Bayes classifier that models each class with one multivariate Gaussian.

    Parameters
    ----------
    data : pandas.DataFrame
        Training data. Must contain a 'Class' column; every other column is
        used as a feature.

    Attributes
    ----------
    classes : the distinct values of the 'Class' column.
    prior : dict mapping class -> fraction of training rows in that class.
    parameter : dict mapping class -> ClassParameter (mean vector and full
        covariance matrix estimated from that class's rows).
    """

    def __init__(self, data):
        self.data = data
        self.classes = self.data['Class'].unique()
        self.prior = {}
        self.parameter = {}
        self.seperate_classes()

    def seperate_classes(self):
        """Estimate the Gaussian parameters and the prior probability of every class."""
        grouped = self.data.groupby('Class')
        total_rows = self.data.shape[0]

        for cls in self.classes:
            cls_data = grouped.get_group(cls).drop(columns=['Class'])
            self.parameter[cls] = ClassParameter(cls_data.mean(), cls_data.cov())
            self.prior[cls] = cls_data.shape[0] / total_rows

    def likelihood(self, sample, parameter):
        """Return the log of the Gaussian density of `sample` under `parameter`.

        Parameters
        ----------
        sample : feature vector (Series or array) ordered like the mean vector.
        parameter : object exposing `mean_vector` and `cov_matrix`.

        Returns
        -------
        float
            log N(sample | mean, cov)
            = -0.5 * ((x - mu)^T cov^-1 (x - mu) + d * log(2 * pi) + log|cov|)

        Raises
        ------
        numpy.linalg.LinAlgError
            If the covariance matrix is singular or not positive definite.
        """
        diff = np.asarray(sample - parameter.mean_vector, dtype=float)
        cov = np.asarray(parameter.cov_matrix, dtype=float)

        # The log-determinant is computed directly: the plain determinant of a
        # high-dimensional covariance matrix easily overflows or underflows,
        # which would turn the score into +/-inf or NaN.
        sign, log_det = np.linalg.slogdet(cov)
        if sign <= 0:
            raise np.linalg.LinAlgError("covariance matrix is not positive definite")

        # Solving cov @ z = diff avoids forming the explicit inverse.
        mahalanobis_sq = diff @ np.linalg.solve(cov, diff)
        dims = diff.shape[0]
        return -0.5 * (mahalanobis_sq + dims * math.log(2 * math.pi) + log_det)

    def predict(self, sample):
        """Return the class with the largest log-posterior (log-likelihood + log-prior)."""
        scored = [
            (self.likelihood(sample, self.parameter[cls]) + np.log(self.prior[cls]), cls)
            for cls in self.classes
        ]
        # Tuples compare by score first, then by class label on exact ties.
        return max(scored)[1]
    
    
    
    
    
class GMM_Model(BayesClassifier):
    """Bayes classifier whose class-conditional densities are Gaussian mixtures.

    Parameters
    ----------
    data : pandas.DataFrame
        Training data with a 'Class' column; the other columns are features.
    Q : int
        Number of mixture components fitted for every class.
    """

    def __init__(self, data, Q):
        BayesClassifier.__init__(self, data)
        self.components = Q
        self.g_mixtures = {}
        self.make_mixtures()

    def make_mixtures(self):
        """Fit one Gaussian mixture with `self.components` components per class."""
        grouped = self.data.groupby('Class')
        for cls in self.classes:
            cls_data = grouped.get_group(cls).drop(columns=['Class'])
            # Use the component count given to this instance rather than a
            # global variable that happens to be named Q.
            gm = GaussianMixture(
                n_components=self.components,
                random_state=42,
                covariance_type='full',
                reg_covar=1e-5,
            )
            gm.fit(cls_data)
            self.g_mixtures[cls] = gm

    def predict(self, sample):
        """Return the class with the largest mixture log-likelihood plus log-prior."""
        scored = []

        for cls in self.classes:
            # score_samples returns one log-likelihood per row; a single row is
            # passed, so take its only entry to compare plain scalars.
            log_likelihood = self.g_mixtures[cls].score_samples([sample])[0]
            scored.append((log_likelihood + np.log(self.prior[cls]), cls))

        return max(scored)[1]
            
        
    

## Bayes Classifier Accuracy (unimodal Gaussian with full covariance matrix)

In [13]:

# Fit the Bayes classifier on the training frame
bayes = BayesClassifier(train.data)

# Predict a class for every test row, one row at a time
n_test_rows = test.x.data.shape[0]
test_y_predicted_bayes = [bayes.predict(test.x.row(i)) for i in range(n_test_rows)]

# Compare the predictions with the true labels
cnf_matrix_bayes = confusion_matrix(list(test.y.data), test_y_predicted_bayes)
accuracy_bayes = accuracy_score(test.y.data, test_y_predicted_bayes)
print("Accuracy score is:", accuracy_bayes)
print(cnf_matrix_bayes)
print("\n")

Accuracy score is: 0.9436201780415431
[[102  16]
 [  3 216]]




# GMM Accuracy for Different Numbers of Components


In [14]:
# Reload the stored splits and discard the redundant columns
columns_to_drop = ["X_Minimum", "Y_Minimum", "TypeOfSteel_A400", "TypeOfSteel_A300"]
test_data = pd.read_csv('SteelPlateFaults-test.csv', index_col=0).drop(columns=columns_to_drop)
train_data = pd.read_csv('SteelPlateFaults-train.csv', index_col=0).drop(columns=columns_to_drop)

# Wrap both frames so features and labels are available as .x and .y
test = Data(test_data)
train = Data(train_data)

In [15]:
# Evaluate the GMM-based classifier for several component counts
for Q in [2, 4, 8, 16]:
    gmm_model = GMM_Model(train_data, Q)

    # Predict a class for every test row, one row at a time
    n_test_rows = test.data.shape[0]
    predicted_y = [gmm_model.predict(test.x.row(x)) for x in range(n_test_rows)]

    true_y = list(test.y.data)
    cnf_mtrx = confusion_matrix(true_y, predicted_y)
    acc = accuracy_score(test.y.data, predicted_y)

    print("Accuracy score is:", acc)
    print("Confusion matrix for Q:", Q)
    print(cnf_mtrx)
    print("\n")


Accuracy score is: 0.9525222551928784
Confusion matrix for Q: 2
[[106  12]
 [  4 215]]


Accuracy score is: 0.9643916913946587
Confusion matrix for Q: 4
[[111   7]
 [  5 214]]


Accuracy score is: 0.9554896142433235
Confusion matrix for Q: 8
[[109   9]
 [  6 213]]


Accuracy score is: 0.9169139465875371
Confusion matrix for Q: 16
[[ 92  26]
 [  2 217]]


