# Activity 2: K-nn classification with credit data

#### Loading required libraries

In [1]:
library(dplyr)
library(class)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



#### Reading data from CSV file

In [2]:
data <- read.csv("german_credit_data1.csv")

#### Cleaning and reformating data

In [3]:
data <- na.omit(data)
levels(data$Sex) <- c('1', '2')
data$Sex <- as.numeric(as.character(data$Sex))
levels(data$Housing) <- c('1', '2', '3')
data$Housing <- as.numeric(as.character(data$Housing))
levels(data$Saving.accounts) <- c('1', '2', '3','4')
data$Saving.accounts <- as.numeric(as.character(data$Saving.accounts))
levels(data$Checking.account) <- c('1', '2', '3','4')
data$Checking.account <- as.numeric(as.character(data$Checking.account))
levels(data$Purpose) <- c('1', '2', '3','4','5','6','7','8')
data$Purpose <- as.numeric(as.character(data$Purpose))
data <- subset(data, select=c(2:11))
data

Unnamed: 0,Age,Sex,Job,Housing,Saving.accounts,Checking.account,Credit.amount,Duration,Purpose,Credit.Risks
2,22,1,2,2,1,2,5951,48,6,2
4,45,2,2,1,1,1,7882,42,5,1
5,53,2,2,1,1,1,4870,24,2,2
8,35,2,3,3,1,2,6948,36,2,1
10,28,2,3,2,1,2,5234,30,2,2
11,25,1,2,3,1,2,1295,12,2,2
12,24,1,2,3,1,1,4308,48,1,2
13,22,1,2,2,1,2,1567,12,6,1
14,60,2,1,2,1,1,1199,24,2,2
15,28,1,2,3,1,1,1403,15,2,1


#### Scaling data

In [4]:
new_data<-data.frame((scale(data)))

<b>K-nn process for :<br>Test Set : 20% <br>
Training Set : 80%</b>

In [5]:
# number of rows in the dataset
n.points <- nrow(new_data)
sampling.rate <- 0.8

#number of points in the test set
num.test.set.labels = n.points*(1-sampling.rate)

# randomly sample which rows will go in the training set
training <- sample(1:n.points, sampling.rate * n.points, replace=FALSE)

# define the training set to be those rows
train <- subset(new_data[training, ], select = c(1:9))

# the other rows are going into the test set
testing <- setdiff(1:n.points, training)

# define the test set to be the other rows
test <- subset(new_data[testing, ], select = c(1:9))

cl <- new_data$Credit.Risks[training]

# this is the subset of labels for the training set
true.labels <- new_data$Credit.Risks[testing]

In [6]:
#Loop through and calculate mismisclassification rate for different values of k
knn_misclassification1 <- NULL

for (k in 1:20) {
 predicted.labels <- knn(train, test, cl, k)
 num.incorrect.labels <- sum(predicted.labels != true.labels)
 misclassification.rate <- num.incorrect.labels / num.test.set.labels
 knn_misclassification1 <- rbind(knn_misclassification1, c(k, misclassification.rate))
}

#### Misclassification rate for values of k 1-20

In [7]:
colnames(knn_misclassification1)<-c("K-value","Misclassification rate")
knn_misclassification1
summary(knn_misclassification1)

K-value,Misclassification rate
1,0.4022989
2,0.3544061
3,0.2681992
4,0.3448276
5,0.2969349
6,0.316092
7,0.335249
8,0.3544061
9,0.3256705
10,0.2969349


    K-value      Misclassification rate
 Min.   : 1.00   Min.   :0.2299        
 1st Qu.: 5.75   1st Qu.:0.2850        
 Median :10.50   Median :0.2969        
 Mean   :10.50   Mean   :0.3056        
 3rd Qu.:15.25   3rd Qu.:0.3352        
 Max.   :20.00   Max.   :0.4023        

<b> We can observe from this data set that the minimum misclassification rate for training data : 80% and test data: 20% is 0.2299</b>

<b>K-nn misclassification rate for :<br>Test Set : 30% <br>
Training Set : 70%</b>

In [8]:
sampling.rate <- 0.7

#number of points in the test set
num.test.set.labels = n.points*(1-sampling.rate)

# randomly sample which rows will go in the training set
training <- sample(1:n.points, sampling.rate * n.points, replace=FALSE)

# define the training set to be those rows
train <- subset(new_data[training, ], select = c(1:9))

# the other rows are going into the test set
testing <- setdiff(1:n.points, training)

# define the test set to be the other rows
test <- subset(new_data[testing, ], select = c(1:9))

cl <- new_data$Credit.Risks[training]

# this is the subset of labels for the training set
true.labels <- new_data$Credit.Risks[testing]

In [9]:
#Loop through and calculate misclassification rate for different values of k
knn_misclassification2 <- NULL

for (k in 1:20) {
 predicted.labels <- knn(train, test, cl, k)
 num.incorrect.labels <- sum(predicted.labels != true.labels)
 misclassification.rate <- num.incorrect.labels / num.test.set.labels
 knn_misclassification2 <- rbind(knn_misclassification2, c(k, misclassification.rate))
}

#### Misclassification rate for values of k 1-20

In [10]:
colnames(knn_misclassification2)<-c("K-value","Misclassification rate")
knn_misclassification2
summary(knn_misclassification2)

K-value,Misclassification rate
1,0.3831418
2,0.3959132
3,0.3384419
4,0.3384419
5,0.3512133
6,0.3767561
7,0.3512133
8,0.357599
9,0.3703704
10,0.3767561


    K-value      Misclassification rate
 Min.   : 1.00   Min.   :0.3321        
 1st Qu.: 5.75   1st Qu.:0.3512        
 Median :10.50   Median :0.3576        
 Mean   :10.50   Mean   :0.3592        
 3rd Qu.:15.25   3rd Qu.:0.3720        
 Max.   :20.00   Max.   :0.3959        

<b>We can observe from this data set that the minimum misclassification rate for training data : 70% and test data: 30% is 0.3321</b>