# Loading the Data & Libraries

In [165]:
library(class)

In [166]:
library(car)

In [178]:
 df <- read.csv(file = "german_credit_data1.csv", header = TRUE)  # first CSV row supplies the column names

# Preview the first rows of the loaded data frame.
head(df)

X.,Age,Sex,Job,Housing,Saving.accounts,Checking.account,Credit.amount,Duration,Purpose,Credit.Risks
0,67,male,2,own,,little,1169,6,radio/TV,1
1,22,female,2,own,little,moderate,5951,48,radio/TV,2
2,49,male,1,own,little,,2096,12,education,1
3,45,male,2,free,little,little,7882,42,furniture/equipment,1
4,53,male,2,free,little,little,4870,24,car,2
5,35,male,1,free,,,9055,36,education,1


# Changing the NA values to 0

In [168]:
# Convert both account columns to character, then overwrite their missing
# entries with 0 (stored as the string "0" because the columns are character).
for (account in c("Saving.accounts", "Checking.account")) {
  df[[account]] <- as.character(df[[account]])
  df[[account]][is.na(df[[account]])] <- 0
}

In [169]:
# Show how many rows the data frame holds.
nrow(df)

In [170]:
# Display the first 50 rows to check the result of the NA replacement.
head(df,50)

X.,Age,Sex,Job,Housing,Saving.accounts,Checking.account,Credit.amount,Duration,Purpose,Credit.Risks
0,67,male,2,own,0,little,1169,6,radio/TV,1
1,22,female,2,own,little,moderate,5951,48,radio/TV,2
2,49,male,1,own,little,0,2096,12,education,1
3,45,male,2,free,little,little,7882,42,furniture/equipment,1
4,53,male,2,free,little,little,4870,24,car,2
5,35,male,1,free,0,0,9055,36,education,1
6,53,male,2,own,quite rich,0,2835,24,furniture/equipment,1
7,35,male,3,rent,little,moderate,6948,36,car,1
8,61,male,1,own,rich,0,3059,12,radio/TV,1
9,28,male,3,own,little,moderate,5234,30,car,2


In [171]:
# Print per-column summaries of the data frame.
summary(df)

       X.             Age            Sex           Job        Housing   
 Min.   :  0.0   Min.   :19.00   female:310   Min.   :0.000   free:108  
 1st Qu.:249.8   1st Qu.:27.00   male  :690   1st Qu.:2.000   own :713  
 Median :499.5   Median :33.00                Median :2.000   rent:179  
 Mean   :499.5   Mean   :35.55                Mean   :1.904             
 3rd Qu.:749.2   3rd Qu.:42.00                3rd Qu.:2.000             
 Max.   :999.0   Max.   :75.00                Max.   :3.000             
                                                                        
 Saving.accounts    Checking.account   Credit.amount      Duration   
 Length:1000        Length:1000        Min.   :  250   Min.   : 4.0  
 Class :character   Class :character   1st Qu.: 1366   1st Qu.:12.0  
 Mode  :character   Mode  :character   Median : 2320   Median :18.0  
                                       Mean   : 3271   Mean   :20.9  
                                       3rd Qu.: 3972   3rd Qu.:24.

# Changing the text in Savings and Checking account to numerical data


In [172]:

# Numeric code assigned to each account label, per column. Any other
# non-missing label is coded as 0; missing labels stay NA.
account.codes <- list(
  Saving.accounts = c("little" = 1, "moderate" = 2, "quite rich" = 3, "rich" = 4),
  Checking.account = c("little" = 1, "moderate" = 2, "rich" = 3)
)

for (account in names(account.codes)) {
  raw.labels <- as.character(df[[account]])
  # Look each label up by name; labels absent from the table come back as NA.
  coded <- unname(account.codes[[account]][raw.labels])
  coded[is.na(coded) & !is.na(raw.labels)] <- 0
  df[[account]] <- coded
}



# Normalizing the data for accurate distance calculation

In [173]:

# Min-max rescale each of the three distance features: subtract the column
# minimum and divide by the column range.
for (feature in c("Age", "Saving.accounts", "Checking.account")) {
  values <- df[[feature]]
  lowest <- min(values)
  df[[feature]] <- (values - lowest) / (max(values) - lowest)
}

# Credit.Risks is left on its original scale; its rescaling stays disabled:
#df$Credit.Risks<-((df$Credit.Risks - min(df$Credit.Risks)) / (max(df$Credit.Risks) - min(df$Credit.Risks)))

# Dataframe with normalized Values

In [174]:
# Display the data frame after normalization.
df

X.,Age,Sex,Job,Housing,Saving.accounts,Checking.account,Credit.amount,Duration,Purpose,Credit.Risks
0,0.85714286,male,2,own,0.00,0.3333333,1169,6,radio/TV,1
1,0.05357143,female,2,own,0.25,0.6666667,5951,48,radio/TV,2
2,0.53571429,male,1,own,0.25,0.0000000,2096,12,education,1
3,0.46428571,male,2,free,0.25,0.3333333,7882,42,furniture/equipment,1
4,0.60714286,male,2,free,0.25,0.3333333,4870,24,car,2
5,0.28571429,male,1,free,0.00,0.0000000,9055,36,education,1
6,0.60714286,male,2,own,0.75,0.0000000,2835,24,furniture/equipment,1
7,0.28571429,male,3,rent,0.25,0.6666667,6948,36,car,1
8,0.75000000,male,1,own,1.00,0.0000000,3059,12,radio/TV,1
9,0.16071429,male,3,own,0.25,0.6666667,5234,30,car,2


# KNN Function

### Passing: Age, Savings Account , Checking Account 
### Target: Credit Risk

In [175]:
#' Misclassification rate of k-nearest-neighbour for a range of k.
#'
#' The rows of `df` are split at random into a training set (a
#' `sampling.rate` fraction of the rows, rounded down) and a test set (all
#' remaining rows). For every k from 1 to `max.k`, `class::knn()` predicts
#' `Credit.Risks` for the test rows from the columns `Age`,
#' `Saving.accounts` and `Checking.account`, and the share of wrong
#' predictions is recorded.
#'
#' The split is drawn with `sample()`; call `set.seed()` beforehand to get
#' reproducible results.
#'
#' @param df Data frame containing the columns `Age`, `Saving.accounts`,
#'   `Checking.account` (features) and `Credit.Risks` (class label).
#' @param sampling.rate Fraction of the rows used for training. It must leave
#'   at least one training row and at least one test row.
#' @param max.k Largest number of neighbours to evaluate (default 20).
#' @return Data frame with one row per k and the columns `k` and
#'   `misclassification.rate`.
knncalc <- function(df, sampling.rate, max.k = 20) {
  stopifnot(
    is.numeric(sampling.rate),
    length(sampling.rate) == 1,
    !is.na(sampling.rate)
  )

  feature.cols <- c("Age", "Saving.accounts", "Checking.account")
  n.points <- nrow(df)

  # sample() truncates a fractional size, so compute the real training size
  # up front instead of relying on that implicit behaviour.
  n.train <- floor(sampling.rate * n.points)
  if (n.train < 1 || n.train >= n.points) {
    stop(
      "sampling.rate must leave at least one training row and one test row",
      call. = FALSE
    )
  }

  # Randomly pick the training rows; every other row goes to the test set.
  training <- sample(seq_len(n.points), n.train, replace = FALSE)
  testing <- setdiff(seq_len(n.points), training)

  train <- df[training, feature.cols, drop = FALSE]
  test <- df[testing, feature.cols, drop = FALSE]
  cl <- df$Credit.Risks[training]        # labels of the training rows
  true.labels <- df$Credit.Risks[testing] # labels the predictions are scored against

  k.values <- seq_len(max.k)
  rates <- vapply(
    k.values,
    function(k) {
      predicted.labels <- class::knn(train, test, cl, k)
      # Divide by the actual number of test rows (via mean), not by
      # n.points * (1 - sampling.rate), which is wrong whenever the training
      # size had to be rounded down.
      mean(predicted.labels != true.labels)
    },
    numeric(1)
  )

  data.frame(k = as.numeric(k.values), misclassification.rate = rates)
}

# Calling the knn Function with sampling rate as the parameter

In [176]:
# Evaluate k-NN with 90%, 70% and 60% of the rows used for training.
df0.9 <- knncalc(df, sampling.rate = 0.9)
df0.7 <- knncalc(df, sampling.rate = 0.7)
df0.6 <- knncalc(df, sampling.rate = 0.6)

# Analysis of KNN for k varying from 1 to 20

In [177]:
# Join the three result tables on k, in the order 60%, 70%, 90% training share.
final <- Reduce(
  function(left, right) merge(left, right, by = "k"),
  list(df0.6, df0.7, df0.9)
)

# Label each rate column with the training share it belongs to.
colnames(final) <- c(
  "k",
  "misclassification.rate train=60",
  "misclassification.rate train=70",
  "misclassification.rate train=90"
)

final

k,misclassification.rate train=60,misclassification.rate train=70,misclassification.rate train=90
1,0.3275,0.33,0.35
2,0.3325,0.3266667,0.31
3,0.3425,0.3166667,0.28
4,0.3275,0.3466667,0.28
5,0.3325,0.3133333,0.26
6,0.325,0.32,0.3
7,0.3,0.3333333,0.31
8,0.2975,0.3233333,0.29
9,0.3025,0.3166667,0.29
10,0.295,0.3233333,0.35


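# Minimum Misclassification Rate=0.2875 at k=3,4 when trainset is 60%

# Minimum Misclassification Rate=0.28 at k=5 when trainset is 70%

# Minimum Misclassification Rate=0.22 at k=18 when trainset is 90%

# Note: these minima do not match the table printed above (e.g. k=3 at 60% is 0.3425 there). knncalc() draws a random train/test split without a fixed seed, so the rates change from run to run.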

