## Program 4

Train a regularized logistic regression classifier on the iris dataset (https://archive.ics.uci.edu/ml/machine-learning-databases/iris/ or the inbuilt iris dataset) using sklearn. Train the model with the hyperparameter C = 1e4 and report the best classification accuracy.


In [None]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,confusion_matrix

"Imports complete."
# Program 4: L2-regularized logistic regression on the built-in iris data.
iris = load_iris()
y = iris.target
X = iris.data
class_names = iris.target_names

# Hold out 20% of the samples for evaluation; the seed keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

"Training and test splits have been made"
# Fit the scaler on the training split only, then reuse the same statistics
# for the test split so no test information leaks into training.
scaler = StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)

"Data has been rescaled"
C = 1e4
model = LogisticRegression(C=C, penalty='l2', max_iter=1000, solver='liblinear')
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)
"Model created."
# sklearn metrics take (y_true, y_pred); keep that order for consistency with
# the confusion matrix below.
print("Accuracy based on test set:", accuracy_score(ytest, ypred))
cm = confusion_matrix(ytest, ypred)
plt.figure(figsize=(15, 10))
# confusion_matrix rows are the actual classes and columns the predicted ones.
sns.heatmap(cm, annot=True, cmap='Greys', xticklabels=class_names, yticklabels=class_names, fmt='d')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

# The model was trained on standardized features, so a raw measurement must go
# through the same fitted scaler before it is classified.
sample = np.array([[5, 3, 1, 0]])
ysample = model.predict(scaler.transform(sample))
predclass = class_names[ysample[0]]
print("Predicted Class:", predclass)

# Export the fitted parameters. The coefficients apply to standardized inputs,
# so the scaler statistics are stored alongside them to make the file usable
# on its own.
model_dict = {
    "coef": model.coef_.tolist(),
    "intercept": model.intercept_.tolist(),
    "classes": iris.target_names.tolist(),
    "attributes": iris.feature_names,
    "scaler_mean": scaler.mean_.tolist(),
    "scaler_scale": scaler.scale_.tolist(),
}

with open("model.json", 'w') as jsonlover:
    json.dump(model_dict, jsonlover)



## Program 5

Train an SVM classifier on the iris dataset using sklearn. Try different kernels and the associated hyperparameters. Train the model with the following set of hyperparameters: RBF kernel, gamma = 0.5, one-vs-rest classifier, no feature normalization. Also try C = 0.01, 1, 10. For the above set of hyperparameters, find the best classification accuracy on the test data along with the total number of support vectors.


In [None]:
from sklearn.svm import SVC
import pickle

# Program 5: RBF-kernel SVM on the raw (un-normalized) iris features,
# sweeping the regularization strength C.
iris = load_iris()
X = iris.data
y = iris.target
class_names = iris.target_names
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

gammas = [0.5]
Cc = [0.01, 1, 10]
kernels = ['rbf']

best_accuracy = 0
best_sup_vectors = 0
best_model = None
colors = ['Blues', 'Reds', 'Greys']

for kernel in kernels:
    for gamma in gammas:
        for colour_no, C in enumerate(Cc):
            colour = colors[colour_no]
            # 'ovr' is SVC's default decision function shape; it is spelled out
            # because the exercise asks for a one-vs-rest classifier.
            model = SVC(kernel=kernel, gamma=gamma, C=C, decision_function_shape='ovr')
            model.fit(Xtrain, ytrain)
            ypred = model.predict(Xtest)
            print("Model Info: \nKernel, Gamma, C")
            print(kernel, gamma, C, sep='\t\t')

            accuracy = accuracy_score(ytest, ypred)

            print("Accuracy:", accuracy)
            print('-----------------')

            # Remember the best configuration seen so far (the first one wins ties).
            if best_model is None or accuracy > best_accuracy:
                best_accuracy = accuracy
                best_sup_vectors = model.n_support_.sum()
                best_model = model

            # confusion_matrix(y_true, y_pred): rows are the actual classes and
            # columns the predicted ones, so x is "predicted" and y is "actual".
            cm = confusion_matrix(ytest, ypred)
            plt.figure(figsize=(4, 4))
            sns.heatmap(cm, annot=True, cmap=colour, fmt='d',
                        xticklabels=class_names, yticklabels=class_names)
            plt.xlabel('Predicted Class')
            plt.ylabel('Actual Class')
            plt.show()

# Values are printed in the same order as the label names them.
print("Best Accuracy AND Number of Support Vectors:", best_accuracy, "and", best_sup_vectors)

# Persist the best model of the sweep, not merely the last one trained.
with open('model.pkl', 'wb') as pkl:
    pickle.dump(best_model, pkl)

# Round-trip check. Only unpickle files you created yourself: pickle can
# execute arbitrary code when loading untrusted data.
with open('model.pkl', 'rb') as pkl:
    loaded_model = pickle.load(pkl)




## Program 6

Consider the following dataset. Write a program to demonstrate the working of the decision tree based ID3 algorithm.
Headers = price, maintenance, capacity, airbag, profitable
                        [['low', 'low', 1, 'no', 'no'],
                                    ['low', 'low', 2, 'no', 'no'],
                                    ['med', 'low', 2, 'yes', 'yes'],
                                    ['high', 'med', 2, 'yes', 'yes'],
                                    ['high', 'high', 3, 'yes', 'yes'],
                                    ['high', 'high', 3, 'no', 'no'],
                                    ['med', 'high', 3, 'yes', 'no']]



In [None]:
from collections import Counter
import math
def build(dt, cl, attributes):
    """Build an ID3 decision tree.

    Args:
        dt: List of samples. Column ``i`` of every sample holds the value of
            ``attributes[i]``; any trailing columns (such as the class label)
            are ignored.
        cl: Class label of each sample, in the same order as ``dt``.
        attributes: Names of the attributes still available for splitting.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{attribute_name: {attribute_value: subtree_or_label, ...}}``.

    Raises:
        ValueError: If ``cl`` is empty.
    """
    if not cl:
        raise ValueError("cannot build a decision tree from an empty dataset")

    # Pure node: every sample carries the same label.
    if len(set(cl)) == 1:
        return cl[0]

    # Nothing left to split on: fall back to the most frequent label.
    if not attributes:
        return majority(cl)

    best_attr = get_best_attribute(dt, cl, attributes)
    column = attributes.index(best_attr)
    remaining_attributes = attributes[:column] + attributes[column + 1:]

    # Partition the samples by their value of the chosen attribute. The chosen
    # column is dropped from each sample so that column positions keep lining
    # up with `remaining_attributes` in the recursive calls.
    partitions = {}
    for sample, label in zip(dt, cl):
        rows, labels = partitions.setdefault(sample[column], ([], []))
        rows.append(sample[:column] + sample[column + 1:])
        labels.append(label)

    return {
        best_attr: {
            value: build(rows, labels, remaining_attributes)
            for value, (rows, labels) in partitions.items()
        }
    }


def majority(cl):
    """Return the most frequent label in ``cl``.

    Ties go to the label that appears first in ``cl``.

    Raises:
        ValueError: If ``cl`` is empty.
    """
    if not cl:
        raise ValueError("cannot take the majority of an empty label list")
    return Counter(cl).most_common(1)[0][0]


def entropy(cl):
    """Return the Shannon entropy (in bits) of the label list ``cl``.

    An empty list has entropy 0.
    """
    total = len(cl)
    if total == 0:
        return 0.0
    return -sum(
        (count / total) * math.log2(count / total)
        for count in Counter(cl).values()
    )


def gain(dt, cl, attribute):
    """Return the information gain of splitting on column index ``attribute``."""
    groups = {}
    for sample, label in zip(dt, cl):
        groups.setdefault(sample[attribute], []).append(label)

    total = len(cl)
    # Expected entropy after the split, weighted by partition size.
    remainder = sum(
        (len(labels) / total) * entropy(labels) for labels in groups.values()
    )
    return entropy(cl) - remainder


def get_best_attribute(dt, cl, attributes):
    """Return the name of the attribute with the highest information gain.

    On ties the attribute that comes first in ``attributes`` is chosen.
    """
    gains = [gain(dt, cl, attribute) for attribute in range(len(attributes))]
    best_index = gains.index(max(gains))
    return attributes[best_index]


def predict(test, tree, attributes):
    """Classify one sample by walking ``tree``.

    Args:
        test: Attribute values of the sample, ordered like ``attributes``.
        tree: Tree returned by ``build``.
        attributes: The full attribute-name list that was passed to ``build``.

    Returns:
        The predicted label, or ``None`` when the sample has an attribute
        value for which the tree has no branch.
    """
    node = tree
    while isinstance(node, dict):
        attribute, branches = next(iter(node.items()))
        value = test[attributes.index(attribute)]
        if value not in branches:
            return None
        node = branches[value]
    return node


dt = [['low', 'low', 1, 'no', 'no'],['low', 'low', 2, 'no', 'no'],['med', 'low', 2, 'yes', 'yes'],['high', 'med', 2, 'yes', 'yes'],['high', 'high', 3, 'yes', 'yes'],['high', 'high', 3, 'no', 'no'],['med', 'high', 3, 'yes', 'no']]
cl = [row[-1] for row in dt]
attributes = ['Price', 'maintenance', 'capacity', 'airbag']

tree = build(dt, cl, attributes)

test = ['low', 'low', 1, 'no']
predval = predict(test, tree, attributes)

print("Prediction for given sample:",predval,"\nGiven Sample:",test)



## Program 7
Consider the dataset spiral.txt (https://bit.ly/2Lm75Ly). The first two columns in the dataset correspond to the coordinates of each data point. The third column corresponds to the actual cluster label. Compute the Rand index for the following methods: 1. K-means clustering 2. Single-link hierarchical clustering 3. Complete-link hierarchical clustering. Also visualize the dataset and determine which algorithm is able to recover the true clusters.


In [None]:
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import adjusted_rand_score, rand_score

# Program 7: compare three clustering methods on the spiral data set.
# The first row of the file is skipped; the remaining rows are tab-separated
# (x, y, true cluster label).
df = pd.read_csv('Spiral.txt', skiprows=1, header=None, delimiter='\t')
df.columns = ['f1', 'f2', 'labels']
X = df[['f1', 'f2']].astype(float).values
y = df['labels'].values

# Clustering is unsupervised, so only X is passed to fit_predict; the true
# labels are used solely for scoring. The exercise asks for the Rand index
# (RI), so rand_score backs the "RI" figures shown in the plot titles.
km = KMeans(random_state=42, n_clusters=3)
ypredkm = km.fit_predict(X)
kmscore = rand_score(y, ypredkm)

sg = AgglomerativeClustering(n_clusters=3, linkage='single')
ypredsg = sg.fit_predict(X)
sgscore = rand_score(y, ypredsg)

cm = AgglomerativeClustering(n_clusters=3, linkage='complete')
ypredcm = cm.fit_predict(X)
cmscore = rand_score(y, ypredcm)

# Report the chance-corrected adjusted Rand index (ARI) next to the plain RI.
for method, assignment, ri in (
    ('K-means', ypredkm, kmscore),
    ('Single link', ypredsg, sgscore),
    ('Complete link', ypredcm, cmscore),
):
    print(f'{method}: RI={ri:.3f}, ARI={adjusted_rand_score(y, assignment):.3f}')

fig, ax = plt.subplots(1, 3, figsize=(15, 7))
ax[0].scatter(X[:, 0], X[:, 1], c=ypredkm)
ax[0].set_title(f'Kmeans Clustering RI Score:{kmscore:.3f}')

ax[1].scatter(X[:, 0], X[:, 1], c=ypredsg)
ax[1].set_title(f'SL Clustering RI Score:{sgscore:.3f}')

ax[2].scatter(X[:, 0], X[:, 1], c=ypredcm)
ax[2].set_title(f'CL Clustering RI Score:{cmscore:.3f}')


plt.tight_layout()
plt.show()


## Program 8

Implement a k-Nearest Neighbor algorithm to classify the iris dataset. Print out both correct and wrong predictions.


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Program 8: k-nearest-neighbour classification of the iris data, listing
# every test prediction together with whether it was right.
iris = load_iris()
x, y = iris.data, iris.target
labels = iris.target_names

k = 3
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, random_state=42, test_size=0.2
)

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(xtrain, ytrain)
ypred = knn.predict(xtest)

# Walk predictions and ground truth in lockstep and report each outcome.
for predicted_index, actual_index in zip(ypred, ytest):
    pred = labels[predicted_index]
    actual = labels[actual_index]
    if pred == actual:
        result = "Correct"
    else:
        result = "FALSE"
    print(f'The prediction is {pred} and the actual value is {actual} \n{result}')
    print('------------------------------------- \n')

print("Accuracy is:", accuracy_score(ytest, ypred))