In [1]:
#check correct import
def test_util():
    """Return a fixed confirmation string, handy to verify the import worked."""
    confirmation = "Import complete"
    return confirmation

In [2]:
#description of sklearn datasets
def descRaw(ds):
    """Print a raw dump of a dataset mapping.

    Prints, in order: the ``'DESCR'`` entry, every key/value pair of ``ds``,
    and finally the ``.shape`` of the ``'data'`` and ``'target'`` entries.

    Args:
        ds: mapping exposing ``items()`` and the keys ``'DESCR'``, ``'data'``
            and ``'target'`` (the last two must have a ``shape`` attribute).
    """
    print(ds['DESCR'])

    # dump every entry of the mapping, one after the other
    for entry in ds.items():
        print(entry[0], '\n', entry[1], '\n')

    # summary of the array dimensions
    data_shape = ds['data'].shape
    target_shape = ds['target'].shape
    print('data.shape\t', data_shape, '\ntarget.shape \t', target_shape)

In [3]:
#convert sklearn dataset into pandas dataframe
def pandasConverter(ds):
    """Build a pandas DataFrame out of a dataset mapping.

    The frame has one column per entry of ``ds['feature_names']`` (filled from
    ``ds['data']``), a ``'target'`` column copied from ``ds['target']`` and a
    ``'class'`` column obtained by indexing ``ds['target_names']`` with each
    target value.

    Args:
        ds: mapping with the keys ``'data'``, ``'feature_names'``,
            ``'target'`` and ``'target_names'``.

    Returns:
        The assembled DataFrame.
    """
    def _class_name(code):
        # translate a numeric target into its label
        return ds['target_names'][code]

    frame = pd.DataFrame(data=ds['data'], columns=ds['feature_names'])
    frame['target'] = ds['target']
    frame['class'] = frame['target'].map(_class_name)
    return frame

In [4]:
#print correlation matrix of the relative dataframe
def corrMatrix(df,tit):
    """Draw the correlation matrix of ``df`` as an annotated heatmap.

    Args:
        df: DataFrame whose columns are correlated with ``DataFrame.corr()``.
        tit: dataset tag; ``"iris"`` selects the iris title, any other value
            selects the wine title.
    """
    correlations = df[df.columns].corr()
    f, ax = plt.subplots(figsize=(15, 10))

    # only two titles exist: everything that is not "iris" is labelled as wine
    heading = 'IRIS CORRELATION MATRIX' if tit == "iris" else 'WINE CORRELATION MATRIX'
    ax.set_title(heading)

    ax = sns.heatmap(correlations, cmap="YlGnBu", annot=True)

In [5]:
#get a new dataframe with only the 2 kbest features
def KbestRaw(ds, df, k=2):
    """Return a new dataframe holding only the k best features plus the target.

    Features are ranked with ``SelectKBest(chi2, k=k)`` fitted on ``ds.data``
    and ``ds.target``. The score table and the names of the selected features
    are printed.

    Args:
        ds: dataset object exposing ``data``, ``target`` and ``feature_names``
            attributes and supporting ``ds['target']``.
        df: DataFrame containing (at least) one column per feature name.
        k: number of features to keep (defaults to 2, the original behaviour).

    Returns:
        An independent DataFrame with the selected feature columns followed by
        a ``'target'`` column. ``df`` itself is not modified.
    """
    # Create features and target
    X = ds.data
    y = ds.target
    # Select the k features with highest chi-squared statistics
    chi2_selector = SelectKBest(chi2, k=k)
    chi2_selector.fit(X, y)

    # print chi2 scores
    chi2_scores = pd.DataFrame(
        list(zip(ds.feature_names, chi2_selector.scores_, chi2_selector.pvalues_)),
        columns=['ftr', 'score', 'pval'],
    )
    print(chi2_scores)

    # print kbest features names
    kbest = np.asarray(ds.feature_names)[chi2_selector.get_support()]
    print("kbest: ", kbest)

    # Explicit copy: adding a column to a bare column selection is chained
    # assignment and triggers pandas' SettingWithCopyWarning.
    new_df = df[kbest].copy()
    new_df['target'] = ds['target']
    return new_df

In [6]:
#definition of MDC class
class MDC():
    """Minimum Distance Classifier.

    ``fit`` stores one centroid (feature-wise mean) per class; ``predict``
    assigns each sample to the class of the closest centroid according to the
    requested ``scipy.spatial.distance.cdist`` metric.
    """

    def __init__(self):
        # The original class only defined `init`, which Python never calls on
        # construction, so new instances had no attributes at all.
        self.init()

    def init(self):
        """Reset the model to its unfitted state.

        Kept under its original name for callers that invoked it by hand.
        """
        self.class_list = {}
        self.centroids = {}

    def fit(self, X, y):
        """Compute the centroid of every class found in ``y``.

        Args:
            X: 2-D array of samples (one row per sample).
            y: array of class labels, one per row of ``X``.
        """
        self.class_list = np.unique(y, axis=0)
        # each row is a centroid
        self.centroids = np.zeros((len(self.class_list), X.shape[1]))
        for i, label in enumerate(self.class_list):
            members = np.where(y == label)[0]  # row indices of this class
            self.centroids[i, :] = np.mean(X[members], axis=0)

    def predict(self, X, mtype='euclidean'):
        """Return the label of the nearest centroid for every row of ``X``.

        Args:
            X: 2-D array of samples.
            mtype: metric name forwarded to ``cdist`` (default ``'euclidean'``).

        Raises:
            ValueError: if the model has not been fitted yet.
        """
        if len(self.class_list) == 0:
            raise ValueError("MDC.predict() called before fit()")
        nearest = np.argmin(
            cdist(X, self.centroids, mtype),  # sample-to-centroid distances
            axis=1
        )
        # map centroid indices back to class labels
        return self.class_list[nearest]

In [7]:
#return accuracy score of the given dataset with the given metric
def mdcScore(datasets,nres,tsize,mtype):
    """Measure train and test accuracy of the MDC over repeated random splits.

    For every dataset the first two columns are used as features and the third
    one as target. Each repetition draws a new train/test split, min-max scales
    the features (scaler fitted on the train part only), fits an ``MDC`` and
    stores both accuracies as percentages.

    Args:
        datasets: mapping ``name -> DataFrame``.
        nres: number of repetitions per dataset.
        tsize: ``test_size`` forwarded to ``train_test_split``.
        mtype: distance metric name forwarded to ``MDC.predict``.

    Returns:
        ``{name: {'train': [...], 'test': [...]}}`` with ``nres`` accuracy
        values (in %) per list.
    """
    # one pair of pre-sized result lists per dataset
    results = {}
    for dataset_name in datasets:
        results[dataset_name] = {
            'train': [None]*nres,
            'test': [None]*nres,
        }

    for dataset_name in datasets:
        frame = datasets[dataset_name]
        features = frame.iloc[:, 0:2].values
        labels = frame.iloc[:, 2].values
        train_scores = results[dataset_name]['train']
        test_scores = results[dataset_name]['test']

        for run in range(nres):
            # fresh random split at every repetition
            X_train, X_test, y_train, y_test = train_test_split(
                features, labels, test_size=tsize
            )

            # scaling parameters are learnt on the training part only
            scaler = MinMaxScaler()
            X_tr_scaled = scaler.fit_transform(X_train)
            X_ts_scaled = scaler.transform(X_test)

            model = MDC()
            model.fit(X_tr_scaled, y_train)

            train_pred = model.predict(X_tr_scaled, mtype)
            test_pred = model.predict(X_ts_scaled, mtype)
            train_scores[run] = accuracy_score(y_train, train_pred)*100
            test_scores[run] = accuracy_score(y_test, test_pred)*100

        print(dataset_name+" "+mtype+" done")

    return results

In [8]:
#create accuracy boxplot for MDC
def accBox(datasets, results, metric):
    """Show, for every dataset, a train/test accuracy boxplot of the MDC.

    Args:
        datasets: iterable of dataset names (keys of ``results``).
        results: ``{name: {'train': [...], 'test': [...]}}`` accuracy lists,
            in the layout produced by ``mdcScore``.
        metric: distance metric name; known names get a display label, any
            other value is shown as-is.
    """
    switch = {
        "euclidean": 'Euclidean',
        "cityblock": 'Cityblock',
        "chebyshev": 'Chessboard',
    }
    # Fall back to the raw metric name: a bare .get() returns None for an
    # unlisted metric and the string concatenation below raised TypeError.
    metric_label = switch.get(metric, str(metric))

    for dataset_name in datasets:
        display(HTML('<center><h1>'+dataset_name+' '+metric_label+'</h1></center>'))

        # one box per split ("Train set" / "Test set" on the x axis)
        boxs = [
            go.Box(
                y=results[dataset_name][set_],
                x=["{}{} set".format(set_[0].upper(), set_[1:])]
                  * len(results[dataset_name][set_]),
                boxmean='sd', name=set_
            )
            for set_ in ['train', 'test']
        ]

        fig = go.Figure(data=boxs, layout=go.Layout(showlegend=True))

        fig.update_layout(
            margin=dict(l=70, r=70, t=5, b=10),
            font=dict(size=15),
            yaxis=dict(title='Accuracy (%)'),
        )

        fig.show()

        display(HTML('<hr>'))

In [9]:
#plot the plane division for MDC
def division(datasets, metric, tsize=None):
    """Plot the decision regions of the MDC for every dataset.

    For each dataset the first two columns are the features and the third one
    the target. The data is split, min-max scaled, an ``MDC`` is fitted on the
    train part, everything is projected with a 2-component PCA and a dense
    grid of points is coloured with the predicted class.

    Args:
        datasets: mapping ``name -> DataFrame``.
        metric: distance metric name forwarded to ``MDC.predict``.
        tsize: ``test_size`` forwarded to ``train_test_split``. When omitted,
            the notebook-level global ``test_size`` is used, as the original
            code did (NameError if that global is not defined).
    """
    switch = {
        "euclidean": 'MDC Euclidean',
        "cityblock": 'MDC Cityblock',
        "chebyshev": 'MDC Chessboard',
    }
    # unlisted metrics are printed by name instead of "None"
    print(switch.get(metric, 'MDC ' + str(metric)))

    if tsize is None:
        # Legacy behaviour: the function used to read this global directly.
        tsize = test_size

    for dataset_name in datasets:
        X = datasets[dataset_name].iloc[:, 0:2].values  # data
        Y = datasets[dataset_name].iloc[:, 2].values  # target

        # fitting the MDC again
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=tsize)
        scaler = MinMaxScaler()
        X_tr_scaled = scaler.fit_transform(X_train)
        X_ts_scaled = scaler.transform(X_test)

        mdc = MDC()
        mdc.fit(X_tr_scaled, y_train)

        # projecting in 2D space (train rows first, then test rows)
        X_full = np.vstack((X_tr_scaled, X_ts_scaled))
        pca = PCA(n_components=2)
        X_2d = pca.fit_transform(X_full)

        # Making graph
        fig = go.Figure()

        n = 400  # number of points in each dimension
        size = 2  # size of points in the coloring grid

        x_max, y_max = X_2d.max(0)
        x_min, y_min = X_2d.min(0)
        # NOTE(review): the *1.1 padding only widens the range when min < 0 < max;
        # presumably fine because the PCA projection is centred - confirm.
        x = np.linspace(x_min*1.1, x_max*1.1, n)
        y = np.linspace(y_min*1.1, y_max*1.1, n)
        # creating the coloring grid data points
        xv, yv = np.meshgrid(x, y)
        x = xv.flatten()
        y = yv.flatten()

        # grid points are mapped back to the scaled feature space before predicting
        grid_pred = mdc.predict(
            pca.inverse_transform(np.column_stack((x, y))), metric
        )

        # coloring grid (coloring the regions)
        fig.add_trace(go.Scatter(
            x=x.tolist(),
            y=y.tolist(),
            mode='markers',
            opacity=0.3,
            marker=dict(size=size, color=grid_pred)
        ))

        # data set projection in 2d space
        fig.add_trace(go.Scatter(
            x=X_2d[:, 0],
            y=X_2d[:, 1],
            mode='markers',
            marker=dict(
                size=7,
                # same train-then-test order used to build X_full
                color=np.concatenate([y_train, y_test]),
                line=dict(width=1.5, color='#000000')
            )
        ))

        fig.update_layout(
            margin=dict(l=20, r=2, b=0),
            font=dict(size=20),
            autosize=False,
            width=600,
            height=600,
            showlegend=False,
            title='{} [{} classes]'.format(dataset_name, len(np.unique(Y)))
        )

        fig.update_xaxes(range=[x_min*1.1, x_max*1.1])
        fig.update_yaxes(range=[y_min*1.1, y_max*1.1])

        fig.show(renderer="png")
        display(HTML('<hr>'))
        

In [10]:
#return the scaled train/test split of the dataframe for KNN
def knnPreSplit(df, ts):
    """Split a dataframe into standardised train and test arrays.

    The first two columns of ``df`` are the features, the third is the target.

    Args:
        df: input DataFrame.
        ts: ``test_size`` forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where both feature arrays have
        been transformed by a ``StandardScaler`` fitted on the train part.
    """
    # preprocessing: separate features and target
    features = df.iloc[:, 0:2].values
    labels = df.iloc[:, 2].values

    # subdivision
    raw_train, raw_test, y_train, y_test = train_test_split(features, labels, test_size=ts)

    # feature scaling, statistics taken from the training rows only
    scaler = StandardScaler()
    scaler.fit(raw_train)
    scaled_train = scaler.transform(raw_train)
    scaled_test = scaler.transform(raw_test)

    return scaled_train, scaled_test, y_train, y_test

In [11]:
#return classifier and predicted targets
def knnPred(k,X_train,y_train,X_test):
    """Fit a k-nearest-neighbours classifier and predict the test samples.

    Args:
        k: first positional argument of ``KNeighborsClassifier``.
        X_train: training features.
        y_train: training targets.
        X_test: features to predict.

    Returns:
        ``(classifier, y_pred)``: the fitted model and its predictions for
        ``X_test``.
    """
    model = KNeighborsClassifier(k)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return model, predictions

In [12]:
#print accuracy evaluation
def knnEval(y_test,y_pred):
    """Print the confusion matrix followed by the classification report.

    Args:
        y_test: true targets.
        y_pred: predicted targets.
    """
    # each report is computed and printed before the next one is built
    for build_report in (confusion_matrix, classification_report):
        print(build_report(y_test, y_pred))

In [13]:
#plot KNN plane division
def plotKNN(Xtr,Xte,ytr,yte,k,legend,case,clf=None):
    """Plot the decision regions of a fitted KNN model with the test points.

    Args:
        Xtr: training features (two columns); only used to size the plot area.
        Xte: test features (two columns), drawn as scatter points.
        ytr: training targets (unused, kept for interface compatibility).
        yte: test targets; must take values among 0, 1 and 2.
        k: number of neighbours, only used in the title.
        legend: sequence of legend labels, one per distinct value of ``yte``
            (in sorted order).
        case: ``"general"``, ``"low"`` or ``"high"``; selects the title.
        clf: fitted classifier used to colour the plane. When omitted, the
            notebook-level global ``classifier`` is used, as the original code
            did (NameError if that global is not defined).
    """
    # Legacy behaviour: the function used to read the global directly.
    model = clf if clf is not None else classifier

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    padding = 0.1
    resolution = 0.1

    colors = {0: 'green', 1: 'yellow', 2: 'black'}
    x_min, x_max = Xtr[:, 0].min(), Xtr[:, 0].max()
    y_min, y_max = Xtr[:, 1].min(), Xtr[:, 1].max()
    # enlarge the plotted area by `padding` times the data range on each side
    x_range = x_max - x_min
    y_range = y_max - y_min
    x_min -= x_range * padding
    y_min -= y_range * padding
    x_max += x_range * padding
    y_max += y_range * padding

    # Get decision boundaries from model
    xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                         np.arange(y_min, y_max, resolution))

    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plot the contour map. The colormap is passed by name because
    # plt.cm.get_cmap() has been removed from recent matplotlib releases.
    plt.contourf(xx, yy, Z, cmap='tab20')
    plt.axis('tight')

    # switch to auto set the name of the graph
    switch = {
        "general": 'General K-nearest neighbors case with k= {}'.format(k),
        "low": 'Worst K-nearest neighbors case with k= {}'.format(k),
        "high": 'Best K-nearest neighbors case with k= {}'.format(k),
    }

    # Plot your testing points as well, one scatter call per class
    for i, label in enumerate(np.unique(yte)):
        members = yte == label
        plt.scatter(Xte[members, 0], Xte[members, 1], c=colors[label],
                    alpha=0.8, label=legend[i])

    plt.legend(loc='lower right')
    plt.title(switch.get(case))

    plt.show()

In [14]:
#return array with accuracy for k from 1 to 40 in the test set
def varKtest(X_train,y_train,X_test,y_test):
    """Return the test-set accuracy of KNN for every k from 1 to 40.

    Args:
        X_train: training features.
        y_train: training targets.
        X_test: test features.
        y_test: test targets.

    Returns:
        List of 40 accuracy values; position ``i`` holds the score for
        ``k = i + 1``.
    """
    scores = []
    for n_neighbors in range(1, 41):
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        # the accuracy is read from the dict form of the classification report
        summary = classification_report(y_test, predicted, output_dict=True)
        scores.append(summary['accuracy'])
    return scores

In [15]:
#return array with accuracy for k from 1 to 40 in the train set
def varKtrain(X_train,y_train,X_test,y_test):
    """Return the train-set accuracy of KNN for every k from 1 to 40.

    Args:
        X_train: training features (also the samples being predicted).
        y_train: training targets.
        X_test: unused, kept for signature parity with ``varKtest``.
        y_test: unused, kept for signature parity with ``varKtest``.

    Returns:
        List of 40 accuracy values; position ``i`` holds the score for
        ``k = i + 1``.
    """
    scores = []
    for n_neighbors in range(1, 41):
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_train, y_train)
        # the model is scored on the very data it was fitted on
        predicted = model.predict(X_train)
        summary = classification_report(y_train, predicted, output_dict=True)
        scores.append(summary['accuracy'])
    return scores

In [16]:
#plot the accuracy value of each k from 1 to 40
def plotVarK(error,name):
    """Plot one accuracy value per k, with k starting at 1.

    Args:
        error: sequence of accuracy values (despite the parameter name);
            element ``i`` is plotted at ``k = i + 1``.
        name: text appended to the plot title.
    """
    # x axis derived from the data: the previous fixed range(1, 41) made
    # plt.plot fail for any input that did not hold exactly 40 values
    k_values = range(1, len(error) + 1)

    plt.figure(figsize=(12,6))
    plt.plot(k_values, error, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('Accuracy Rate K Value '+name)
    plt.xlabel('Kvalue')
    # NOTE(review): the label says "%" but the values produced by
    # varKtest/varKtrain are fractions - confirm the intended unit
    plt.ylabel('Accuracy(%)')

In [17]:
#return the values of a good k and a bad k
def knnLowHigh(accuracy):
    """Return a k with the lowest accuracy and a k with the highest accuracy.

    Args:
        accuracy: sequence of accuracy values; element ``i`` belongs to
            ``k = i + 1``. Any non-zero length is accepted.

    Returns:
        ``(low, high)`` as Python ints. With ties, ``low`` is the smallest k
        among the worst scores and ``high`` the largest k among the best ones.

    Raises:
        ValueError: if ``accuracy`` is empty.
    """
    scores = list(accuracy)
    if not scores:
        raise ValueError("accuracy must contain at least one value")

    # k values are derived from the input length instead of a fixed 1..40
    by_k = pd.Series(scores, index=range(1, len(scores) + 1))
    # stable sort, so that tied accuracies keep their k order and the result
    # is deterministic (the default quicksort gives no such guarantee)
    ranked = by_k.sort_values(kind="mergesort")

    low = int(ranked.index[0])
    high = int(ranked.index[-1])

    print("A K with low accuracy: ")
    print(low)
    print("A K with high accuracy: ")
    print(high)
    return low, high

In [18]:
#convert accuracy array to dataframe
def accuracyDf(accuracy):
    """Convert a sequence of accuracies into a dataframe.

    Args:
        accuracy: sequence of accuracy values; element ``i`` belongs to
            ``k = i + 1``.

    Returns:
        DataFrame with an ``'accuracy'`` column and a ``'kappa'`` column
        holding the matching k values (1, 2, ...).
    """
    a = pd.DataFrame(accuracy, columns=['accuracy'])
    # k values follow the number of rows instead of a hard-coded 1..40, which
    # raised a length-mismatch error for any other input size
    a['kappa'] = list(range(1, len(a) + 1))
    return a

In [19]:
#print two accuracy boxplot, one for the train set and one for the test set of the KNN
def BoxCompare(Test_set, Train_set, ds_arg):
    """Show the train and test KNN accuracy boxplots in a single figure.

    Args:
        Test_set: mapping/DataFrame with an ``"accuracy"`` entry (test scores).
        Train_set: mapping/DataFrame with an ``"accuracy"`` entry (train scores).
        ds_arg: ``"Iris"`` selects the iris title, any other value the wine one.
    """
    heading = "Boxplot KNN Iris" if ds_arg == "Iris" else "Boxplot KNN Wine"

    figure = go.Figure()
    # train box first, then test box
    train_box = go.Box(y=Train_set["accuracy"], name="Train set")
    figure.add_trace(train_box)
    test_box = go.Box(y=Test_set["accuracy"], name="Test set")
    figure.add_trace(test_box)
    figure.update_layout(title=heading, yaxis_title="Accuracy", xaxis_title="Test")

    figure.show()