## 1. Read the ARFF file and separate the features from the labels

In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

# Load the ARFF file; element 0 of the returned value holds the records,
# which are wrapped in a DataFrame for column-wise access.
ARFF_PATH = 'Feature_Selection_using_Weka/data_with_label.arff'
data = arff.loadarff(ARFF_PATH)
df = pd.DataFrame(data[0])

# Boolean mask over the columns: True only for the 'population' label column.
is_label_column = df.columns == 'population'

# X: every column except the label.
X = df.loc[:, ~is_label_column]
# y: the label column alone (kept as a one-column DataFrame).
y = df.loc[:, is_label_column]

## 2. Convert the labels from categorical data into numbers and change their type from a dataframe to a numpy array

In [2]:
# k-means reports cluster membership as integers starting from 0, so the categorical
# labels are converted to integers as well: 'pop1' -> 0 and 'pop2' -> 1.
# NOTE: k-means numbers its clusters arbitrarily; nothing forces cluster 0 to be
# 'pop1'. This encoding is only the convention the later comparison relies on.
# The pandas get_dummies method could also be used to turn categories into numbers.
#
# The labels are matched as bytes (b'pop1', b'pop2'), as they come out of the ARFF load.
label_codes = {b'pop1': 0, b'pop2': 1}

# Work on a deep copy so the frame the labels were sliced from is left untouched.
y = y.copy(deep=True)

# Assign the mapped column back instead of calling replace(..., inplace=True) on
# y['population']: that form mutates a temporary Series (chained assignment), which
# warns on recent pandas and leaves y unchanged when copy-on-write is active.
# A label outside label_codes becomes NaN here and makes the int cast below fail
# loudly rather than slipping through as raw bytes.
y['population'] = y['population'].map(label_codes)

# pycm needs a list or a numpy array, so convert the DataFrame into a numpy array.
# dtype=int guarantees a numeric array (not object dtype); the (n, 1) shape is kept.
y = np.array(y, dtype=int)

## 3. K-Means: Cluster the data using the KMeans algorithm and obtain the accuracy using the pycm package

In [3]:
from pycm import *
from sklearn.cluster import KMeans
from sklearn import metrics
# Cluster the feature matrix into two groups; random_state pins the initialisation
# so that a re-run gives the same assignment.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)

# Prepare the predictions for pycm. astype returns a NEW array, so the result has
# to be assigned -- a bare `y_pred.astype(int)` statement has no effect.
y_pred = kmeans.labels_.astype(int)

# Flatten the labels so pycm and sklearn both receive a 1-D vector
# (a no-op if y is already one-dimensional).
y_true = np.ravel(y)

cm = ConfusionMatrix(actual_vector=y_true, predict_vector=y_pred)

# NOTE(review): cluster ids are arbitrary, so this accuracy is only meaningful when
# cluster 0/1 happen to coincide with the 0/1 label encoding; a swapped numbering
# would report 1 - accuracy for the very same grouping.
Acc = metrics.accuracy_score(y_true, y_pred)
Acc

1.0

## 4. Count the number of correctly predicted instances

In [4]:
def count_correct(y_actual, y_predict):
    """Return the number of positions at which both sequences hold equal values.

    Positions 0 .. len(y_actual) - 1 are compared by index, so y_predict must
    be indexable over that whole range.
    """
    return sum(
        1
        for idx in range(len(y_actual))
        if y_actual[idx] == y_predict[idx]
    )


# Tally how many entries of y and y_pred agree, position by position.
count_correct(y, y_pred)

32