In [None]:
# Handling all the imports
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

Data Set Information:

The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for the experiment. High-quality visualization of the internal kernel structure was obtained using a soft X-ray technique. This technique is non-destructive and considerably cheaper than other more sophisticated imaging techniques such as scanning microscopy or laser technology. The images were recorded on 13x18 cm X-ray KODAK plates. Studies were conducted using combine-harvested wheat grain originating from experimental fields, explored at the Institute of Agrophysics of the Polish Academy of Sciences in Lublin.

The data set can be used for the tasks of classification and cluster analysis.


Attribute Information:

To construct the data, seven geometric parameters of wheat kernels were measured:
1. area A,
2. perimeter P,
3. compactness $C = 4\pi A / P^2$,
4. length of kernel,
5. width of kernel,
6. asymmetry coefficient
7. length of kernel groove.
All of these parameters are real-valued and continuous.

In [None]:
# Column names for the data set: seven kernel measurements followed by the class label
cols = ["area", "perimeter", "compactness", "length", "width", "asymmetry", "groove", "class"]
# Read the data into a DataFrame, labelling the columns with `cols`.
# The separator is the regex \s+ (one or more whitespace characters), which
# splits each line into columns. It is written as a raw string because "\s"
# is not a valid escape sequence in a normal string literal and triggers a
# warning on recent Python versions.
df = pd.read_csv("seeds_dataset.txt", names=cols, sep=r"\s+")

In [None]:
# Preview the first few rows of the loaded frame
df.head()

In [None]:
# Plot every feature against every other feature (each unordered pair once).
# The class column is not used as an axis: we pretend it is unknown and will
# try to recover it later; here it only colours the points.
feature_names = cols[:-1]
for first_idx, x_label in enumerate(feature_names):
    for y_label in feature_names[first_idx + 1:]:
        # hue draws the three classes in three different colours
        sns.scatterplot(x=x_label, y=y_label, data=df, hue='class')
        plt.show()

# Clustering

In [None]:
# Using the sklearn module to import KMeans
from sklearn.cluster import KMeans

In [None]:
# Names of the two features used for the first clustering experiment
x, y = "compactness", "asymmetry"
# Extract just those two columns as a 2-D NumPy array (one row per sample)
X = df[[x, y]].to_numpy()

In [None]:
# Fit K-Means with three clusters, one per wheat variety in the data set.
# random_state pins the centroid initialisation so that the cluster
# assignments (and every plot derived from them) are reproducible on re-run.
kmeans = KMeans(n_clusters=3, n_init="auto", random_state=42)
kmeans.fit(X)

In [None]:
# Cluster index that the fitted model assigned to each sample
clusters = kmeans.labels_

In [None]:
# Display the assigned cluster indices
clusters

In [None]:
# These are the actual class labels that come with the dataset.
# They can be compared with the cluster assignments; note that K-Means
# cluster indices are arbitrary, so the numbers themselves need not match.
df["class"].values

In [None]:
# Combine the two selected features with the assigned cluster index
# (stored under "class") into a single frame for plotting
cluster_df = pd.DataFrame(data=np.column_stack((X, clusters)), columns=[x, y, "class"])

In [None]:
# K-Means clusters, as predicted from the two selected features
ax = sns.scatterplot(x=x, y=y, hue='class', data=cluster_df)
# Title the figure so it can be told apart from the plot of the original classes
ax.set_title("K-Means clusters")
plt.show()

In [None]:
# The same two features, coloured by the original class labels from the dataset
ax = sns.scatterplot(x=x, y=y, hue='class', data=df)
# Title the figure so it can be told apart from the K-Means plot
ax.set_title("Original classes")
plt.show()

# Higher Dimensions

In [None]:
# Cluster on all feature columns (everything except the class label)
# instead of only the two features used earlier
X = df[cols[:-1]].values
# random_state makes the cluster assignments reproducible across re-runs
kmeans = KMeans(n_clusters=3, n_init="auto", random_state=42)
kmeans.fit(X)
# Features plus the assigned cluster index, reusing the original column names
# so that the cluster index sits in the "class" column
cluster_df = pd.DataFrame(np.hstack((X, kmeans.labels_.reshape(-1,1))), columns = df.columns)

In [None]:
# K-Means clusters, as predicted (shown on the x/y feature pair)
ax = sns.scatterplot(x=x, y=y, hue='class', data=cluster_df)
# Title the figure so it can be told apart from the plot of the original classes
ax.set_title("K-Means clusters")
plt.show()

In [None]:
# Original classes from the dataset (this plots `df`, not the K-Means output),
# shown on the same feature pair for comparison
ax = sns.scatterplot(x=x, y=y, hue='class', data=df)
ax.set_title("Original classes")
plt.show()

# PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# n_components is the number of dimensions the data is mapped into
pca = PCA(n_components=2)
transformed_x = pca.fit_transform(X)

In [None]:
X.shape
# As can be seen, there are 210 samples, each with 7 features

In [None]:
# After PCA each sample is described by n_components=2 values
transformed_x.shape

In [None]:
# Take a look at the first five transformed samples
transformed_x[:5]

In [None]:
# Scatter of the samples in the 2-D PCA space (no class information used)
plt.scatter(transformed_x[:,0], transformed_x[:,1])
# Label the axes and title the figure so it stands alone when skimmed
plt.xlabel("pca1")
plt.ylabel("pca2")
plt.title("Data projected onto two principal components")
plt.show()

In [None]:
# PCA coordinates paired with the K-Means cluster index of each sample
kmeans_pca_df = pd.DataFrame(
    data=np.column_stack((transformed_x, kmeans.labels_)),
    columns=["pca1", "pca2", "class"],
)

In [None]:
# PCA coordinates paired with the original class label of each sample
truth_pca_df = pd.DataFrame(
    data=np.column_stack((transformed_x, df["class"].values)),
    columns=["pca1", "pca2", "class"],
)

In [None]:
# K-Means clusters drawn in the 2-D PCA space
ax = sns.scatterplot(x="pca1", y="pca2", hue='class', data=kmeans_pca_df)
# Title the figure so it can be told apart from the plot of the true classes
ax.set_title("K-Means clusters (PCA projection)")
plt.show()

In [None]:
# Original classes drawn in the 2-D PCA space
ax = sns.scatterplot(x="pca1", y="pca2", hue='class', data=truth_pca_df)
# Title the figure so it can be told apart from the K-Means plot
ax.set_title("Original classes (PCA projection)")
plt.show()