# Wheat Seeds 2
## Load packages

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer

# Data Preprocessing
## Download the Wheat Seed dataset from the given link

In [25]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt
path = '/content/seeds_dataset.txt'

--2023-02-20 22:37:19--  https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9300 (9.1K) [application/x-httpd-php]
Saving to: ‘seeds_dataset.txt.1’


2023-02-20 22:37:19 (126 MB/s) - ‘seeds_dataset.txt.1’ saved [9300/9300]



## Read Wheat Seed Data with all features and target values

In [26]:
features = ['Area',
            'Perimeter',
            'Compactness',
            'Length of kernel',
            'Width of kernel',
            'Asymmetry coefficient',
            'Length of kernel groove.']

df = pd.read_csv(path, delimiter=r'[\t]+',
                 names=features + ['target'])
display(df)

  return func(*args, **kwargs)


Unnamed: 0,Area,Perimeter,Compactness,Length of kernel,Width of kernel,Asymmetry coefficient,Length of kernel groove.,target
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3


### Separate features as X and target as y

In [27]:
X = df.loc[:, features]
original_X = df.loc[:, features]
y = df.loc[:, 'target']


## Manipulate the original data
### Select random %5 of cells and replace their value with NaN

In [28]:
percentage = 0.05
X[np.random.random(X.shape) > 1 - percentage] = np.nan

display(X.head(50).fillna('_'))

Unnamed: 0,Area,Perimeter,Compactness,Length of kernel,Width of kernel,Asymmetry coefficient,Length of kernel groove.
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22
1,_,14.57,0.8811,5.554,3.333,1.018,4.956
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175
5,14.38,_,0.8951,5.386,3.312,2.462,4.956
6,14.69,14.49,0.8799,5.563,3.259,_,5.219
7,14.11,14.1,0.8911,5.42,3.302,2.7,_
8,16.63,15.46,0.8747,_,_,2.04,5.877
9,16.44,15.25,0.888,5.884,3.505,1.969,5.533


In [29]:
print()
print('─' * 150)  # U+2500, Box Drawings Light Horizontal
print('\n1. Number of NaN cells generated for each feature:\n')
display(pd.DataFrame(X.isnull().sum(0)).T)
print()
print('─' * 150)  # U+2500, Box Drawings Light Horizontal
print('\n2. Number of NaN features generated for each wine samples:\n')
display(pd.DataFrame(X.isnull().sum(1)).T)
print()
print('─' * 150)  # U+2500, Box Drawings Light Horizontal
print('\n3. Showing all the rows and features with NaN value:\n')
display(X[np.isin(X.isnull(), 1)].fillna('_').head(50))



──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

1. Number of NaN cells generated for each feature:



Unnamed: 0,Area,Perimeter,Compactness,Length of kernel,Width of kernel,Asymmetry coefficient,Length of kernel groove.
0,7,18,8,12,9,7,14



──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

2. Number of NaN features generated for each wine samples:



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
0,0,1,0,0,0,1,1,1,2,0,...,1,0,0,1,0,0,1,1,0,1



──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

3. Showing all the rows and features with NaN value:



Unnamed: 0,Area,Perimeter,Compactness,Length of kernel,Width of kernel,Asymmetry coefficient,Length of kernel groove.
1,_,14.57,0.8811,5.554,3.333,1.018,4.956
5,14.38,_,0.8951,5.386,3.312,2.462,4.956
6,14.69,14.49,0.8799,5.563,3.259,_,5.219
7,14.11,14.1,0.8911,5.42,3.302,2.7,_
8,16.63,15.46,0.8747,_,_,2.04,5.877
8,16.63,15.46,0.8747,_,_,2.04,5.877
12,13.89,_,0.888,5.439,3.199,3.986,4.738
15,14.59,14.28,0.8993,_,3.333,4.185,4.781
22,_,14.9,0.8988,5.618,3.507,0.7651,5.091
24,15.01,14.76,0.8657,5.789,3.245,1.791,_


## Comparison and Impute
### Compare the original dataframe with the manupulated data

In [30]:
display(X.compare(original_X).fillna(''))

Unnamed: 0_level_0,Area,Area,Perimeter,Perimeter,Compactness,Compactness,Length of kernel,Length of kernel,Width of kernel,Width of kernel,Asymmetry coefficient,Asymmetry coefficient,Length of kernel groove.,Length of kernel groove.
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other
1,,14.88,,,,,,,,,,,,
5,,,,14.21,,,,,,,,,,
6,,,,,,,,,,,,3.586,,
7,,,,,,,,,,,,,,5.0
8,,,,,,,,6.053,,3.465,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,,,,,,0.8609,,,,,,,,
203,,,,,,,,5.183,,,,,,
206,,,,12.88,,,,,,,,,,
207,,13.2,,,,,,,,,,,,


### Use median imputer to estimate the values of the missing values in the manupulated data frame

In [31]:
imr = SimpleImputer(missing_values=np.nan, strategy='median')
imr = imr.fit(X)
imputed_data = pd.DataFrame(imr.transform(X), columns=features)
imputed_data.head(50)

Unnamed: 0,Area,Perimeter,Compactness,Length of kernel,Width of kernel,Asymmetry coefficient,Length of kernel groove.
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22
1,14.34,14.57,0.8811,5.554,3.333,1.018,4.956
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175
5,14.38,14.36,0.8951,5.386,3.312,2.462,4.956
6,14.69,14.49,0.8799,5.563,3.259,3.619,5.219
7,14.11,14.1,0.8911,5.42,3.302,2.7,5.22
8,16.63,15.46,0.8747,5.5235,3.242,2.04,5.877
9,16.44,15.25,0.888,5.884,3.505,1.969,5.533


### Compare the imputed data with the manipulated dataframe

In [32]:
display(X.compare(imputed_data).fillna(''))

Unnamed: 0_level_0,Area,Area,Perimeter,Perimeter,Compactness,Compactness,Length of kernel,Length of kernel,Width of kernel,Width of kernel,Asymmetry coefficient,Asymmetry coefficient,Length of kernel groove.,Length of kernel groove.
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other
1,,14.34,,,,,,,,,,,,
5,,,,14.36,,,,,,,,,,
6,,,,,,,,,,,,3.619,,
7,,,,,,,,,,,,,,5.22
8,,,,,,,,5.5235,,3.242,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,,,,,,0.87395,,,,,,,,
203,,,,,,,,5.5235,,,,,,
206,,,,14.36,,,,,,,,,,
207,,14.34,,,,,,,,,,,,


### Compare the original data vs. the imputed data

In [None]:
display(original_X.compare(imputed_data).fillna(''))

Unnamed: 0_level_0,Area,Area,Perimeter,Perimeter,Compactness,Compactness,Length of kernel,Length of kernel,Width of kernel,Width of kernel,Asymmetry coefficient,Asymmetry coefficient,Length of kernel groove.,Length of kernel groove.
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other
3,,,13.94,14.35,,,,,,,,,,
4,16.14,14.285,,,,,,,,,,,,
5,,,,,,,,,3.312,3.245,,,,
8,,,,,,,6.053,5.51,,,,,,
14,13.74,14.285,,,,,,,,,,,,
17,,,14.75,14.35,,,,,,,,,,
18,,,,,,,,,,,,,4.649,5.22
25,16.19,14.285,,,,,,,,,,,,
27,,,13.67,14.35,,,,,,,,,,
30,13.16,14.285,,,,,,,,,,,,
