# Dataset Analysis

In [347]:
import pandas as pd
import numpy as np

In [348]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [349]:
from sklearn.preprocessing import MinMaxScaler

# Iris Dataset

In [350]:
iris = pd.read_csv('./Iris/iris.data.txt',delimiter=',',header=None)

In [351]:
mapper = {0:'sepal_length',1:'sepal_width',2:'petal_length',3:'petal_width',4:'class'}

In [352]:
iris.rename(mapper=mapper, axis=1, inplace=True)

In [353]:
iris.shape

(150, 5)

In [354]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [355]:
iris.isnull().any()

sepal_length    False
sepal_width     False
petal_length    False
petal_width     False
class           False
dtype: bool

In [356]:
iris['class'].unique() #checking how many unique target values we have

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [357]:
#changing the target attribute into numeric form
iris['class'] = iris['class'].map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2})

In [358]:
iris.describe() #All the statistical vlaues described

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,1.0
std,0.828066,0.433594,1.76442,0.763161,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [359]:
#initialising MinMaxScaler
scaler = MinMaxScaler()

In [360]:
#Fitting the data in the MinMaxScaler algorithm
scaler.fit(iris[['sepal_length','sepal_width','petal_length','petal_width']])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [361]:
#applying the MinMaxScaler Algorithm on the Data
scaled = scaler.transform(iris[['sepal_length','sepal_width','petal_length','petal_width']])

In [362]:
#getting all the normalized values back to the DataFrame
attributes = ['sepal_length','sepal_width','petal_length','petal_width']
for att in range(len(attributes)):
    lst=[]
    for i in range(150):
        lst.append(scaled[i][att])
    iris[attributes[att]]=lst    

In [363]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,0.222222,0.625,0.067797,0.041667,0
1,0.166667,0.416667,0.067797,0.041667,0
2,0.111111,0.5,0.050847,0.041667,0
3,0.083333,0.458333,0.084746,0.041667,0
4,0.194444,0.666667,0.067797,0.041667,0


In [364]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
count,150.0,150.0,150.0,150.0,150.0
mean,0.428704,0.439167,0.467571,0.457778,1.0
std,0.230018,0.180664,0.299054,0.317984,0.819232
min,0.0,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333,0.0
50%,0.416667,0.416667,0.567797,0.5,1.0
75%,0.583333,0.541667,0.694915,0.708333,2.0
max,1.0,1.0,1.0,1.0,2.0


In [365]:
#Function for removing Outliars by using Quartiles
def Outliars_by_Quartiles(array):
    h1,h2 = np.split(array.sort_values(),2)
    Q1 = np.median(h1) #First Quartile
    Q3 = np.median(h2) #Second Quartile
    IQR = Q3 - Q1      #Inter-Quartile Range 
    OF1 = Q1 - 1.5 * IQR #negative outer Fence
    OF2 = Q3 + 1.5 * IQR #positive Outer Fence
    for i in range(len(array)):
        if array[i]<OF1 or array[i]>OF2: #condition for being an outliar
            array[i]=np.median(array)    #Filling Median Value on the place of outliar
    return array        

In [366]:
iris["sepal_width"]=Outliars_by_Quartiles(iris["sepal_width"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


# Final Dataset

In [367]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,0.222222,0.625,0.067797,0.041667,0
1,0.166667,0.416667,0.067797,0.041667,0
2,0.111111,0.5,0.050847,0.041667,0
3,0.083333,0.458333,0.084746,0.041667,0
4,0.194444,0.666667,0.067797,0.041667,0
