# Dataset Analysis

In [1]:
import pandas as pd
import numpy as np

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Perfume Dataset

In [5]:
# Load the perfume dataset CSV into a pandas DataFrame.
# NOTE(review): the path is relative, so it resolves against the notebook's
# current working directory.
df = pd.read_csv(r"Perfume/testing in rows.csv")

In [114]:
# Distinct values of the "Name" column (pandas returns them in order of first appearance).
target = df["Name"].unique()
# Encode every distinct name as its position within `target`.
target_dict = {name: code for code, name in enumerate(target)}
target_dict

{'RoseMusk': 13,
 'TeaTreeOil': 11,
 'aday jmal': 1,
 'ajayeb': 0,
 'amreaj': 2,
 'aood': 3,
 'asgar_ali': 4,
 'bukhoor': 5,
 'carolina_herrera': 16,
 'constrected': 18,
 'constrected2': 15,
 'dehenalaod': 6,
 'junaid': 7,
 'kausar': 8,
 "oudh_ma'alattar": 17,
 'raspberry': 12,
 'rose': 9,
 'solidmusk': 10,
 'strawberry': 14}

In [115]:
# Replace each value in the "Name" column with its integer code from target_dict.
# NOTE(review): this overwrites the column in place, so the cell is not
# idempotent — re-running it would look up the already-encoded integers in
# target_dict (whose keys are the original names); presumably that yields NaN.
# Re-load df before re-running.
df["Name"] = df['Name'].map(target_dict)

In [116]:
# Feature columns: every column except the "Name" target. The target is
# excluded by label rather than by position, so the result does not depend on
# "Name" being the first column of the CSV.
attributes = [col for col in df.columns if col != "Name"]

In [117]:
# Strip the "," thousands separators from the attribute columns and convert
# them to integers.
for att in attributes:
    # Skip columns that are already numeric so that re-running this cell is a
    # no-op instead of failing on values that are no longer strings.
    if pd.api.types.is_numeric_dtype(df[att]):
        continue
    # Vectorised string replace + cast instead of a per-value Python loop.
    df[att] = df[att].astype(str).str.replace(",", "").astype(int)

In [118]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of df.
df.describe()

Unnamed: 0,Name,One,Two,Three,Four,Five,Six,Seven,Eight,Nine,...,Nineteen,Twenty,Twenty_one,Twenty_two,Twenty_three,Twenty_four,Twenty_five,Twenty_six,Twenty_seven,Twenty_eight
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,8.8,68311.5,68209.45,68258.3,68406.35,68203.05,68354.4,68204.05,68455.05,68206.0,...,68353.0,68552.35,68550.35,68649.75,68548.5,68498.3,68499.5,68500.95,68501.1,68351.15
std,5.549775,8846.587374,8795.110062,8850.989687,8730.317883,8753.408398,8801.430054,8816.565064,8734.20106,8895.42117,...,8382.485808,8362.122485,8447.651871,8375.858566,8554.045466,8546.33413,8546.71877,8636.665508,8568.117283,8598.780129
min,0.0,46015.0,46015.0,46015.0,46015.0,46015.0,46015.0,46015.0,46015.0,46015.0,...,48016.0,48016.0,48016.0,48016.0,48016.0,48016.0,48016.0,48016.0,48016.0,48016.0
25%,4.75,63396.5,63396.5,64145.75,64145.75,64145.75,64145.75,64144.25,64394.25,64145.75,...,63399.25,64157.5,64144.25,64407.5,63148.75,64145.75,63385.25,63385.25,63385.25,63135.25
50%,8.5,68181.0,68681.0,68681.0,68681.0,68179.0,68679.0,68176.5,68680.0,68674.0,...,68177.5,69136.0,69136.0,69136.0,69136.0,69085.0,69139.5,69139.5,69139.5,69139.5
75%,13.25,72553.0,72202.75,72021.0,72145.75,72020.0,72208.75,72019.0,72187.5,72019.0,...,71625.5,71520.0,71658.5,71647.25,72196.25,71688.75,72193.25,72193.25,72016.5,71657.0
max,18.0,85056.0,85056.0,85056.0,85056.0,85056.0,85056.0,85056.0,85056.0,85056.0,...,85056.0,85056.0,85056.0,85056.0,85054.0,85054.0,85054.0,85054.0,85054.0,85054.0


In [119]:
# Create the MinMaxScaler that is fitted on the attribute columns in the next cell.
scaler = MinMaxScaler()

In [120]:
# Fit the scaler on the attribute columns and rescale them.
scaled = scaler.fit_transform(df[attributes])
# Write the scaled matrix back onto the same columns. Wrapping it in a
# DataFrame that reuses df's index keeps rows aligned, and the row count comes
# from the data itself instead of a hard-coded 20, so this works for any
# number of rows.
df[attributes] = pd.DataFrame(scaled, index=df.index, columns=attributes)


In [121]:
#Function for replacing outliers (detected with quartile-based fences) by the median
def Outliars_by_Quartiles(array, fence_factor=2):
    """Return a copy of ``array`` with outliers replaced by the median.

    The sorted values are split into a lower and an upper half; Q1 and Q3 are
    the medians of those halves. For an odd number of values the middle value
    is left out of both halves (for an even number this is exactly an equal
    split). A value is an outlier when it lies below
    ``Q1 - fence_factor * IQR`` or above ``Q3 + fence_factor * IQR``.

    Parameters
    ----------
    array : pandas.Series
        Numeric values to clean. It is not modified; any index is supported.
    fence_factor : float, optional
        Multiple of the inter-quartile range used for both fences. Defaults
        to 2.

    Returns
    -------
    pandas.Series
        New Series with the same index as ``array`` in which every outlier is
        replaced by the median of the original values. Inputs with fewer than
        two values are returned as an unchanged copy.
    """
    n = len(array)
    if n < 2:
        # Two halves cannot be formed, so no value can be judged an outlier.
        return array.copy()

    ordered = np.sort(array.values)
    half = n // 2
    Q1 = np.median(ordered[:half])      #First Quartile: median of the lower half
    Q3 = np.median(ordered[n - half:])  #Third Quartile: median of the upper half
    IQR = Q3 - Q1                       #Inter-Quartile Range
    OF1 = Q1 - fence_factor * IQR       #lower fence
    OF2 = Q3 + fence_factor * IQR       #upper fence

    # The replacement value is computed once from the original data, so the
    # result does not depend on the order in which outliers are encountered.
    median = np.median(ordered)

    # Element-wise comparison works for any index; mask() builds a new Series,
    # so the caller's data (e.g. a DataFrame column) is never written to.
    is_outlier = (array < OF1) | (array > OF2)
    return array.mask(is_outlier, median)

In [122]:
# Replace outliers in every attribute column.
for att in attributes:
    # Hand the function an independent copy of the column: writing into the
    # Series returned by df[att] is what triggers pandas'
    # SettingWithCopyWarning, and the result is assigned back explicitly anyway.
    df[att] = Outliars_by_Quartiles(df[att].copy())


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


# Final Dataset

In [123]:
# Preview the first 10 rows of df after encoding, scaling and outlier replacement.
df.head(10)

Unnamed: 0,Name,One,Two,Three,Four,Five,Six,Seven,Eight,Nine,...,Nineteen,Twenty,Twenty_one,Twenty_two,Twenty_three,Twenty_four,Twenty_five,Twenty_six,Twenty_seven,Twenty_eight
0,0,0.448605,0.448605,0.474194,0.474194,0.474194,0.474194,0.474143,0.474143,0.474194,...,0.418413,0.445734,0.445734,0.445734,0.418435,0.445812,0.418435,0.418435,0.418435,0.418435
1,1,0.395226,0.394944,0.369048,0.394944,0.368766,0.394611,0.369355,0.394944,0.369663,...,0.362554,0.362932,0.362878,0.363202,0.362952,0.36252,0.36252,0.362574,0.362898,0.362952
2,2,0.256781,0.256781,0.256781,0.282395,0.282395,0.256781,0.256781,0.256781,0.231167,...,0.270653,0.270653,0.243683,0.243683,0.243696,0.216696,0.24375,0.21675,0.21675,0.21675
3,3,0.693604,0.667888,0.667811,0.693399,0.667708,0.667708,0.667606,0.667606,0.667606,...,0.622489,0.62257,0.62257,0.62257,0.649603,0.649495,0.649495,0.649495,0.649414,0.649414
4,4,0.569017,0.569017,0.569017,0.569043,0.569043,0.569043,0.569043,0.569222,0.569043,...,0.545545,0.545545,0.545545,0.545545,0.545575,0.518603,0.545764,0.545764,0.545764,0.545764
5,5,0.641146,0.615532,0.615532,0.615532,0.615532,0.615532,0.615532,0.615584,0.615584,...,0.594843,0.594843,0.594843,0.594843,0.594876,0.594849,0.594876,0.594876,0.594876,0.594876
6,5,0.641146,0.615532,0.615532,0.615532,0.615532,0.615532,0.615532,0.615584,0.615584,...,0.594843,0.594843,0.594843,0.594843,0.594876,0.594849,0.594876,0.594876,0.594876,0.594876
7,6,0.566507,0.592121,0.592121,0.592095,0.566379,0.591993,0.566251,0.591865,0.591737,...,0.543089,0.543089,0.543089,0.542981,0.542983,0.542848,0.542848,0.542848,0.54274,0.54274
8,7,0.679465,0.67944,0.653492,0.653134,0.653134,0.653134,0.653134,0.678722,0.653134,...,0.634044,0.634044,0.634044,0.634395,0.63443,0.63443,0.63443,0.63443,0.63443,0.63443
9,8,0.680592,0.680592,0.680592,0.680592,0.680592,0.680592,0.680592,0.680592,0.680592,...,0.662554,0.662554,0.662554,0.662554,0.66259,0.635212,0.66259,0.66259,0.66259,0.635212
