# Problem Statement

### Using the Iris dataset, group Iris flowers into clusters based on similarity.

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load data
# Read Iris.csv (resolved relative to the current working directory) into a DataFrame.
data=pd.read_csv("Iris.csv")

In [None]:
# Display the loaded frame to inspect its rows and columns.
data

In [None]:
# drop unnamed column
# 'Unnamed: 0' is the name pandas assigns to a CSV column with an empty header
# (presumably an index saved along with the data). errors='ignore' keeps this
# cell safe to re-run: without it a second execution raises KeyError because
# the column has already been removed.
data.drop(columns='Unnamed: 0',errors='ignore',inplace=True)

In [None]:
# Basic Checks

In [None]:
# First two rows, as a quick look at the column layout.
data.head(2)

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Data type of each column.
data.dtypes

In [None]:
# Column names, non-null counts, dtypes and memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Distinct values present in the target column.
data.target.unique()


In [None]:
# EDA
# Petal length against petal width, with each point coloured by its target value.
plt.figure(figsize=(3, 3))
sns.scatterplot(
    x='petal_length',
    y='petal_width',
    hue='target',
    palette='Set1',
    data=data,
)

# Data Preprocessing

In [None]:
# check for missing values
data.isnull().sum()

In [None]:
# check for duplicates
data.duplicated().sum()

In [None]:
data.drop_duplicates()

In [None]:
# Check for outliers
# Draw one horizontal boxplot per feature column (target excluded) on a 2x2 grid.
plt.figure(figsize=(3,5))

# The grid has four slots, so at most the first four feature columns are drawn.
feature_columns=data.drop('target',axis=1).columns[:4]
for plotnumber,column in enumerate(feature_columns,start=1):
    plt.subplot(2,2,plotnumber)
    # Pass the series as x= so the values always run along the x axis; a bare
    # positional argument is interpreted differently across seaborn versions,
    # which would leave the x label below on the wrong axis.
    sns.boxplot(x=data[column])
    plt.xlabel(column)
    # A boxplot has no count axis, so the y axis is left unlabelled.
    plt.ylabel("")
plt.tight_layout()

In [None]:
plt.figure(figsize=(3,3))
sns.histplot(data=data,x='sepal_width',kde=True)

In [None]:
# 3-sigma rule
low_limit=data['sepal_width'].mean()-3*data['sepal_width'].std()
upper_limit=data['sepal_width'].mean()+3*data['sepal_width'].std()

print("lower limit",low_limit)
print("upper limit",upper_limit)

In [None]:
data.sepal_width.describe()

In [None]:
# Find values less than lower limit
data.loc[data['sepal_width']<low_limit]

In [None]:
# Find values greater than upper limit
data.loc[data['sepal_width']>upper_limit]

In [None]:
# replace outlier with mean
data.loc[data['sepal_width']>upper_limit,'sepal_width']=data['sepal_width'].mean()

In [None]:
sns.boxplot(data=data,x='sepal_width')

In [None]:
# Apply scaling
from sklearn.preprocessing import StandardScaler

# Standardise the four measurement columns in place; target is left untouched.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
scaling = StandardScaler()
scaled_values = scaling.fit_transform(data[feature_cols])
data[feature_cols] = scaled_values

In [None]:
data.head()

In [None]:
data.columns

In [None]:
# Remove target from data
x=data.drop('target',axis=1)

In [None]:
x

# Apply Kmean model

In [None]:
# import kmean model
from sklearn.cluster import KMeans
# initialize the model
# n_init is given explicitly because its default differs between scikit-learn
# releases (10 restarts in older ones, 'auto' in newer ones); pinning it keeps
# the clustering the same regardless of the installed version.
model=KMeans(n_clusters=5,n_init=10,random_state=10)
# train a model
model.fit(x)

In [None]:
# centroids-cluster centers
# no of clusters is same as number of centroids
# One row per cluster, one column per feature in x.
model.cluster_centers_

In [None]:
# Cluster index assigned to each row of x, in row order.
model.labels_

In [None]:
# Visualize the clusters created by model
# Each row's cluster label selects its colour; axes are petal length and width.
plt.figure(figsize=(3, 3))
color_schema = np.array(["red", "green", "black", "orange", "blue"])
point_colors = color_schema[model.labels_]
plt.scatter(x['petal_length'], x['petal_width'], color=point_colors)

In [None]:
data['labels']=pd.DataFrame(model.labels_)

In [None]:
from sklearn.metrics import silhouette_score
silhouette_score(x,model.labels_)

In [None]:
# We see overlapping clusters because the number of clusters we have chosen might be wrong

# How to find the optimal value for K?
Elbow Method

In [None]:
WCSS=[]
for i in range(2,11):
    model=KMeans(n_clusters=i,random_state=20)
    model.fit(x)
    WCSS.append(model.inertia_)
plt.plot(range(2,11),WCSS)
plt.title('Elbow curve')
plt.xlabel('no of clusters(k)')
plt.ylabel('WCSS')
    
    

In [None]:
WCSS

In [None]:
# From the elbow curve, the optimal value for k is clearly 3

In [None]:
model=KMeans(n_clusters=3,random_state=10)
model.fit(x)

In [None]:
silhouette_score(x,model.labels_)

In [None]:
plt.scatter(x.petal_length,x.petal_width,color=color_schema[model.labels_])