#Import Libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os

#Dataset Load

In [10]:
df = pd.read_csv('/content/drive/MyDrive/ML Lab/Iris.csv')
print(df.head())

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [11]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None


In [12]:
print(df.describe())

               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000


In [13]:
# 1. Drop Id if present
if 'Id' in df.columns:
    df = df.drop(columns=['Id'])

In [14]:
# 2. Drop duplicates
df = df.drop_duplicates().reset_index(drop=True)

In [15]:
# 3. Numeric cols
numeric_cols = ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']


In [16]:
# 4. Outlier handling: clip per-column using IQR bounds
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

df_clipped = df.copy()
for col in numeric_cols:
    df_clipped[col] = df_clipped[col].clip(lower[col], upper[col])


In [17]:
# 5. Encode target
le = LabelEncoder()
df_clipped['Species_encoded'] = le.fit_transform(df_clipped['Species'])


In [18]:
# 6. Scale numeric features and create scaled columns
scaler = StandardScaler()
scaled = scaler.fit_transform(df_clipped[numeric_cols])
scaled_cols = [c + "_scaled" for c in numeric_cols]
df_clipped[scaled_cols] = scaled

In [20]:
# 7. Save cleaned CSV
output_path = '/content/drive/MyDrive/ML Lab/Iris.csv'
df_clipped.to_csv(output_path, index=False)


In [21]:
# 8. Train-test split using scaled features
X = df_clipped[scaled_cols]
y = df_clipped['Species_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [22]:
# 9. Train a simple Logistic Regression
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

In [25]:
# Prepare a small summary dataframe for display
summary = {
    "rows_original": len(pd.read_csv('/content/drive/MyDrive/ML Lab/Iris.csv')),
    "rows_after_dedup": len(df),
    "rows_after_clip": len(df_clipped),
    "cleaned_csv_path": output_path,
    "test_size": len(X_test),
    "accuracy": acc
}

report_df = pd.DataFrame(report).transpose()


In [26]:
# Display first few rows of cleaned df
preview = df_clipped.head()


In [28]:
# Save model metrics to a small CSV too
metrics_path = '/content/drive/MyDrive/ML Lab/Iris.csv'
pd.DataFrame({
    "metric": ["accuracy"],
    "value": [acc]
}).to_csv(metrics_path, index=False)


In [29]:
# Outputs
{
    "summary": summary,
    "preview": preview,
    "classification_report": report_df,
    "confusion_matrix": cm,
    "cleaned_csv": output_path,
    "metrics_csv": metrics_path
}

{'summary': {'rows_original': 147,
  'rows_after_dedup': 147,
  'rows_after_clip': 147,
  'cleaned_csv_path': '/content/drive/MyDrive/ML Lab/Iris.csv',
  'test_size': 30,
  'accuracy': 0.9333333333333333},
 'preview':    SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species  \
 0            5.1           3.5            1.4           0.2  Iris-setosa   
 1            4.9           3.0            1.4           0.2  Iris-setosa   
 2            4.7           3.2            1.3           0.2  Iris-setosa   
 3            4.6           3.1            1.5           0.2  Iris-setosa   
 4            5.0           3.6            1.4           0.2  Iris-setosa   
 
    Species_encoded  SepalLengthCm_scaled  SepalWidthCm_scaled  \
 0                0             -0.915509             1.053523   
 1                0             -1.157560            -0.123285   
 2                0             -1.399610             0.347438   
 3                0             -1.520635             0