In [4]:
# 1. Import all the required Python Libraries
import pandas as pd
import numpy as np

# 2. Locate an open-source dataset from the web
# Dataset: Iris Species Dataset
# URL: https://www.kaggle.com/datasets/uciml/iris

# 3. Load the Dataset into pandas dataframe
# 'Iris.csv' is a relative path, so the file must be present in the current
# working directory; pd.read_csv raises FileNotFoundError otherwise.
df = pd.read_csv('Iris.csv')

# 4. Data Preprocessing
# Count the missing (NaN) entries in each column.
# sep="" stops print() from inserting its default space right after the
# header's trailing newline; that stray space pushed the first row of the
# table one character to the right of the rows below it.
print("\nMissing values in each column:\n", df.isnull().sum(), sep="")

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns. sep="" is used for the same alignment reason as above.
print("\nStatistical summary of the dataset:\n", df.describe(), sep="")

# Variable descriptions:
# Id: Integer (Identifier)
# SepalLengthCm: Float (Sepal Length in cm)
# SepalWidthCm: Float (Sepal Width in cm)
# PetalLengthCm: Float (Petal Length in cm)
# PetalWidthCm: Float (Petal Width in cm)
# Species: Categorical (Type of Iris flower)

# Check dimensions: df.shape is (number of rows, number of columns)
print("\nShape of the dataframe:", df.shape)

# 5. Data Formatting and Normalization
# Check data types
# sep="" stops print() from inserting its default space right after the
# header's trailing newline; that stray space pushed the first row of the
# listing one character to the right of the rows below it.
print("\nData types of each column:\n", df.dtypes, sep="")

# 'Id' is treated as a plain row identifier rather than a measurement, so it
# is stored as a string to keep it out of numeric operations.
df['Id'] = df['Id'].astype(str)

# Confirm conversion ('Id' should no longer be reported as int64)
print("\nData types after conversion:\n", df.dtypes, sep="")

# 6. Turn categorical variables into quantitative variables
# The 'Species' column is categorical, so encode it as integer labels.
from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns each distinct label an integer in 0..n_classes-1,
# following the sorted order of the labels. The fitted encoder is kept in
# `le` so the codes can be mapped back later (le.classes_,
# le.inverse_transform).
le = LabelEncoder()
df['Species_encoded'] = le.fit_transform(df['Species'])

# sep="" stops print() from inserting its default space right after the
# header's trailing newline; that stray space shifted the column-header row
# of the table one character out of line with the data rows.
print("\nFirst 5 rows after encoding:\n", df.head(), sep="")

# Final view
print("\nFinal dataset columns:\n", df.columns, sep="")



Missing values in each column:
 Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

Statistical summary of the dataset:
                Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000

Shape of the dataframe: (150, 6)

Data types of each column:
 Id                 int64
SepalLengthCm    float64
Sepal