In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Iris bunch and assemble one frame: the four measurement columns
# followed by the class label in a trailing 'target' column.
iris = load_iris()
column_names = [*iris.feature_names, 'target']
# column_stack turns the 1-D label vector into a column next to the 2-D
# feature matrix; the combined array has a single dtype, so the labels are
# stored as floats alongside the measurements.
data = pd.DataFrame(
    np.column_stack((iris.data, iris.target)),
    columns=column_names,
)


In [2]:
# Basic data exploration
def explore_dataset(df=None):
    """Print a quick textual overview of a dataset.

    The report contains, in order: the frame's shape, its first rows, the
    ``DataFrame.info()`` summary, descriptive statistics, the value counts of
    the ``target`` column, and the number of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to summarise; it must contain a ``target`` column. When omitted,
        the module-level ``data`` frame is used.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if df is None:
        df = data

    print("Dataset Shape:", df.shape)
    print("\nFirst few rows:")
    print(df.head())
    print("\nDataset Information:")
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the report.
    df.info()
    print("\nBasic Statistics:")
    print(df.describe())
    print("\nClass Distribution:")
    print(df['target'].value_counts())

    # Per-column count of missing entries
    print("\nMissing Values:")
    print(df.isnull().sum())

In [3]:
# Data preprocessing
def preprocess_data():
    """Split the module-level ``data`` frame and standardise its features.

    The ``target`` column is used as the label and every other column as a
    feature. The rows are divided 80/20 into train and test sets with a fixed
    seed, stratified on the label. A ``StandardScaler`` is fitted on the
    training features only and then applied to both feature sets.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)`` - the
        scaled train and test features, the matching labels, and the fitted
        scaler.
    """
    labels = data['target']
    features = data.drop('target', axis=1)

    # Stratified hold-out split with a fixed seed
    split = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )
    train_features, test_features, train_labels, test_labels = split

    # The scaling statistics come from the training rows only; the test rows
    # are transformed with those same statistics.
    feature_scaler = StandardScaler().fit(train_features)
    train_scaled = feature_scaler.transform(train_features)
    test_scaled = feature_scaler.transform(test_features)

    return train_scaled, test_scaled, train_labels, test_labels, feature_scaler


In [4]:
# Visualization functions
def create_visualizations():
    """Render and save the exploratory figures for the module-level ``data``.

    Two image files are written to the current working directory:

    * ``pairplot.png`` - pairwise plots of the columns, coloured by ``target``.
    * ``correlation_matrix.png`` - annotated heatmap of ``data.corr()``.

    Each figure is closed after it has been saved, so no figure is left open
    when the function returns.
    """
    # Pairplot: sns.pairplot is figure-level and builds its own figure, so no
    # plt.figure() call is made beforehand. A pre-created figure would stay
    # empty and never be closed, because saving and closing would act on the
    # pairplot's figure instead.
    grid = sns.pairplot(data, hue='target')
    grid.figure.savefig('pairplot.png')
    plt.close(grid.figure)

    # Correlation matrix: draw on an explicit Axes so that saving and closing
    # refer to this figure rather than to whichever figure is current.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Feature Correlation Matrix')
    fig.savefig('correlation_matrix.png')
    plt.close(fig)

In [5]:
if __name__ == "__main__":
    # Text report first, then the saved figures.
    explore_dataset()
    create_visualizations()

    # Scaled train/test split plus the scaler that produced it
    X_train_scaled, X_test_scaled, y_train, y_test, scaler = preprocess_data()

    # Write each part of the split to its own .npy file
    outputs = (
        ('X_train_scaled.npy', X_train_scaled),
        ('X_test_scaled.npy', X_test_scaled),
        ('y_train.npy', y_train),
        ('y_test.npy', y_test),
    )
    for out_path, split_part in outputs:
        np.save(out_path, split_part)

Dataset Shape: (150, 5)

First few rows:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float6

<Figure size 1000x600 with 0 Axes>