In [2]:
# Data Analysis with Pandas and Matplotlib
# Author: Claude
# Date: May 21, 2025

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
import matplotlib.dates as mdates
from datetime import datetime, timedelta
import os

# Set the style for our plots
plt.style.use('seaborn-v0_8')  # Updated from 'seaborn-v0_8-whitegrid' which may cause errors
sns.set_palette("Set2")

# Set up figure size for better readability
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['font.size'] = 12

print("====== TASK 1: LOAD AND EXPLORE THE DATASET ======")

# Load the Iris dataset
try:
    # Try to load the dataset
    iris = load_iris()

    # Create a pandas DataFrame
    iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

    # Add the target column (species)
    iris_df['species'] = pd.Series(
        [iris.target_names[t] for t in iris.target],
        index=iris_df.index
    )

    print("Dataset loaded successfully!")
except Exception as e:
    print(f"Error loading the dataset: {e}")
    exit()

# Display the first 5 rows of the dataset
print("\nFirst 5 rows of the dataset:")
print(iris_df.head())

# Explore the structure of the dataset
print("\nDataset information:")
print(iris_df.info())

# Check for missing values
print("\nMissing values in the dataset:")
print(iris_df.isnull().sum())

# Since the Iris dataset is clean, we'll introduce some missing values for demonstration purposes
np.random.seed(42)  # For reproducibility
mask = np.random.random(size=iris_df.shape) < 0.05  # 5% of data will be missing
iris_df_with_missing = iris_df.copy()
iris_

Dataset loaded successfully!

First 5 rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 

NameError: name 'iris_' is not defined