In [3]:
import pandas as pd

# Read the CSV into the notebook-wide `data` frame and echo it.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this exact file exists.
dataset_path = '/home/nashtech/Documents/dataset.csv'
data = pd.read_csv(dataset_path)
print(f"Dataset {data}")

Dataset     Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0  144            6.8           3.2            5.9           2.3   
1   92            6.1           3.0            4.6           1.4   
2   83            5.8           2.7            3.9           1.2   
3   26            5.0           3.0            1.6           0.2   
4  115            5.8           2.8            5.1           2.4   

           Species  
0   Iris-virginica  
1  Iris-versicolor  
2  Iris-versicolor  
3      Iris-setosa  
4   Iris-virginica  


In [5]:
# Collect the three schema facts up front, then report them in order:
# column labels, per-column dtype, and per-column count of null entries.
column_names = data.columns.tolist()
column_dtypes = data.dtypes
missing_per_column = data.isnull().sum()

print("Column Names:", column_names)
print("\nData Types:\n", column_dtypes)
print("\nMissing Values:\n", missing_per_column)


Column Names: ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

Data Types:
 Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

Missing Values:
 Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [6]:
def describe_dataset(df):
    """Summarise the size and column labels of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    dict
        ``{"rows": <row count>, "columns": <column count>,
        "column_names": <list of column labels>}``, in that key order.
    """
    dims = df.shape  # (row count, column count)
    overview = {}
    overview["rows"] = dims[0]
    overview["columns"] = dims[1]
    overview["column_names"] = df.columns.tolist()
    return overview

# Build the size/column summary for the loaded frame and print the dict.
summary = describe_dataset(df=data)
print("\nDataset Summary:\n", summary)



Dataset Summary:
 {'rows': 5, 'columns': 6, 'column_names': ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']}


In [8]:
def remove_invalid_rows(df):
    """Return a new frame without any row that contains a missing value.

    A row is considered invalid when at least one of its cells is null
    (``None``, ``NaN``, ``NaT``, ...). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh ``0..n-1`` index. Column labels and
        dtypes are kept as they are in ``df``, including when no row survives.
    """
    # dropna works on the frame directly, so the schema survives an empty
    # result; rebuilding from a list of record dicts would return a frame
    # with no columns at all in that case and re-infer every dtype.
    return df.dropna(axis=0, how="any").reset_index(drop=True)


def normalize_text_columns(df):
    """Return a new frame in which every string cell is stripped and upper-cased.

    Each cell is checked individually: ``str`` values have surrounding
    whitespace removed and are converted to upper case; every other value is
    passed through unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose text values should be normalised.

    Returns
    -------
    pandas.DataFrame
        Normalised frame with a fresh ``0..n-1`` index. For a frame with no
        rows, an empty frame with the same columns and dtypes is returned.
    """
    def normalize_value(val):
        # Only real strings are touched; numbers, None, NaN, ... pass through.
        if isinstance(val, str):
            return val.strip().upper()
        return val

    rows = df.to_dict(orient='records')
    if not rows:
        # With no records, pd.DataFrame([]) would drop every column; return an
        # empty slice of the input so the schema is preserved.
        return df.iloc[:0].reset_index(drop=True)

    normalized_rows = [
        {k: normalize_value(v) for k, v in row.items()}
        for row in rows
    ]
    return pd.DataFrame(normalized_rows)

# Cleaning pipeline: first drop rows with missing values, then normalise the
# text values of what remains.
clean_df = remove_invalid_rows(df=data)
normalized_df = normalize_text_columns(df=clean_df)

# Preview the first rows of the cleaned, normalised frame.
print(normalized_df.head())


    Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0  144            6.8           3.2            5.9           2.3   
1   92            6.1           3.0            4.6           1.4   
2   83            5.8           2.7            3.9           1.2   
3   26            5.0           3.0            1.6           0.2   
4  115            5.8           2.8            5.1           2.4   

           Species  
0   IRIS-VIRGINICA  
1  IRIS-VERSICOLOR  
2  IRIS-VERSICOLOR  
3      IRIS-SETOSA  
4   IRIS-VIRGINICA  
