In [2]:
# Data Loading & Familiarization

import os

import pandas as pd
# Location of the Iris CSV. The original machine-specific path is kept as the
# default so existing runs are unaffected; set the IRIS_CSV_PATH environment
# variable to point the notebook at the file on another machine.
IRIS_CSV_PATH = os.environ.get("IRIS_CSV_PATH", "/home/knoldus/Downloads/iris.csv")
df = pd.read_csv(IRIS_CSV_PATH)

def describe_dataset(dataframe):
    """Build a plain-dict overview of a DataFrame; the frame is only read.

    Returns a dict with these keys, in this order:
        rows, columns  -- the two entries of ``dataframe.shape``
        column_names   -- the column labels as a list
        data_types     -- mapping of column label to its dtype
        null_counts    -- mapping of column label to its number of null cells
    """
    shape = dataframe.shape
    missing_per_column = dataframe.isnull().sum()

    summary = {}
    summary["rows"] = shape[0]
    summary["columns"] = shape[1]
    summary["column_names"] = [label for label in dataframe.columns]
    summary["data_types"] = dataframe.dtypes.to_dict()
    summary["null_counts"] = missing_per_column.to_dict()
    return summary

# Last expression in the cell, so the notebook displays the returned summary dict.
describe_dataset(df)

{'rows': 150,
 'columns': 5,
 'column_names': ['sepal_length',
  'sepal_width',
  'petal_length',
  'petal_width',
  'species'],
 'data_types': {'sepal_length': dtype('float64'),
  'sepal_width': dtype('float64'),
  'petal_length': dtype('float64'),
  'petal_width': dtype('float64'),
  'species': dtype('O')},
 'null_counts': {'sepal_length': 0,
  'sepal_width': 0,
  'petal_length': 0,
  'petal_width': 0,
  'species': 0}}

In [6]:
# Functional Cleaning & Preprocessing

def is_valid_row(row):
    """Return True when no element of ``row`` is flagged as null by ``pd.isnull``."""
    for missing in pd.isnull(row):
        if missing:
            # One missing value is enough to reject the whole row.
            return False
    return True

# Keep only the rows (as plain lists) in which is_valid_row finds no missing value.
filtered_data = [row for row in df.values.tolist() if is_valid_row(row)]

def normalize_column(col):
    """Return a new list with every string in ``col`` stripped and lower-cased.

    Non-string values are passed through unchanged and order is preserved.
    """
    normalized = []
    for value in col:
        if isinstance(value, str):
            normalized.append(str(value).strip().lower())
        else:
            normalized.append(value)
    return normalized

# Build the cleaned frame from the null-filtered rows. Previously the columns
# were taken straight from `df`, so `filtered_data` was computed and then
# ignored, and rows with missing values would have survived the cleaning step.
filtered_df = pd.DataFrame(filtered_data, columns=df.columns)

# Normalize every column of the filtered frame (strings are stripped and
# lower-cased by normalize_column; other values pass through).
normalized_columns = {
    col: normalize_column(filtered_df[col]) for col in filtered_df.columns
}

cleaned_df = pd.DataFrame(normalized_columns)
print(cleaned_df)


     sepal_length  sepal_width  petal_length  petal_width    species
0             5.1          3.5           1.4          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica

[150 rows x 5 columns]


In [12]:
# Code Refactoring with FP Principles

# One dict per row, keyed by column name.
# NOTE(review): this reads the raw `df`, not `cleaned_df`, so the species match
# below is case- and whitespace-sensitive -- confirm that is intended.
data = df.to_dict(orient='records')

# Procedural approach: accumulate the matching rows with an explicit loop.
setosa_procedural = []
for row in data:
    if row['species'] == 'setosa':
        setosa_procedural.append(row)

# Functional approach: the same selection expressed as a filter over the records.
setosa_data = list(filter(lambda row: row['species'] == 'setosa', data))

# The refactor must not change the result. Previously the procedural list was
# overwritten by the functional one without the two ever being compared.
assert setosa_data == setosa_procedural

setosa_df = pd.DataFrame(setosa_data)
print(setosa_df)


    sepal_length  sepal_width  petal_length  petal_width species
0            5.1          3.5           1.4          0.2  setosa
1            4.9          3.0           1.4          0.2  setosa
2            4.7          3.2           1.3          0.2  setosa
3            4.6          3.1           1.5          0.2  setosa
4            5.0          3.6           1.4          0.2  setosa
5            5.4          3.9           1.7          0.4  setosa
6            4.6          3.4           1.4          0.3  setosa
7            5.0          3.4           1.5          0.2  setosa
8            4.4          2.9           1.4          0.2  setosa
9            4.9          3.1           1.5          0.1  setosa
10           5.4          3.7           1.5          0.2  setosa
11           4.8          3.4           1.6          0.2  setosa
12           4.8          3.0           1.4          0.1  setosa
13           4.3          3.0           1.1          0.1  setosa
14           5.8         