# Data Exploration and Cleaning for agriculture_crop_yield.csv

This notebook performs tasks 1.a and 1.b on the provided dataset `agriculture_crop_yield.csv` in the workspace.

- 1.a: Import libraries, read and display dataset, show dimensionality (columns, types, missing values), compute statistics on numerical features, and compute shape.
- 1.b: Display the first 5 rows of the loaded dataset, count nulls per column, drop blank (fully-null) columns, and identify and remove duplicate rows.

The cleaned dataset will be saved as `agriculture_crop_yield_cleaned.csv` in the same folder as the source dataset.


In [None]:
import os
import pandas as pd
import numpy as np

# Display options for readability: show every column and allow wider rows
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

# Location of the raw dataset. The default is the original local path; set the
# CROP_YIELD_DATA_PATH environment variable to run the notebook against a copy
# stored elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "CROP_YIELD_DATA_PATH",
    "/Users/karthikmac/Downloads/DV_USECASE/task 1/agriculture_crop_yield.csv",
)

# Fail fast with an explicit exception instead of `assert`: assert statements
# are removed when Python runs with -O, which would silently skip this check.
# isfile (rather than exists) also rejects a directory at that path.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")


In [None]:
# 1.a) Read and display the details of the dataset

# Load the raw CSV into the working frame used by the following cells.
df = pd.read_csv(DATA_PATH)

print("First 10 rows:")
display(df.head(n=10))

# Per-column summary: dtype, non-null count, null count and the null share
# expressed as a percentage of all rows.
print("\nInfo:")
df_info = df.dtypes.to_frame(name='dtype').assign(
    non_null_count=df.notna().sum(),
    null_count=df.isna().sum(),
    null_percent=lambda info: (info['null_count'] / len(df)) * 100,
)

display(df_info)

# Shape is reported as a (rows, columns) tuple, followed by the column labels.
print("\nDimensionality (rows, columns):", df.shape)
print("\nColumns:")
print(df.columns.tolist())

# Summary statistics restricted to numeric columns, transposed so that each
# feature occupies one row.
print("\nCompute statistics on numerical features:")
numeric_summary = df.describe(include=[np.number])
display(numeric_summary.transpose())

print("\nOverall shape of dataset:")
print(df.shape)


In [None]:
# 1.b) Additional EDA and cleaning

print("Display first 5 rows:")
display(df.head())  # head() returns 5 rows by default

# Missing-value count for every column, largest first.
print("\nNull values per column:")
null_counts = df.isnull().sum().sort_values(ascending=False)
display(null_counts)

# Remove fully-null columns: a column qualifies when every one of its values is missing.
fully_null_cols = df.columns[df.isna().all()].tolist()
print("\nFully null columns to drop:", fully_null_cols)
if fully_null_cols:
    df_clean = df.drop(columns=fully_null_cols)
else:
    # Nothing to drop; still work on a copy so `df` itself is left untouched.
    df_clean = df.copy()

# Remove duplicate rows and report how many were discarded.
initial_shape = df_clean.shape
df_clean = df_clean.drop_duplicates()
n_duplicates_removed = initial_shape[0] - df_clean.shape[0]
print(f"\nRemoved duplicates: {n_duplicates_removed}")
print("New shape after cleaning:", df_clean.shape)

display(df_clean.head(n=10))


In [None]:
# Save cleaned dataset
# The output folder is derived from DATA_PATH instead of being repeated as a
# second hard-coded absolute path, so the cleaned file always lands next to
# the source dataset even if DATA_PATH is changed.
CLEAN_PATH = os.path.join(os.path.dirname(DATA_PATH), "agriculture_crop_yield_cleaned.csv")
df_clean.to_csv(CLEAN_PATH, index=False)  # index=False: the row index is not written to the file
print(f"Saved cleaned dataset to: {CLEAN_PATH}")

# Quick preview of saved file: read back only the first 5 rows to confirm the write.
preview_df = pd.read_csv(CLEAN_PATH, nrows=5)
display(preview_df)
