# How to Explore a Dataset

## Step 1: Importing libraries and loading the data

In [1]:
import os

import numpy as np
import pandas as pd

# Default location of the CSV: ~/Downloads/morg19.csv, resolved against the
# current user's home directory so the notebook is not tied to one machine.
# Update this path to wherever your CSV lives.
DATA_PATH = os.path.join(os.path.expanduser("~"), "Downloads", "morg19.csv")

# Load the whole file into a DataFrame; pandas raises FileNotFoundError if
# DATA_PATH does not point at an existing file.
df = pd.read_csv(DATA_PATH)

## Step 2: Understanding the structure

In [None]:
# Dimensions of the DataFrame as a (row count, column count) tuple
(len(df.index), len(df.columns))

In [None]:
# Preview the first five rows to see what the records look like
df.head(n=5)

In [None]:
# All column labels as a plain Python list
df.columns.to_list()

In [None]:
# Data types + missing values: prints a concise summary of the DataFrame
# (index range, each column's dtype and non-null count, memory usage).
# Comparing a column's non-null count with the total number of rows shows
# how many values are missing in that column.
df.info()

In [None]:
# Number of missing entries in each column, largest count first
(
    df.isnull()
    .sum(axis=0)
    .sort_values(ascending=False)
)

In [None]:
# Understand the data distribution for numerical variables: summary
# statistics (count, mean, std, min, quartiles, max) per numeric column.
# With the default arguments, non-numeric columns are left out whenever the
# DataFrame contains at least one numeric column.
df.describe()

In [None]:
# Understand the data distribution for categorical variables.
# NOTE(review): "category_column" looks like a placeholder -- confirm the
# column exists in df or swap in a real column name.
# The result is printed explicitly: a notebook cell only auto-displays its
# final expression, and this statement is not the last one in the cell, so
# without print() the frequency table would be computed and discarded.
print(df["category_column"].value_counts(dropna=False))

# For multiple variables: print one frequency table per column, counting
# missing values as their own category (dropna=False).
for col in ["sex", "class", "region"]:
    print(f"\n--- {col} ---")
    print(df[col].value_counts(dropna=False))

In [None]:
# Keep only the columns of interest, then preview the first five rows
cols = ["age", "sex", "income", "employment_status"]
df_small = df.loc[:, cols]

df_small.head(n=5)

## Step 3: Filtering Data

In [None]:
# One condition: rows whose "sex" value equals "female"
df.loc[df["sex"].eq("female")]

In [None]:
# Several conditions combined with & (element-wise AND):
# rows where "sex" is "female" and "age" is at least 50
df.loc[df["sex"].eq("female") & df["age"].ge(50)]

In [None]:
# Drop the rows whose "age" is missing; every other row is kept as-is
df.dropna(subset=["age"])

In [None]:
# Numeric range filter: rows with 55 <= age <= 64 (both ends included)
df[(df["age"] >= 55) & (df["age"] <= 64)]

In [None]:
# Membership filter with .isin: rows whose "state" is one of the listed values
df.loc[df["state"].isin(["IL", "CA", "NY"])]

In [None]:
# Derived variable: boolean flag that is True where age is 55 or older.
# Rows with a missing age compare as False, so they end up flagged False.
df["age_55_plus"] = df["age"].ge(55)

## Step 4: Descriptive Analysis

In [None]:
# Several statistics per employment_status group in one pass:
#   n          - number of non-missing income values
#   avg_income - mean income
#   median_age - median age
df.groupby("employment_status").agg(
    n=pd.NamedAgg(column="income", aggfunc="count"),
    avg_income=pd.NamedAgg(column="income", aggfunc="mean"),
    median_age=pd.NamedAgg(column="age", aggfunc="median"),
)