In [None]:
# ==========================================
# PART 1: SETUP AND IMPORTS
# ==========================================

# Import necessary libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Seed NumPy's global random number generator with a fixed value
random_seed = 123
np.random.seed(random_seed)

# Plot appearance: apply seaborn's "whitegrid" theme first, then set the
# default figure size (width, height in inches) used by later plots
default_figsize = (10, 6)
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = default_figsize

In [None]:
# ==========================================
# PART 2: LOADING DATA
# ==========================================

# Build the Desktop path from the current user's home directory so the
# username never has to be hardcoded
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")

# Replace 'my_data.csv' with the actual file name
filename = 'my_data.csv'
file_path = os.path.join(desktop_path, filename)

# Load the data into `df`, which every later section reads.
# Note: If using Excel, change to pd.read_excel(file_path)
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("Error: File not found. Check the filename and ensure it's on the Desktop.")
    # Re-raise so execution stops here. Otherwise `df` would be left
    # undefined (or, in a re-run notebook, still bound to a previously
    # loaded dataset) and the following sections would either fail with a
    # confusing NameError or silently analyse the wrong data.
    raise
else:
    # Only report success once the read has actually completed
    print(f"Successfully loaded {filename}")

In [None]:
# ==========================================
# PART 3: INITIAL INSPECTION (The "Sanity Check")
# ==========================================

# Quick structural overview of `df`: size, first/last rows, column names
# and per-column dtypes.

print("\n--- Shape of the Data ---")
# checking how many rows and columns I'm working with
n_rows, n_cols = df.shape
print(f"Rows: {n_rows}, Columns: {n_cols}")

print("\n--- First 5 Rows (Head) ---")
# taking a look at the top of the file to see if headers loaded correctly
print(df.head())

print("\n--- Last 5 Rows (Tail) ---")
# checking the bottom to make sure there's no garbage footer data
print(df.tail())

print("\n--- Column Names ---")
# getting a list of all variables
print(df.columns.tolist())

print("\n--- Data Types and Non-Null Counts ---")
# this is crucial to see if numbers are being read as text (objects)
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()

In [None]:
# ==========================================
# PART 4: DATA CLEANING CHECKS
# ==========================================

# Duplicate check: count rows that are exact copies of an earlier row,
# since fully duplicated rows might skew results
print("\n--- Duplicate Rows ---")
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Missing-value check: boolean mask of null cells, reused for both the
# per-column counts and the heatmap below
print("\n--- Missing Values ---")
null_mask = df.isnull()
missing_values = null_mask.sum()
has_missing = missing_values > 0
print(missing_values[has_missing])  # only columns with at least one null

# Heatmap of the null mask (rows x columns) to see whether missingness
# looks random or structural
plt.figure(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis')
plt.title('Visual Map of Missing Data')
plt.show()

In [None]:
# ==========================================
# PART 5: SUMMARY STATISTICS
# ==========================================

print("\n--- Numerical Summary ---")
# basic stats: mean, median (50%), min, max, std dev
# Transposing (.T) makes it easier to read if there are many columns
print(df.describe().T)

print("\n--- Categorical Summary ---")
# summary for text-like columns (unique counts, top occurring values).
# Both 'object' and 'category' dtypes are included so this summary covers
# the same columns that the categorical analysis section plots.
try:
    # Keep only the call that can raise inside the try block
    categorical_summary = df.describe(include=['object', 'category']).T
except ValueError:
    # describe() raises ValueError when no column matches `include`
    print("No categorical columns found.")
else:
    print(categorical_summary)

In [None]:
# ==========================================
# PART 6: DISTRIBUTION & VISUALIZATION
# ==========================================

# Select only numeric columns once; every plot in this section reuses it
numeric_df = df.select_dtypes(include=[np.number])
n_numeric = len(numeric_df.columns)

# 1. Histograms for all Numerical Columns
# This helps me spot normal distributions vs skewed data
if not numeric_df.empty:
    # Keep the original 4x4 grid for up to 16 columns, but add rows when
    # there are more: DataFrame.hist raises if the layout has fewer cells
    # than there are columns to plot.
    hist_cols = 4
    hist_rows = max(4, (n_numeric + hist_cols - 1) // hist_cols)
    # 2.5 inches per row reproduces the original 15x10 figure at 4 rows
    numeric_df.hist(
        bins=30,
        figsize=(15, 2.5 * hist_rows),
        layout=(hist_rows, hist_cols),
    )
    plt.suptitle('Distribution of Numerical Variables')
    plt.show()

# 2. Boxplots for Outlier Detection
# Good for seeing the spread and spotting dots outside the whiskers (outliers)
for column in numeric_df.columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=numeric_df[column])
    plt.title(f'Boxplot of {column}')
    plt.show()

# 3. Correlation Matrix (Heatmap)
# Checking how variables relate to each other.
# 1 = perfect positive correlation, -1 = perfect negative correlation
if n_numeric > 1:
    plt.figure(figsize=(12, 10))
    correlation_matrix = numeric_df.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title('Correlation Heatmap')
    plt.show()

# 4. Pairplot
# Scatter plots of every numeric variable against every other numeric variable
# This is heavy on performance but great for spotting patterns immediately
if n_numeric > 10:  # Limiting to 10 cols to prevent crashing
    print("Skipping pairplot: Too many columns.")
else:
    # Only rows with no missing numeric values are plotted
    pair_data = numeric_df.dropna()
    if pair_data.empty:
        # No numeric columns, or no complete rows: pairplot has nothing
        # to draw and would raise, so skip it explicitly.
        print("Skipping pairplot: No complete numeric rows to plot.")
    else:
        sns.pairplot(pair_data)
        # pairplot builds a grid of subplots; plt.title would only label
        # the last subplot, so use a figure-level title instead. y > 1
        # lifts it above the top row of axes.
        plt.suptitle('Pairwise Relationships', y=1.02)
        plt.show()

In [None]:
# ==========================================
# PART 7: CATEGORICAL ANALYSIS
# ==========================================

# Frequency of each category in the text-like (object / category) columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

for col in categorical_cols:
    values = df[col]
    n_unique = values.nunique()

    # Columns with 20 or more distinct values (like Names or IDs) are
    # reported instead of plotted
    if n_unique >= 20:
        print(f"\nSkipping plot for {col}: Too many unique values ({n_unique})")
        continue

    # Horizontal bar chart, bars ordered as value_counts() returns them
    plt.figure(figsize=(10, 5))
    sns.countplot(y=values, order=values.value_counts().index)
    plt.title(f'Count of Categories in {col}')
    plt.show()