# Databricks notebook source
# Basic Data Exploration Tutorial
 
This tutorial demonstrates how to load, explore, and understand datasets using Databricks Free Edition.
 ## Learning Objectives
 - Load data from various sources
 - Explore data structure and characteristics
 - Perform basic statistical analysis
 - Handle common data quality issues

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Obtain the Spark session. Databricks already provides one, so
# getOrCreate() is expected to reuse it instead of starting a second session.
spark = (
    SparkSession.builder
    .appName("DataExploration")
    .getOrCreate()
)

# Quick sanity output: confirms the session is live and the imports succeeded.
print("Spark version:", spark.version)
print("Python libraries loaded successfully!")

In [0]:
# Parameters for the synthetic sales dataset: row count plus the pools of
# values that each categorical/date column is sampled from.
n_records = 1_000

# One calendar day per record, starting on 2023-01-01.
dates = pd.date_range(start='2023-01-01', periods=n_records, freq='D')

products = ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Tablet', 'Phone']
regions = ['North', 'South', 'East', 'West']

# Twenty sales representatives, labelled Rep_1 through Rep_20.
sales_reps = [f'Rep_{rep_no}' for rep_no in range(1, 21)]

In [0]:
# Create DataFrame
# All random columns are drawn from a single seeded generator so the synthetic
# dataset, and every statistic computed from it, is the same on every run.
# A dedicated Generator is used so NumPy's global random state is left alone.
rng = np.random.default_rng(42)

sales_data = pd.DataFrame({
    # Categorical/date columns: sampled with replacement from their pools.
    'date': rng.choice(dates, size=n_records),
    'product': rng.choice(products, size=n_records),
    'region': rng.choice(regions, size=n_records),
    'sales_rep': rng.choice(sales_reps, size=n_records),
    # Integer quantity in 1..99 (the upper bound is exclusive).
    'quantity': rng.integers(1, 100, size=n_records),
    # Price rounded to cents; satisfaction score rounded to one decimal.
    'unit_price': rng.uniform(10, 1000, size=n_records).round(2),
    'customer_satisfaction': rng.uniform(1, 5, size=n_records).round(1),
})


In [0]:
# Calculate total sales
# unit_price is stored to two decimals, but multiplying it by quantity in
# binary floating point can leave the product slightly off a whole cent.
# Round back to two decimals so total_sales has the same precision as unit_price.
sales_data['total_sales'] = (
    sales_data['quantity'] * sales_data['unit_price']
).round(2)

In [0]:
# Hand the pandas frame to Spark so the remaining steps can use the
# Spark DataFrame API.
df = spark.createDataFrame(data=sales_data)

# Report the row count, then preview the first five rows.
print(f"Created dataset with {df.count()} records")
df.show(n=5)