# Data Analysis Example
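This notebook builds a small example DataFrame, prints its summary statistics and correlation matrix, and then visualizes the data with matplotlib and seaborn.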

In [ ]:
import pandas as pd
import numpy as np
# Build the small example frame used throughout the notebook. The bare `df`
# on the last line makes the cell display the frame, so the table shown as
# this cell's output is reproduced when the notebook is re-run from scratch.
df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
df

   x  y
0  1  4
1  2  5
2  3  6

In [ ]:
# Statistical analysis of the dataframe
def analyze_dataframe(df):
    """Print an overview of a dataframe and return its summary statistics.

    Prints, in order: the shape, column names and dtypes; the ``describe()``
    table; the per-column count of missing values (only for columns that have
    any); and, when more than one numeric column is present, the correlation
    matrix of the numeric columns.

    Args:
        df: The ``pandas.DataFrame`` to analyze.

    Returns:
        The ``DataFrame`` produced by ``df.describe()``.
    """
    print('Dataset Overview:')
    print('-' * 50)
    print(f'Shape: {df.shape}')
    print(f'Columns: {list(df.columns)}')
    print(f'Data types:\n{df.dtypes}')

    # Basic statistics
    stats = df.describe()
    print('\nStatistical Summary:')
    print('-' * 50)
    print(stats)

    # Report only the columns that actually contain missing values
    missing = df.isnull().sum()
    if missing.any():
        print('\nMissing Values:')
        print('-' * 50)
        print(missing[missing > 0])

    # Correlations are only meaningful with at least two numeric columns.
    # The guard counts numeric columns, so the computation must ignore the
    # non-numeric ones as well: a plain df.corr() raises on string/object
    # columns in pandas >= 2.0.
    if len(df.select_dtypes(include=['number']).columns) > 1:
        print('\nCorrelation Matrix:')
        print('-' * 50)
        print(df.corr(numeric_only=True))

    return stats

# Analyze the example frame and keep the returned summary table in `statistics`
statistics = analyze_dataframe(df=df)
print("\nAnalysis complete!")

Dataset Overview:
--------------------------------------------------
Shape: (3, 2)
Columns: ['x', 'y']
Data types:
x    int64
y    int64
dtype: object

Statistical Summary:
--------------------------------------------------
              x         y
count  3.000000  3.000000
mean   2.000000  5.000000
std    1.000000  1.000000
min    1.000000  4.000000
25%    1.500000  4.500000
50%    2.000000  5.000000
75%    2.500000  5.500000
max    3.000000  6.000000

Correlation Matrix:
--------------------------------------------------
     x    y
x  1.0  1.0
y  1.0  1.0

Analysis complete!

## Visualization
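This section plots the example data with matplotlib, using seaborn for styling: a line plot, a scatter plot, a stacked bar chart, and a histogram.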

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

def create_visualizations(df):
    """Draw a 2x2 grid of charts for the ``x`` and ``y`` columns of ``df``.

    The four panels are a line plot, a scatter plot, a stacked bar chart and
    a histogram. A text box holding both column means and their correlation
    is added near the lower-left corner of the figure.

    Args:
        df: DataFrame that has ``'x'`` and ``'y'`` columns.

    Returns:
        The matplotlib figure containing the four panels.
    """
    sns.set_style('whitegrid')

    xs, ys = df['x'], df['y']
    positions = range(len(df))

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    (line_ax, scatter_ax), (bar_ax, hist_ax) = axes
    fig.suptitle('Data Analysis Visualizations', fontsize=16, fontweight='bold')

    # Top-left: y against x as a connected line with markers
    line_ax.plot(xs, ys, marker='o', linestyle='-', linewidth=2, markersize=8)
    line_ax.set(title='Line Plot: X vs Y', xlabel='X values', ylabel='Y values')
    line_ax.grid(True, alpha=0.3)

    # Top-right: the same points as a scatter
    scatter_ax.scatter(xs, ys, s=100, c='red', alpha=0.6, edgecolors='black')
    scatter_ax.set(title='Scatter Plot: X vs Y', xlabel='X values', ylabel='Y values')

    # Bottom-left: x bars with y bars stacked on top, one stack per row
    bar_ax.bar(positions, xs, color='blue', alpha=0.7, label='X')
    bar_ax.bar(positions, ys, color='green', alpha=0.7, bottom=xs, label='Y')
    bar_ax.set(title='Stacked Bar Chart', xlabel='Index', ylabel='Values')
    bar_ax.legend()

    # Bottom-right: side-by-side histograms of both columns
    hist_ax.hist(
        [xs, ys],
        bins=5,
        label=['X', 'Y'],
        alpha=0.7,
        color=['blue', 'green'],
    )
    hist_ax.set(title='Histogram of Values', xlabel='Value', ylabel='Frequency')
    hist_ax.legend()

    # Tighten the layout first so the panels do not overlap
    plt.tight_layout()

    # Summary statistics shown as a boxed note on the figure
    summary_lines = [
        f'Mean X: {xs.mean():.2f}',
        f'Mean Y: {ys.mean():.2f}',
        f'Correlation: {xs.corr(ys):.2f}',
    ]
    note_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    fig.text(0.02, 0.02, '\n'.join(summary_lines), fontsize=10, bbox=note_box)

    return fig

# Build the figure for the example frame, render it, then report completion
figure = create_visualizations(df=df)
plt.show()
print("Visualizations created successfully!")

<Figure size 1200x1000 with 4 Axes>

Visualizations created successfully!