In [3]:
import plotly.graph_objects as go

# Pipeline stage labels and the brand palette used to color their markers
stages = [
    "Collection", "Assessment", "Cleaning", "Transform",
    "Feature Eng", "Validation", "Final Data",
]
brand_colors = [
    '#1FB8CD', '#DB4545', '#2E8B57', '#5D878F',
    '#D2BA4C', '#B4413C', '#964325',
]

# One integer x-position per stage; every stage is drawn on the y = 0 baseline
x_pos = list(range(len(stages)))

fig = go.Figure()

# Connector trace: a (start, end, None) triple for each pair of adjacent stages,
# so the gray line is described as one segment per link
line_x = [pt for left, right in zip(x_pos, x_pos[1:]) for pt in (left, right, None)]
line_y = [0, 0, None] * (len(x_pos) - 1)
connector = go.Scatter(
    x=line_x,
    y=line_y,
    mode='lines',
    line=dict(color='gray', width=4),
    showlegend=False,
    hoverinfo='skip',
    cliponaxis=False,
)
fig.add_trace(connector)

# Stage markers: a single-point trace per stage, named after the stage so it
# appears in the legend; the palette index wraps if stages outnumber colors
for idx, (x_coord, label) in enumerate(zip(x_pos, stages)):
    square = dict(size=50, color=brand_colors[idx % len(brand_colors)], symbol='square')
    fig.add_trace(go.Scatter(
        x=[x_coord],
        y=[0],
        mode='markers',
        marker=square,
        name=label,
        cliponaxis=False,
        hovertemplate=f'<b>{label}</b><extra></extra>',
    ))

fig.update_layout(title='Data Pipeline Flow', showlegend=True)

# Tick the x-axis with the stage names; strip all decoration from the y-axis
fig.update_xaxes(tickvals=x_pos, ticktext=stages, title='Stage #', showgrid=False)
fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False)

# Static export is disabled because write_image requires the kaleido package:
# fig.write_image('pipeline_flowchart2.png')

# Render the figure in the notebook
fig.show()

In [5]:
import plotly.express as px
import plotly.graph_objects as go
import json

# Share of total project time spent in each phase, in percent
data_json = {
    "time_distribution": [
        {"phase": "Data Collection & Sourcing", "percentage": 20},
        {"phase": "Data Cleaning & Preprocessing", "percentage": 50},
        {"phase": "Exploratory Data Analysis", "percentage": 15},
        {"phase": "Model Building & Training", "percentage": 10},
        {"phase": "Model Evaluation & Deployment", "percentage": 5},
    ]
}

# Extract parallel lists of phase names and their percentages
phases = [item["phase"] for item in data_json["time_distribution"]]
percentages = [item["percentage"] for item in data_json["time_distribution"]]

# Short axis labels (15 characters or fewer), keyed by the full phase name so a
# label can never drift out of step with its bar if the data is reordered.
phase_abbreviations = {
    "Data Collection & Sourcing": "Data Collect",
    "Data Cleaning & Preprocessing": "Data Cleaning",
    "Exploratory Data Analysis": "EDA",
    "Model Building & Training": "Model Build",
    "Model Evaluation & Deployment": "Model Deploy",
}
# A phase with no abbreviation falls back to its full name (which may exceed
# the 15-character target) rather than being dropped or mislabeled.
abbreviated_phases = [phase_abbreviations.get(phase, phase) for phase in phases]

# Define the 5 primary brand colors (one per bar)
colors = ['#1FB8CD', '#DB4545', '#2E8B57', '#5D878F', '#D2BA4C']

# Create bar chart; each bar carries its percentage as an outside text label
fig = go.Figure(data=[
    go.Bar(
        x=abbreviated_phases,
        y=percentages,
        marker_color=colors,
        text=[f"{p}%" for p in percentages],
        textposition='outside',
        cliponaxis=False  # keep the outside labels from being clipped at the plot edge
    )
])

# Update layout
fig.update_layout(
    title="Time Distribution in Data Science",
    xaxis_title="Project Phase",
    yaxis_title="Time (%)",
    showlegend=False
)

# Static export is disabled because write_image requires the kaleido package:
# fig.write_image("data_science_time_distribution.png")

# Display the figure; without this the cell builds the chart but shows nothing
fig.show()

In [6]:
# Create a comprehensive comparison table of data preprocessing approaches and techniques

import pandas as pd

def _print_and_save(frame, heading, rule_width, csv_name, saved_note):
    """Print a heading, an '=' rule and the full table, write it to CSV, then print a note.

    Parameters:
        frame: DataFrame to print (without its index) and to write to disk.
        heading: Text printed above the rule.
        rule_width: Number of '=' characters in the rule.
        csv_name: Path of the CSV file to write (index column omitted).
        saved_note: Confirmation text printed after the file is written.
    """
    print(heading)
    print("=" * rule_width)
    print(frame.to_string(index=False))
    frame.to_csv(csv_name, index=False)
    print(saved_note)


# --- Table 1: preprocessing approaches, one list entry per approach ---------
preprocessing_comparison = {
    'Approach': [
        'Manual Data Cleaning', 'Automated Pipelines', 'Machine Learning-Based',
        'Rule-Based Systems', 'Statistical Methods', 'Cloud-Based Solutions',
    ],
    'Scalability': ['Low', 'High', 'Medium', 'Medium', 'Medium', 'Very High'],
    'Accuracy': ['Variable', 'Consistent', 'High', 'Medium', 'Medium', 'High'],
    'Time_Efficiency': ['Low', 'High', 'Medium', 'High', 'Medium', 'Very High'],
    'Resource_Requirements': [
        'High (Human)', 'Medium (Setup)', 'High (Computational)',
        'Low', 'Medium', 'Variable (Cost-based)',
    ],
    'Flexibility': ['Very High', 'Medium', 'High', 'Low', 'Medium', 'High'],
    'Maintenance': ['High', 'Medium', 'High', 'Low', 'Medium', 'Low'],
    'Best_Use_Cases': [
        'Small datasets, unique problems',
        'Regular, structured processes',
        'Complex patterns, large datasets',
        'Well-defined business rules',
        'Outlier detection, validation',
        'Enterprise-scale operations',
    ],
}

df_comparison = pd.DataFrame(preprocessing_comparison)

_print_and_save(
    df_comparison,
    "Data Preprocessing Approaches Comparison",
    60,
    'data_preprocessing_approaches_comparison.csv',
    "\n\nTable saved as 'data_preprocessing_approaches_comparison.csv'",
)

# --- Table 2: data quality dimensions, one list entry per dimension ---------
quality_metrics = {
    'Quality_Dimension': [
        'Accuracy', 'Completeness', 'Consistency', 'Validity',
        'Uniqueness', 'Timeliness', 'Integrity', 'Relevance',
    ],
    'Definition': [
        'Data correctly represents real-world values',
        'All required data points are present',
        'Data is uniform across different sources',
        'Data conforms to defined formats and rules',
        'No duplicate or redundant records exist',
        'Data is current and up-to-date',
        'Data maintains logical relationships',
        'Data is applicable to the intended use case',
    ],
    'Measurement_Method': [
        'Compare against authoritative sources',
        'Calculate missing value percentages',
        'Cross-system validation checks',
        'Rule-based validation testing',
        'Duplicate detection algorithms',
        'Timestamp and freshness analysis',
        'Referential integrity constraints',
        'Domain expert assessment',
    ],
    'Common_Issues': [
        'Data entry errors, sensor malfunctions',
        'Missing fields, incomplete records',
        'Different formats across systems',
        'Invalid formats, out-of-range values',
        'Duplicate records, redundant data',
        'Stale data, delayed updates',
        'Broken relationships, orphaned records',
        'Irrelevant or outdated information',
    ],
    'Impact_if_Poor': [
        'Incorrect analysis and decisions',
        'Biased results, incomplete insights',
        'Integration failures, confusion',
        'Processing errors, system failures',
        'Inflated metrics, skewed analysis',
        'Outdated insights, poor decisions',
        'Data corruption, unreliable results',
        'Wasted resources, irrelevant outcomes',
    ],
}

df_quality = pd.DataFrame(quality_metrics)

_print_and_save(
    df_quality,
    "\n\nData Quality Dimensions and Assessment",
    70,
    'data_quality_dimensions.csv',
    "\n\nQuality metrics table saved as 'data_quality_dimensions.csv'",
)

Data Preprocessing Approaches Comparison
              Approach Scalability   Accuracy Time_Efficiency Resource_Requirements Flexibility Maintenance                   Best_Use_Cases
  Manual Data Cleaning         Low   Variable             Low          High (Human)   Very High        High  Small datasets, unique problems
   Automated Pipelines        High Consistent            High        Medium (Setup)      Medium      Medium    Regular, structured processes
Machine Learning-Based      Medium       High          Medium  High (Computational)        High        High Complex patterns, large datasets
    Rule-Based Systems      Medium     Medium            High                   Low         Low         Low      Well-defined business rules
   Statistical Methods      Medium     Medium          Medium                Medium      Medium      Medium    Outlier detection, validation
 Cloud-Based Solutions   Very High       High       Very High Variable (Cost-based)        High         Low      