<a href="https://colab.research.google.com/github/thasniazeez/CODSOFT/blob/main/Thasni_TA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import base64
import html
import os
from io import BytesIO

import matplotlib.pyplot as plt
import pandas as pd

# Helper to generate Base64-encoded chart
def save_chart_to_base64(df, column, chart_type="distribution"):
    """Render a chart for one column and return it as a PNG data URI.

    Args:
        df: DataFrame holding the data to plot.
        column: Label of the column in ``df`` to visualise.
        chart_type: ``"distribution"`` draws a bar chart of the column's
            value counts; ``"boxplot"`` draws a horizontal box plot of the
            column's non-null values.

    Returns:
        A ``data:image/png;base64,...`` string suitable for an ``<img src>``.

    Raises:
        ValueError: If ``chart_type`` is not one of the supported kinds.
    """
    # Reject unknown kinds up front instead of silently emitting a blank image.
    if chart_type not in ("distribution", "boxplot"):
        raise ValueError(
            f"Unsupported chart_type {chart_type!r}; "
            "expected 'distribution' or 'boxplot'"
        )

    # Draw on an explicit figure/axes rather than pyplot's implicit
    # "current figure", so nothing else can end up on this chart.
    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        if chart_type == "distribution":
            counts = df[column].value_counts()
            # A column with no non-null values has nothing to plot; leave the
            # axes empty (but titled) instead of letting the plot call fail.
            if not counts.empty:
                counts.plot(kind="bar", color="skyblue", ax=ax)
            ax.set_title(f"Distribution of {column}")
            ax.set_xlabel(column)
            ax.set_ylabel("Count")
        else:
            ax.boxplot(
                df[column].dropna(),
                vert=False,
                patch_artist=True,
                boxprops=dict(facecolor="skyblue"),
            )
            ax.set_title(f"Boxplot of {column}")
            ax.set_xlabel(column)

        with BytesIO() as buffer:
            fig.savefig(buffer, format="png")
            encoded = base64.b64encode(buffer.getvalue()).decode()
    finally:
        # Always release the figure, even when plotting or saving fails;
        # otherwise figures pile up across calls.
        plt.close(fig)

    return f"data:image/png;base64,{encoded}"

# Generate the report
def _chart_img_tag(df, column, chart_type, label):
    """Return an ``<img>`` tag embedding the chart for ``column``."""
    src = save_chart_to_base64(df, column, chart_type=chart_type)
    # Column names come from the data file, so escape them before placing
    # them inside an HTML attribute.
    alt = html.escape(f"{label} of {column}")
    return f'<img src="{src}" alt="{alt}"><br>'


def generate_html_report(df, report_path="data_analysis_report.html",
                         max_distribution_columns=6):
    """Write an HTML data-quality summary of ``df`` to ``report_path``.

    The report contains, in order: per-column missing-value counts, the
    numeric and categorical column names, duplicate-row counts before and
    after ``drop_duplicates``, columns with at most one distinct value,
    box plots for every numeric column, and value-count bar charts for the
    leading columns.

    Args:
        df: DataFrame to analyse.
        report_path: Destination path of the HTML file.
        max_distribution_columns: How many leading columns get a
            distribution chart. ``None`` charts every column.

    Returns:
        None. The report is written to disk and a confirmation is printed.
    """
    # 1. Missing Values (only columns that actually have some)
    missing_values = df.isnull().sum()
    missing_values_html = (
        missing_values[missing_values > 0]
        .to_frame(name="Missing Count")
        .to_html()
    )

    # 2. Categorize Columns by Data Type
    numeric_columns = df.select_dtypes(include=["number"]).columns
    categorical_columns = df.select_dtypes(include=["object", "category"]).columns
    # Escape names so characters such as "<" or "&" in a header cannot break
    # (or inject into) the generated markup.
    numeric_items = "".join(
        f"<li>{html.escape(str(col))}</li>" for col in numeric_columns
    )
    categorical_items = "".join(
        f"<li>{html.escape(str(col))}</li>" for col in categorical_columns
    )
    column_types_html = f"""
    <h3>Numeric Columns</h3>
    <ul>{numeric_items}</ul>
    <h3>Categorical Columns</h3>
    <ul>{categorical_items}</ul>
    """

    # 3. Duplicates
    duplicates_before = df.duplicated().sum()
    df_no_duplicates = df.drop_duplicates()
    duplicates_after = df_no_duplicates.duplicated().sum()

    # 4. Constant Columns (zero or one distinct non-null value)
    constant_columns = [col for col in df.columns if df[col].nunique() <= 1]
    constant_columns_html = pd.DataFrame(
        constant_columns, columns=["Constant Columns"]
    ).to_html()

    # 5. Boxplots for Numeric Columns
    boxplots_html = "".join(
        _chart_img_tag(df, col, "boxplot", "Boxplot") for col in numeric_columns
    )

    # 6. Distribution Charts for the leading columns
    selected_columns = df.columns[:max_distribution_columns]
    distributions_html = "".join(
        _chart_img_tag(df, col, "distribution", "Distribution")
        for col in selected_columns
    )

    #  HTML
    html_report = f"""
    <html>
        <head>
            <meta charset="utf-8">
            <title>Data Analysis Report</title>
        </head>
        <body>
            <h1>Data Analysis Summary</h1>
            <h2>1. Missing Values</h2>
            {missing_values_html}

            <h2>2. Column Types</h2>
            {column_types_html}

            <h2>3. Duplicate Rows</h2>
            <p>Before: {duplicates_before}, After: {duplicates_after}</p>

            <h2>4. Constant Columns</h2>
            {constant_columns_html}

            <h2>5. Outlier Visualization (Boxplots)</h2>
            {boxplots_html}

            <h2>6. Distribution Charts</h2>
            {distributions_html}
        </body>
    </html>
    """

    # Saving the HTML report. The encoding is pinned to match the declared
    # charset so non-ASCII column names or values survive on every platform.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(html_report)
    print(f"Report saved to {report_path}")


# Load the assignment dataset from the Colab workspace, then build the
# HTML report at its default output path.
df = pd.read_csv(
    "/content/DS_Python_Assignment.xlsx - Data.csv",
)
generate_html_report(df=df)


Report saved to data_analysis_report.html
