<a href="https://colab.research.google.com/github/wairiukoirwine/AI-DATA-ANALYZER/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import os

# ===================== Upload or Input File =====================
def _read_table(source, ext):
    """Parse *source* (a file path or binary buffer) as CSV or JSON, chosen by *ext*."""
    if ext == 'csv':
        return pd.read_csv(source)
    if ext == 'json':
        return pd.read_json(source)
    raise ValueError("Unsupported file type. Use CSV or JSON.")


def load_file():
    """Load a CSV/JSON file into a DataFrame.

    When ``google.colab`` is importable, the Colab upload widget is used and
    the first uploaded file is parsed. Otherwise the user is prompted for a
    file path.

    Returns:
        pandas.DataFrame: the parsed contents of the chosen file.

    Raises:
        ValueError: if nothing was uploaded, or if the file extension is
            neither ``csv`` nor ``json``.
    """
    # Only the import decides which branch runs. Keeping the parsing outside
    # this ``try`` prevents an ImportError raised while reading the file
    # (e.g. a missing optional parser dependency) from being mistaken for
    # "not running in Colab" and silently falling through to the prompt.
    try:
        from google.colab import files
    except ImportError:
        files = None

    if files is None:
        # Not in Colab: ask for a path instead.
        file_path = input("Enter path to CSV or JSON file: ").strip()
        ext = file_path.rsplit('.', 1)[-1].lower()
        return _read_table(file_path, ext)

    uploaded = files.upload()
    if not uploaded:
        # An empty mapping (presumably a cancelled upload) would otherwise
        # surface as an opaque IndexError.
        raise ValueError("No file was uploaded.")

    # If several files were uploaded, only the first one is analyzed.
    file_name = next(iter(uploaded))
    print(f"Uploaded: {file_name}")
    ext = file_name.rsplit('.', 1)[-1].lower()
    return _read_table(io.BytesIO(uploaded[file_name]), ext)

# ===================== Clean Data =====================
def clean_data(df):
    """Return a cleaned copy of *df*; the caller's frame is left untouched.

    Cleaning steps, in order:
      1. Missing values in numeric columns are replaced by the column mean.
      2. Missing values in object (text) columns are replaced by 'Unknown'.
      3. Fully duplicated rows are dropped (first occurrence is kept).

    Args:
        df: pandas.DataFrame to clean.

    Returns:
        pandas.DataFrame: a new, cleaned frame.
    """
    # Work on a copy: assigning the filled columns back into ``df`` itself
    # would silently modify the caller's DataFrame while the returned object
    # (from drop_duplicates) is a different frame.
    cleaned = df.copy()

    for col in cleaned.select_dtypes(include=np.number).columns.tolist():
        cleaned[col] = cleaned[col].fillna(cleaned[col].mean())

    for col in cleaned.select_dtypes(include='object').columns.tolist():
        cleaned[col] = cleaned[col].fillna('Unknown')

    return cleaned.drop_duplicates()

# ===================== Analyze Data =====================
def analyze_data(df):
    """Print descriptive statistics for *df* and report its column groups.

    Numeric columns get a ``describe()`` table and a correlation matrix;
    each object (text) column gets its value counts.

    Args:
        df: pandas.DataFrame to inspect.

    Returns:
        tuple[list, list]: ``(numeric column names, categorical column names)``.
    """
    num_names = df.select_dtypes(include=np.number).columns.tolist()
    cat_names = df.select_dtypes(include='object').columns.tolist()

    print("\n===== Numeric Data Analysis =====")
    if not num_names:
        print("No numeric columns found.")
    else:
        numeric_view = df[num_names]
        print(numeric_view.describe())
        print("\nCorrelation Matrix:")
        print(numeric_view.corr())

    print("\n===== Categorical Data Analysis =====")
    if not cat_names:
        print("No categorical columns found.")
    for name in cat_names:
        print(f"\nValue counts for {name}:")
        print(df[name].value_counts())

    return num_names, cat_names

# ===================== Visualize Data =====================
def visualize_data(df, numeric_cols):
    """Show a histogram grid, a boxplot and a correlation heatmap.

    Args:
        df: pandas.DataFrame holding the data.
        numeric_cols: names of the numeric columns to plot. When empty, a
            notice is printed and nothing is drawn.
    """
    if not numeric_cols:
        print("No numeric data to visualize.")
        return

    numeric_view = df[numeric_cols]

    # Histograms: one panel per numeric column.
    numeric_view.hist(bins=10, figsize=(12, 6), color='skyblue', edgecolor='black')
    plt.suptitle("Histogram of Numeric Columns")
    plt.show()

    # Boxplot: reshape to long form (one row per value) so every column
    # shares a single axis.
    long_form = df.melt(value_vars=numeric_cols, var_name='Variable', value_name='Value')
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=long_form, x='Variable', y='Value')
    plt.title("Boxplot of Numeric Columns")
    plt.show()

    # Heatmap of pairwise correlations, annotated with the coefficients.
    plt.figure(figsize=(8, 6))
    sns.heatmap(numeric_view.corr(), annot=True, cmap='coolwarm')
    plt.title("Correlation Heatmap")
    plt.show()

# ===================== Summarize Findings =====================
def summarize_findings(df, numeric_cols, categorical_cols):
    """Print a one-line summary for every numeric and categorical column.

    Numeric columns report mean, median, max and min; categorical columns
    report their most frequent value.

    Args:
        df: pandas.DataFrame holding the data.
        numeric_cols: names of numeric columns (may be empty or None).
        categorical_cols: names of categorical columns (may be empty or None).
    """
    print("\n===== Summary of Findings =====")

    for col in numeric_cols or []:
        values = df[col]
        print(f"- {col} -> Mean: {values.mean():.2f}, Median: {values.median():.2f}, Max: {values.max()}, Min: {values.min()}")

    for col in categorical_cols or []:
        counts = df[col].value_counts()
        # value_counts() drops missing values, so an all-missing column (or a
        # frame with no rows) yields an empty result and idxmax() would raise
        # ValueError; report a placeholder instead of crashing.
        top_category = counts.idxmax() if not counts.empty else "N/A"
        print(f"- {col} -> Most common category: {top_category}")

# ===================== Save Clean Data =====================
def save_clean_data(df, file_name="cleaned_data.csv"):
    """Write *df* to *file_name* as CSV, replacing any existing file.

    The index is not written and rows are terminated with ``\\n``.

    Args:
        df: pandas.DataFrame to save.
        file_name: destination path; defaults to ``cleaned_data.csv``.
    """
    # ``lineterminator`` is the current keyword (pandas >= 1.5). The older
    # spelling ``line_terminator`` was removed in pandas 2.0 and raises
    # TypeError there.
    df.to_csv(file_name, index=False, lineterminator='\n')  # normalize line endings
    print(f"\nCleaned data saved to {file_name}")

# ===================== Main =====================
def run_analyzer():
    """Run the full pipeline: load, clean, analyze, visualize, summarize, save.

    The cleaned dataset is written to ``cleaned_data.csv`` in the current
    working directory.
    """
    df = load_file()
    print("\nDataset Loaded Successfully!")
    print(df.head())

    df = clean_data(df)
    numeric_cols, categorical_cols = analyze_data(df)
    visualize_data(df, numeric_cols)
    summarize_findings(df, numeric_cols, categorical_cols)

    # Save cleaned dataset safely
    save_clean_data(df, "cleaned_data.csv")

    # The check mark was previously stored as mojibake (UTF-8 bytes decoded
    # as cp1252), which printed as garbage characters.
    print("\n✅ Analysis Complete!")

# ===================== Run =====================
# Guarded so that importing this file does not start the interactive
# pipeline; in a notebook cell or when run as a script, __name__ is
# "__main__" and the analyzer still runs.
if __name__ == "__main__":
    run_analyzer()
