In [13]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF
import re

# Step 1: Importing the CSV File
file_path = 'LX_1.csv'
data = pd.read_csv(file_path)
print("Data Imported Successfully")

# Step 2: Data Summary and Preprocessing
# The summary is computed before imputation, so it describes the data as loaded.
summary = data.describe(include='all')
# Fill missing values with per-column means of the numeric columns only.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.mean()
# raises TypeError if any non-numeric (e.g. string) column is present.
# Missing values in non-numeric columns are left untouched.
data = data.fillna(data.mean(numeric_only=True))
summary.to_csv('data_summary.csv')
print("Data Summary and Preprocessing Completed and File Saved")

# Step 3: Data Analysis
def analyze_data(data):
    """Compute descriptive statistics for every numeric column of ``data``.

    Args:
        data: A pandas DataFrame. Only columns selected by
            ``select_dtypes(include=np.number)`` are analysed; missing
            values are dropped per column before computing statistics.

    Returns:
        dict mapping each numeric column name to a dict with the keys
        'Median', 'Mode', 'Range', 'Interquartile Range (IQR)', 'Skewness',
        'Kurtosis', 'Percentiles', 'Quartiles', 'Standard Deviation' and
        'Variance'. 'Percentiles' and 'Quartiles' both hold the 25th, 50th
        and 75th percentiles. Standard deviation and variance use NumPy's
        defaults (population form, ddof=0). For a column with no non-missing
        values, 'Mode' is None and every other statistic is NaN.
    """
    analysis_results = {}

    for column in data.select_dtypes(include=np.number).columns:
        # Work on a plain ndarray so the NumPy/SciPy reductions below do not
        # depend on how pandas objects dispatch to them.
        values = data[column].dropna().to_numpy()

        if values.size == 0:
            # Nothing to summarise (e.g. an all-NaN column): np.ptp and
            # np.percentile would raise on empty input, so emit placeholders.
            analysis_results[column] = {
                'Median': np.nan,
                'Mode': None,
                'Range': np.nan,
                'Interquartile Range (IQR)': np.nan,
                'Skewness': np.nan,
                'Kurtosis': np.nan,
                'Percentiles': np.full(3, np.nan),
                'Quartiles': np.full(3, np.nan),
                'Standard Deviation': np.nan,
                'Variance': np.nan
            }
            continue

        # Most frequent value; ties resolve to the smallest value because
        # np.unique returns sorted values and argmax picks the first maximum.
        # This avoids indexing the result of scipy.stats.mode, whose shape
        # differs between SciPy versions (array vs. scalar for 1-D input).
        unique_values, counts = np.unique(values, return_counts=True)
        mode_val = unique_values[np.argmax(counts)]

        quartiles = np.percentile(values, [25, 50, 75])

        analysis_results[column] = {
            'Median': np.median(values),
            'Mode': mode_val,
            'Range': np.ptp(values),
            'Interquartile Range (IQR)': stats.iqr(values),
            'Skewness': stats.skew(values),
            'Kurtosis': stats.kurtosis(values),
            'Percentiles': quartiles,
            # Separate array so mutating one entry cannot affect the other.
            'Quartiles': quartiles.copy(),
            'Standard Deviation': np.std(values),
            'Variance': np.var(values)
        }

    return analysis_results

# Run the per-column analysis, then lay the results out with one row per
# analysed column (statistics as columns) and persist them as CSV.
analysis_results = analyze_data(data)
analysis_df = pd.DataFrame(analysis_results).T
analysis_df.to_csv('data_analysis.csv')
print("Data Analysis Completed and File Saved")

# Step 4: Data Visualization
def sanitize_filename(filename):
    """Return ``filename`` with each character that is not an ASCII letter,
    digit or underscore replaced by a single underscore."""
    unsafe_chars = re.compile(r'[^a-zA-Z0-9_]')
    cleaned = unsafe_chars.sub('_', filename)
    return cleaned

def create_visualizations(data):
    """Save a histogram (with KDE) and a boxplot PNG for each numeric column.

    Files are written to the current working directory as
    ``<sanitized column>_distribution.png`` and
    ``<sanitized column>_boxplot.png``.
    """
    numeric_columns = data.select_dtypes(include=np.number).columns

    for column in numeric_columns:
        file_stem = sanitize_filename(column)
        series = data[column]

        # Histogram with a kernel density estimate overlaid.
        hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
        sns.histplot(series, kde=True, ax=hist_ax)
        hist_ax.set_title(f'Distribution of {column}')
        hist_fig.savefig(f'{file_stem}_distribution.png')
        plt.close(hist_fig)  # release the figure before creating the next one

        # Horizontal boxplot of the same column.
        box_fig, box_ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(x=series, ax=box_ax)
        box_ax.set_title(f'Boxplot of {column}')
        box_fig.savefig(f'{file_stem}_boxplot.png')
        plt.close(box_fig)  # release the figure before the next column

# Write the per-column histogram and boxplot PNG files for the preprocessed data.
create_visualizations(data)
print("Data Visualization Completed and Files Saved")

# Step 5: Report Generation
def generate_report(summary, analysis_results):
    """Assemble ``data_report.pdf`` from the summary, analysis and saved plots.

    Args:
        summary: DataFrame whose columns are rendered one per line as
            ``"<column>: <dict of values>"`` on the first page.
        analysis_results: dict mapping column name -> dict of statistic
            name -> value. Its keys also determine which plot images are
            embedded, so the function no longer relies on a module-level
            ``data`` variable.

    For every key of ``analysis_results`` the files
    ``<sanitized key>_distribution.png`` and ``<sanitized key>_boxplot.png``
    must already exist in the current working directory.
    """
    pdf = FPDF()
    pdf.add_page()

    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Data Summary", ln=True, align='C')

    # NOTE(review): cell() renders a single line, so long summary dicts may
    # run past the page edge -- consider multi_cell() if wrapping is needed.
    for col in summary.columns:
        pdf.cell(200, 10, txt=f"{col}: {summary[col].to_dict()}", ln=True, align='L')

    pdf.add_page()
    pdf.cell(200, 10, txt="Data Analysis", ln=True, align='C')

    # Named col_stats rather than stats to avoid shadowing scipy's stats module.
    for col, col_stats in analysis_results.items():
        pdf.cell(200, 10, txt=f"{col}:", ln=True, align='L')
        for stat, value in col_stats.items():
            pdf.cell(200, 10, txt=f"    {stat}: {value}", ln=True, align='L')

    pdf.add_page()
    pdf.cell(200, 10, txt="Data Visualizations", ln=True, align='C')

    # One page per image: distribution plot first, then the boxplot.
    for column in analysis_results:
        sanitized_column = sanitize_filename(column)
        pdf.add_page()
        pdf.image(f'{sanitized_column}_distribution.png', x=10, y=10, w=180)
        pdf.add_page()
        pdf.image(f'{sanitized_column}_boxplot.png', x=10, y=10, w=180)

    pdf.output("data_report.pdf")
    print("Report Generation Completed and File Saved")

# Build the PDF report from the pre-imputation summary and the analysis results.
generate_report(summary, analysis_results)


Data Imported Successfully
Data Summary and Preprocessing Completed and File Saved
Data Analysis Completed and File Saved
Data Visualization Completed and Files Saved
Report Generation Completed and File Saved
