In [None]:
import sys
import os
import importlib
import pandas as pd
import numpy as np
# Put ../data_model/ (resolved to an absolute path) first on sys.path so that
# modules in that directory can be imported from this notebook.
sys.path.insert(0, os.path.abspath("../data_model/"))

In [None]:
# Directory layout, relative to the notebook's working directory.
reports_dir = "../reports"
processed_dir = "../data/processed"

# Input: CSV read by the loading cell.
data_model_output_file = os.path.join(processed_dir, "data_model_output.csv")
# Output: Word document written by generate_summary_document().
summary_document = os.path.join(reports_dir, "variable_summary_for_appendix.docx")

In [None]:
# Load the data-model output and keep only the rows whose 'is_valid_record'
# flag equals True (the explicit comparison also rejects missing flags).
# .copy() makes the filtered frame independent of the frame returned by
# read_csv, so assigning new columns to it later does not raise
# SettingWithCopyWarning.
data_model_output_df = pd.read_csv(data_model_output_file)
data_model_output_df = data_model_output_df[data_model_output_df['is_valid_record']==True].copy()
data_model_output_df.shape

In [None]:
# Preview the first rows of the filtered frame.
data_model_output_df.head()

In [None]:
def create_summary_table(df, col, weight_col=None):
    """
    Build a frequency table for one column.

    Responses are ordered from most to least frequent. Missing values in
    `col` are excluded from every figure, because both `value_counts()` and
    `groupby()` drop them by default.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        col (str): Column to analyze.
        weight_col (str, optional): Column containing weights. Defaults to None.

    Returns:
        pd.DataFrame: One row per distinct response, with the columns
            'Response', 'Number of Respondents', 'Percentage(%)',
            'Weighted Percentage(%)' and 'Cumulative Percentage(%)'.
            Percentages are rounded to 2 decimals. When `weight_col` is not
            given, 'Weighted Percentage(%)' holds None for every row.
    """
    # Counts and unweighted shares, most frequent response first.
    sorted_value_counts = df[col].value_counts().sort_values(ascending=False)
    percentages = df[col].value_counts(normalize=True) * 100
    sorted_percentages = percentages.loc[sorted_value_counts.index]

    if weight_col:
        # Share of the total weight carried by each response, in the same
        # order as the counts.
        weights = df.groupby(col)[weight_col].sum()
        sorted_weights = weights.loc[sorted_value_counts.index]
        weighted_percentages = ((sorted_weights / sorted_weights.sum()) * 100).round(2)
    else:
        # Placeholder column. It is deliberately NOT rounded: rounding an
        # object-dtype Series of None is not reliable across pandas versions.
        weighted_percentages = pd.Series(
            [None] * len(sorted_value_counts),
            index=sorted_value_counts.index,
            dtype=object,
        )

    # Running total of the unweighted shares (computed before rounding).
    cumulative_percentages = sorted_percentages.cumsum()

    output_df = pd.DataFrame({
        'Number of Respondents': sorted_value_counts,
        'Percentage(%)': sorted_percentages.round(2),
        'Weighted Percentage(%)': weighted_percentages,
        'Cumulative Percentage(%)': cumulative_percentages.round(2),
    })

    # Name the index explicitly instead of renaming a column called `col`
    # after reset_index(): whether value_counts() names its index after the
    # column depends on the pandas version, so the rename could silently
    # leave the first column called 'index'.
    return output_df.rename_axis('Response').reset_index()

In [None]:
# Share of valid records per access mode (fractions that sum to 1).
data_model_output_df['access_mode_label'].value_counts(normalize=True)

In [None]:
# Spot-check the summary helper on the market-segment column.
# NOTE(review): this cell weights by 'weight_departing_only' while the
# document-generation call uses 'weight_departing_and_arriving' - confirm
# the difference is intended.
summary_df = create_summary_table(
    df=data_model_output_df,
    col='marketsegment_label',
    weight_col='weight_departing_only',
)
summary_df

In [None]:
# Number of valid records per passenger type.
data_model_output_df['passenger_type_label'].value_counts()

In [None]:
from docx import Document

def _add_dataframe_table(doc, frame):
    """Append `frame` to `doc` as a 'Table Grid' table: a header row, then one row per record."""
    table = doc.add_table(rows=1, cols=frame.shape[1])
    table.style = 'Table Grid'

    for i, column_name in enumerate(frame.columns):
        table.cell(0, i).text = str(column_name)

    # itertuples() keeps each column's own dtype. iterrows() would upcast an
    # all-numeric row to float, so integer counts could be written as "3.0".
    for record in frame.itertuples(index=False, name=None):
        cells = table.add_row().cells
        for i, value in enumerate(record):
            cells[i].text = str(value)


def generate_summary_document(df, weight_col=None, segment_cols=None, output_file='summary_tables.docx'):
    """
    Generate a Word document with summary tables for every '*_label' column
    of a dataframe, optionally repeated for each segment.

    When `segment_cols` is given, the document starts with a count table for
    each segment column. Rows are then grouped by the combination of their
    segment-column values (joined with " | "), and a section of summary
    tables is written for each combination, in order of first appearance.
    Without `segment_cols`, a single "All Data" section is written.

    The input dataframe is not modified.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        weight_col (str, optional): Column containing weights. Defaults to None.
        segment_cols (list of str or str, optional): Column(s) to segment by.
            A single column name is treated as a one-element list.
            Defaults to None.
        output_file (str): Path to save the generated Word document. Its
            parent directory is created if it does not exist.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(segment_cols, str):
        segment_cols = [segment_cols]
    segment_cols = list(segment_cols) if segment_cols else []

    doc = Document()

    # Initial summary of segment columns (missing values are counted too).
    if segment_cols:
        doc.add_heading("Segment Columns Summary", level=1)
        for col in segment_cols:
            doc.add_heading(f"{col}", level=2)
            summary = df[col].value_counts(dropna=False).reset_index()
            summary.columns = [col, "Count"]
            _add_dataframe_table(doc, summary)
            doc.add_paragraph()

    # Build the segment key as a standalone Series instead of writing a
    # 'custom_segmentation' column into the caller's dataframe.
    if segment_cols:
        segment_key = df[segment_cols].astype(str).agg(" | ".join, axis=1)
        segments = segment_key.unique()
    else:
        segment_key = None
        segments = ['All Data']

    # Columns ending with '_label', excluding the segment columns themselves.
    label_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.endswith('_label') and col not in segment_cols
    ]

    for segment in segments:
        subset_df = df if segment_key is None else df[segment_key == segment]

        doc.add_heading(f"Segment: {segment}", level=1)

        for col in label_columns:
            summary_table = create_summary_table(subset_df, col, weight_col)

            # Skip columns with no non-missing responses in this segment.
            if summary_table.empty:
                continue

            doc.add_heading(f'{col.replace("_label", "")}', level=2)
            _add_dataframe_table(doc, summary_table)
            doc.add_paragraph()

    # Make sure the target directory exists before saving. Only applies when
    # output_file is a filesystem path; anything else is passed to save() as is.
    if isinstance(output_file, (str, os.PathLike)):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    doc.save(output_file)
    print(f"Word document saved as {output_file}")


In [None]:
# Write the appendix document: one section per combination of market segment
# and passenger type.
# NOTE(review): weighted by 'weight_departing_and_arriving', whereas the
# earlier spot-check cell uses 'weight_departing_only' - confirm which weight
# the appendix should report.
generate_summary_document(
    data_model_output_df,
    weight_col='weight_departing_and_arriving',
    segment_cols=['marketsegment_label', 'passenger_type_label'],
    output_file=summary_document,
)