In [1]:
import sys
import os
import importlib
import pandas as pd
import numpy as np
# Put the absolute path of ../data_model at the front of sys.path so modules in
# that directory can be imported from this notebook.
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
# Folder locations, relative to the directory the notebook is run from.
processed_dir, reports_dir = "../data/processed", "../reports"

# Input: the data-model output CSV that this notebook summarises.
data_model_output_file = os.path.join(processed_dir, "data_model_output.csv")
# Output: the Word document that receives the summary tables.
summary_document = os.path.join(reports_dir, "variable_summary.docx")

In [3]:
# Read the data-model output CSV into a dataframe.
data_model_output_df = pd.read_csv(data_model_output_file)
# Bare final expression so the notebook displays (rows, columns).
data_model_output_df.shape

  data_model_output_df = pd.read_csv(data_model_output_file)


(9090, 330)

In [4]:
# Preview the first rows of the loaded dataframe.
data_model_output_df.head()

Unnamed: 0,unique_id,is_completed,is_pilot,record_type_synthetic,respondentid,submit,date_completed,interview_location,interview_location_label,interview_location_other,...,next_flight_destination,non_airport_activity_type,parking_cost_numeric,previous_flight_origin,taxi_fhv_fare_numeric,taxi_fhv_wait_numeric,valid_record,validation_error,validation_severity,weight
0,1,True,False,0,5473,True,2024-10-04,2.0,TERMINAL_2,,...,,2.0,100.0,,,,True,,,1.0
1,2,True,False,0,5476,True,2024-10-04,3.0,ONBOARD_992,,...,,2.0,,,,,False,Prefer Not to disclose cannot be combined with...,Low,1.0
2,3,True,False,0,5489,True,2024-10-04,2.0,TERMINAL_2,,...,,2.0,134.0,,,,True,,,1.0
3,4,True,False,0,5558,True,2024-10-04,2.0,TERMINAL_2,,...,,2.0,,,,,True,,,1.0
4,5,True,False,0,5593,True,2024-10-04,2.0,TERMINAL_2,,...,,2.0,,,,,True,,,1.0


In [5]:
def create_summary_table(df, col, weight_col=None):
    """
    Build a frequency summary for one column of a dataframe.

    Missing values in ``col`` are excluded from every statistic, because
    ``value_counts`` and ``groupby`` both drop them by default.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        col (str): Column to analyze.
        weight_col (str, optional): Column containing weights. Defaults to None,
            in which case 'Weighted Percentage(%)' is filled with None.

    Returns:
        pd.DataFrame: One row per distinct response, ordered by descending
                      count, with the columns 'Response',
                      'Number of Respondents', 'Percentage(%)',
                      'Weighted Percentage(%)' and 'Cumulative Percentage(%)'.
                      Percentages are rounded to 2 decimals; the cumulative
                      percentage is accumulated before rounding.
    """
    # Count once, most frequent response first; every other series reuses this index.
    counts = df[col].value_counts().sort_values(ascending=False)
    percentages = counts / counts.sum() * 100

    if weight_col:
        # Total weight per response, aligned to the count ordering.
        weight_totals = df.groupby(col)[weight_col].sum().loc[counts.index]
        weighted_percentages = (weight_totals / weight_totals.sum() * 100).round(2)
    else:
        # Placeholder column. It is deliberately NOT rounded: rounding an
        # object-dtype Series is version-dependent in pandas and may raise.
        weighted_percentages = pd.Series(
            [None] * len(counts), index=counts.index, dtype=object
        )

    summary = pd.DataFrame({
        'Number of Respondents': counts,
        'Percentage(%)': percentages.round(2),
        'Weighted Percentage(%)': weighted_percentages,
        'Cumulative Percentage(%)': percentages.cumsum().round(2),
    })

    # Name the index explicitly rather than relying on value_counts() to have
    # named it after `col`, then turn it into the 'Response' column.
    return summary.rename_axis('Response').reset_index()

In [6]:
# Spot check: share of each access-mode label as a fraction of the non-missing responses.
data_model_output_df['access_mode_label'].value_counts(normalize = True)

access_mode_label
WALK                                    0.472561
DROVE_ALONE_AND_PARKED                  0.216463
DROPPED_OFF_BY_FAMILY_FRIEND            0.158537
UBER_LYFT                               0.060976
DROVE_WITH_OTHERS_AND_PARKED            0.021341
OTHER_PUBLIC_TRANSIT                    0.021341
OTHER                                   0.018293
CAR_SERVICE_BLACK_LIMO                  0.018293
RODE_WITH_OTHER_TRAVELERS_AND_PARKED    0.009146
TAXI                                    0.003049
Name: proportion, dtype: float64

In [7]:
# Example: summary table for 'marketsegment_label', weighted by the 'weight' column.
summary_df = create_summary_table(data_model_output_df, 'marketsegment_label', 'weight')
# Bare final expression so the notebook renders the table.
summary_df

Unnamed: 0,Response,Number of Respondents,Percentage(%),Weighted Percentage(%),Cumulative Percentage(%)
0,PASSENGER,8530,93.84,95.07,93.84
1,EMPLOYEE,458,5.04,4.93,98.88
2,UNKNOWN,86,0.95,0.0,99.82
3,NEITHER,16,0.18,0.0,100.0


In [8]:
from docx import Document

def generate_summary_document(df, weight_col=None, output_file='summary_tables.docx'):
    """
    Generate a Word document with one summary table per '_label' column.

    Only columns whose name ends with '_label' are summarised. Each gets a
    level-1 heading (the column name without that suffix) followed by the
    table returned by ``create_summary_table``.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        weight_col (str, optional): Column containing weights. Defaults to None.
        output_file (str): Path to save the generated Word document. Missing
            parent directories are created.
    """
    label_suffix = '_label'

    # Initialize a Word document
    doc = Document()

    # Keep only string-named columns ending with the suffix; the isinstance
    # guard avoids an AttributeError on non-string column labels.
    label_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.endswith(label_suffix)
    ]

    for col in label_columns:
        summary_table = create_summary_table(df, col, weight_col)

        # Strip only the trailing suffix. str.replace would also delete any
        # '_label' occurring earlier in the column name.
        doc.add_heading(col[:-len(label_suffix)], level=1)

        # Start with a single header row; data rows are appended below.
        table = doc.add_table(rows=1, cols=summary_table.shape[1])
        table.style = 'Table Grid'

        for i, column_name in enumerate(summary_table.columns):
            table.cell(0, i).text = column_name

        # itertuples keeps each column's own dtype. iterrows would coerce a
        # row to a single dtype, so integer counts could print as '8530.0'.
        for record in summary_table.itertuples(index=False, name=None):
            cells = table.add_row().cells
            for i, value in enumerate(record):
                cells[i].text = str(value)

        # Add a blank line after the table for spacing
        doc.add_paragraph()

    # Create the destination folder if needed (only for path-like targets
    # that actually have a directory component).
    if isinstance(output_file, (str, os.PathLike)):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Save the document
    doc.save(output_file)
    print(f"Word document saved as {output_file}")


In [10]:
# Write the summary tables for data_model_output_df to the summary_document path, weighting by 'weight'.
generate_summary_document(data_model_output_df, weight_col='weight', output_file = summary_document)

Word document saved as ../reports\variable_summary.docx
