In [1]:
import sys
import os
import importlib
import pandas as pd
import numpy as np
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
processed_dir = "../data/processed"
reports_dir = "../reports"
data_model_output_file = os.path.join(processed_dir, "data_model_output.csv")

summary_document = os.path.join(reports_dir, "variable_summary.docx")

In [3]:
data_model_output_df = pd.read_csv(data_model_output_file)
data_model_output_df.shape

  data_model_output_df = pd.read_csv(data_model_output_file)


(9892, 437)

In [4]:
data_model_output_df.head()

Unnamed: 0,unique_id,respondentid,is_completed,is_valid_record,date_completed,time_completed,is_pilot,is_self_administered,record_type_synthetic,access_mode,...,trip_start_time_label,validation_error_person,validation_error_trip,validation_num_errors_person,validation_num_errors_trip,validation_severity_person,validation_severity_trip,weight_departing_and_arriving,weight_departing_only,weight_non_sas_departing_only
0,1,5473,True,True,2024-10-04,08:41:12,False,False,0,,...,FIVE_TO_FIVE_THIRTY,[],[],0,0,,,5.077165,5.124186,8.599911
1,2,5476,True,True,2024-10-04,08:40:04,False,False,0,1.0,...,EIGHT_THIRTY_TO_NINE,[],[],0,0,,,4.749957,4.81339,7.511901
2,3,5489,True,True,2024-10-04,08:51:36,False,False,0,,...,SIX_TO_SIX_THIRTY,[],[],0,0,,,5.077165,5.124186,8.599911
3,4,5558,True,True,2024-10-04,10:32:58,False,False,0,1.0,...,ELEVEN_TO_ELEVEN_THIRTY,[],[],0,0,,,4.749957,4.81339,7.511901
4,5,5593,True,True,2024-10-04,11:09:46,False,False,0,1.0,...,NOON_TO_TWELVE_THIRTY,[],[],0,0,,,4.749957,4.81339,7.511901


In [5]:
def create_summary_table(df, col, weight_col=None):
    """
    Create a summary table with value counts, percentages, weighted percentages, 
    and cumulative percentages for a specified column.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        col (str): Column to analyze.
        weight_col (str, optional): Column containing weights. Defaults to None.

    Returns:
        pd.DataFrame: A summary table with value counts, percentages, weighted percentages, 
                      and cumulative percentages.
    """
    # Calculate value counts and percentages
    value_counts = df[col].value_counts()
    percentages = df[col].value_counts(normalize=True) * 100

    # Sort by value counts
    sorted_value_counts = value_counts.sort_values(ascending=False)
    sorted_percentages = percentages.loc[sorted_value_counts.index]

    # Calculate weighted percentages if weight_col is provided
    if weight_col:
        weights = df.groupby(col)[weight_col].sum()
        sorted_weights = weights.loc[sorted_value_counts.index]
        weighted_percentages = (sorted_weights / sorted_weights.sum()) * 100
    else:
        weighted_percentages = pd.Series([None] * len(sorted_value_counts), index=sorted_value_counts.index)

    # Calculate cumulative percentage
    cumulative_percentages = sorted_percentages.cumsum()

    # Combine into a summary table
    output_df = pd.DataFrame({
        'Number of Respondents': sorted_value_counts,
        'Percentage(%)': round(sorted_percentages, 2),
        'Weighted Percentage(%)': round(weighted_percentages, 2),
        'Cumulative Percentage(%)': round(cumulative_percentages,2)
    }).reset_index().rename(columns={col: 'Response'})

    return output_df

In [6]:
data_model_output_df['access_mode_label'].value_counts(normalize = True)

access_mode_label
WALK                                    0.454988
DROVE_ALONE_AND_PARKED                  0.187348
DROPPED_OFF_BY_FAMILY_FRIEND            0.175182
UBER_LYFT                               0.058394
OTHER_PUBLIC_TRANSIT                    0.046229
DROVE_WITH_OTHERS_AND_PARKED            0.021898
CAR_SERVICE_BLACK_LIMO                  0.019465
OTHER                                   0.014599
TAXI                                    0.012165
RODE_WITH_OTHER_TRAVELERS_AND_PARKED    0.007299
BICYCLE_PERSONAL_NON_ELECTRIC           0.002433
Name: proportion, dtype: float64

In [7]:
summary_df = create_summary_table(data_model_output_df, 'marketsegment_label', 'weight_departing_only')
summary_df

Unnamed: 0,Response,Number of Respondents,Percentage(%),Weighted Percentage(%),Cumulative Percentage(%)
0,PASSENGER,9075,91.74,83.57,91.74
1,EMPLOYEE,756,7.64,16.43,99.38
2,UNKNOWN,45,0.45,0.0,99.84
3,NEITHER,16,0.16,0.0,100.0


In [8]:
from docx import Document

def generate_summary_document(df, weight_col=None, output_file='summary_tables.docx'):
    """
    Generate a Word document with summary tables for all columns in a dataframe.
    
    Parameters:
        df (pd.DataFrame): Input dataframe.
        weight_col (str, optional): Column containing weights. Defaults to None.
        output_file (str): Path to save the generated Word document.
    """
    # Initialize a Word document
    doc = Document()
        # Filter columns ending with '_label'
    label_columns = [col for col in df.columns if col.endswith('_label')]
    # Iterate through all columns
    for col in label_columns:

        # Generate summary table for the column
        summary_table = create_summary_table(df, col, weight_col)

        # Add a heading for the column
        doc.add_heading(f'{col.replace("_label", "")}', level=1)

        # Add the summary table as text
        table = doc.add_table(rows=1, cols=summary_table.shape[1])
        table.style = 'Table Grid'

        # Add table headers
        for i, column_name in enumerate(summary_table.columns):
            table.cell(0, i).text = column_name

        # Add table rows
        for _, row in summary_table.iterrows():
            cells = table.add_row().cells
            for i, value in enumerate(row):
                cells[i].text = str(value)

        # Add a blank line after the table for spacing
        doc.add_paragraph()

    # Save the document
    doc.save(output_file)
    print(f"Word document saved as {output_file}")


In [9]:
generate_summary_document(data_model_output_df, weight_col='weight_departing_only', output_file = summary_document)

Word document saved as ../reports\variable_summary.docx
