In [1]:
import pandas as pd
import os
import getpass
from pylatex import Table, NoEscape, Command

export_username = "ts"  # Only save tables to dropbox on my machine

In [2]:
def create_latex_table_content(df, caption, label, colnames=None):
    """
    Create LaTeX table content from a pandas DataFrame.
    Automatically formats columns with all integer-like floats as integers.
    Uses 'H' float specifier, centers all values except the leftmost column,
    adds vertical lines between columns after the first one,
    ensures the table fits within textwidth, and allows line breaks in column names.
    
    Args:
    df (pd.DataFrame): Input DataFrame
    caption (str): Table caption
    label (str): Table label for referencing
    colnames (list): List of column names. If None, uses df.columns. Use \\\ for line breaks (will cause warning).
    
    Returns:
    str: LaTeX code for the table
    """
    table_content = []
    table_content.append(r'\begin{table}[H]')
    table_content.append(r'\centering')
    table_content.append(f'\\caption{{{caption}}}')
    table_content.append(f'\\label{{{label}}}')
    
    # Ensure table fits within text width
    table_content.append(r'\resizebox{\textwidth}{!}{')
    
    # Infer number of columns from df and create table format
    num_cols = len(df.columns) + 1  # +1 for the index column
    table_format = 'l' + '|c' * (num_cols - 1)
    table_content.append(f'\\begin{{tabular}}{{{table_format}}}')
    
    # Use provided column names or df.columns, and add 'Dataset' for the index
    if colnames is None:
        colnames = ['Dataset'] + list(df.columns)
    
    # Wrap column names with \makecell to allow line breaks
    wrapped_colnames = [f'\\makecell{{{name}}}' for name in colnames]
    table_content.append(' & '.join(wrapped_colnames) + r' \\')
    
    # Add cline after column names
    table_content.append(r'\cline{1-' + str(num_cols) + '}')
    
    # Determine which columns should be formatted as integers
    int_columns = [col for col in df.columns if df[col].dtype in ['float64', 'int64'] and 
                   df[col].apply(lambda x: x.is_integer() if isinstance(x, float) else True).all()]
    
    for index, row in df.iterrows():
        row_content = [f"{index}"]  # Start with the index (dataset name)
        for col, value in row.items():
            if pd.isna(value):
                row_content.append('')
            elif isinstance(value, (int, float)):
                if col in int_columns:
                    row_content.append(f"\\num{{{int(value):,}}}")
                elif value < 1 and value > 0:  # Assume it's a percentage
                    row_content.append(f"\\num{{{value*100:.4f}}}\\%")
                else:
                    row_content.append(f"\\num{{{value:,}}}")
            else:
                row_content.append(f"{value}")
        table_content.append(' & '.join(row_content) + r' \\')
    
    # Add bottom line
    table_content.append(r'\hline')
    table_content.append(r'\end{tabular}')
    table_content.append(r'}')  # Close resizebox
    table_content.append(r'\end{table}')
    
    return '\n'.join(table_content)

  """


In [3]:
def save_latex_table_content(content, filename):
    """
    Save LaTeX table content to a file, but only if on the specified machine.
    
    Args:
    content (str): LaTeX table content to save
    filename (str): Name of the file to save (without extension)
    
    Returns:
    None
    """
    username = getpass.getuser()
    if username == export_username:
        filepath = "/Users/ts/Library/CloudStorage/Dropbox/Apps/Overleaf/Dissertation Oxford/Tables"
        full_filename = os.path.join(filepath, filename + ".tex")
        
        with open(full_filename, 'w') as file:
            file.write(content)
        
        print(f"Table content saved to {full_filename}")
    else:
        print("Table content not saved (not on the specified machine)")

In [4]:
# Load the data
df = pd.read_parquet('analysis/summary-stats/data_quality_summary.parquet')

# Create the table content
latex_table_content = create_latex_table_content(
    df,
    caption="Data Quality Summary",
    label="tab:data_quality_summary",
    colnames=["Dataset", "Total Entries", "Missing Entries", "Missing \\%", "Longest Missing \\\Streak", "Overlapping Missing \\\Entries"]
)

# Save the table content
save_latex_table_content(latex_table_content, "data_summary")

Table content saved to /Users/ts/Library/CloudStorage/Dropbox/Apps/Overleaf/Dissertation Oxford/Tables/data_summary.tex


  colnames=["Dataset", "Total Entries", "Missing Entries", "Missing \\%", "Longest Missing \\\Streak", "Overlapping Missing \\\Entries"]
  colnames=["Dataset", "Total Entries", "Missing Entries", "Missing \\%", "Longest Missing \\\Streak", "Overlapping Missing \\\Entries"]
