# Repetend ↔ Tomotope analysis

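This notebook reproduces and documents the repetend scan and enrichment analyses, shows key figures, and runs logistic regressions on mod-12 residue indicators for three properties of interest: `full_repetend`, `div12` (repetend length divisible by 12) and `eq6` (repetend length equal to 6). Figures and coefficient CSVs are saved under `data/repetend_scan/figs/`, and the model summary JSON under `data/repetend_scan/`, so they can be included in a PR or report.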

In [None]:
# Imports and setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import os
from pathlib import Path

# Let wide frames display up to 200 columns without truncation.
pd.set_option('display.max_columns', 200)

# All figures and coefficient tables produced below are written here;
# create the directory (and any missing parents) up front.
OUT_DIR = Path('data', 'repetend_scan', 'figs')
OUT_DIR.mkdir(parents=True, exist_ok=True)

print('Output directory:', OUT_DIR)


In [None]:
# Load the scan results and coerce the two numeric columns to int in one step
# (raises if either column is missing or holds values that cannot be cast).
rep_csv = Path('data', 'repetend_scan', 'repetend_scan_10000.csv')
df = pd.read_csv(rep_csv).astype({'repetend': int, 'full_repetend': int})

print('Rows:', len(df))
df.head()


In [None]:
# Figures: histogram of repetend lengths and mod12 distribution
# Every figure in this cell is restricted to rows with a positive repetend
# length, so the subset is built once and reused.
df_full = df[df['repetend'] > 0]

# 1) Distribution of repetend lengths.
plt.figure(figsize=(6, 3))
sns.histplot(df_full['repetend'], bins=40)
plt.title('Repetend length distribution (d≤10000)')
plt.xlabel('Repetend length')
plt.tight_layout()
plt.savefig(OUT_DIR / 'repetend_hist_10000.png')
plt.show()

# 2) Number of positive-repetend rows in each residue class mod 12.
plt.figure(figsize=(6, 3))
res = df_full['mod12'].value_counts().sort_index()
res.plot(kind='bar')
plt.xlabel('d mod 12')
plt.ylabel('count (repetend>0)')
plt.tight_layout()
plt.savefig(OUT_DIR / 'mod12_counts_10000.png')
plt.show()

# 3) Fraction of rows per residue class whose repetend length is divisible by 12.
# The mean of a boolean indicator within each group equals sum/len, and avoids
# groupby.apply (which warns on recent pandas when it touches the grouping
# column). df_full already guarantees repetend > 0, so that test is not repeated.
is_div12 = df_full['repetend'] % 12 == 0
frac = is_div12.groupby(df_full['mod12']).mean()
plt.figure(figsize=(6, 3))
frac.plot(kind='bar')
plt.ylabel('fraction with repetend divisible by 12')
plt.tight_layout()
plt.savefig(OUT_DIR / 'fraction_div12_by_mod12_10000.png')
plt.show()


In [None]:
# Multivariate logistic regressions
# For each binary property, fit  property ~ const + mod-12 residue indicators,
# save the coefficient table as CSV, and collect everything in one JSON summary.
import json
import patsy
from statsmodels.tools.sm_exceptions import PerfectSeparationError

# create categorical mod12
df['mod12_cat'] = df['mod12'].astype(str)

# A value is either the name of an existing 0/1 column in df or a boolean
# Series aligned with df.
properties = {
    'full_repetend': 'full_repetend',
    'div12': (df['repetend'] > 0) & (df['repetend'] % 12 == 0),
    'eq6': df['repetend'] == 6,
}

# Design matrix: intercept plus residue dummies (first category dropped as the
# baseline). It does not depend on the response, so it is built once.
# dtype=float matters: recent pandas emits bool dummies by default, which,
# combined with the float constant column, become an object array that
# statsmodels refuses to fit.
X = pd.get_dummies(df['mod12_cat'], prefix='r', drop_first=True, dtype=float)
X = sm.add_constant(X)

model_summaries = {}
for name, spec in properties.items():
    # Resolve a column name to its Series; previously the plain string reached
    # .astype() and raised AttributeError before any model was fitted.
    response = df[spec] if isinstance(spec, str) else spec
    y = response.astype(int)
    try:
        model = sm.Logit(y, X).fit(disp=False)
        model_summaries[name] = {
            'params': model.params.to_dict(),
            'pvalues': model.pvalues.to_dict(),
            'summary': str(model.summary()),
        }
        # save coeffs
        coeffs_df = pd.DataFrame({'coef': model.params, 'p': model.pvalues})
        coeffs_df.to_csv(OUT_DIR / f'logit_coeffs_{name}_mod12_10000.csv')
    except PerfectSeparationError as e:
        model_summaries[name] = {'error': 'PerfectSeparation', 'message': str(e)}
    except Exception as e:
        # Best effort: one failed fit (e.g. a singular design) must not stop
        # the remaining models; the failure is recorded in the JSON and printed.
        model_summaries[name] = {'error': 'Other', 'message': str(e)}

# write JSON
Path('data/repetend_scan/logistic_models_10000.json').write_text(json.dumps(model_summaries, indent=2))

# display quick summary
for k, v in model_summaries.items():
    print('---', k)
    if 'summary' in v:
        print(v['summary'][:500])
    else:
        print('error', v.get('error'), v.get('message'))


# Conclusions

- Residue classes mod 12 show strong non-uniformity for certain repetend-derived properties (see the bar plots above).  
- Logistic regression of each property on mod-12 residue indicators (with the first residue class as the baseline) shows which residue classes have significant associations (coefficient CSVs saved under `data/repetend_scan/figs/` and the model JSON summary in `data/repetend_scan/logistic_models_10000.json`).  

Next steps: run model with interactions or with additional predictors (mod24, totient factors), and prepare a concise PR showing figures and a short interpretation note.