In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the couples table, index it by the 'Couple' column, and keep only rows
# that have a value in 'spvl'.
df = (
    pd.read_csv('couples.unique.csv')
    .set_index('Couple')
)
df = df[df['spvl'].notna()]

# Keep an untouched copy of the loaded-and-filtered frame for later reference.
og_df = df.copy()

## Options for how to treat the data

In [None]:
# Stem of the fitting-input CSV written at the end of the notebook.
base_output_name = 'blanquart.couples.for.fitting'

# Remove undetectable viral loads?
# (The variable keeps its original spelling because other cells refer to it.)
remove_undectable = True

# When the flag is set, the dose is read from the 'spvl.ur' column and the
# output file name gets a '.ur' suffix; otherwise the plain 'spvl' column is
# used and no suffix is added.
file_suffix = '.ur' if remove_undectable else ''
dose_column = 'spvl.ur' if remove_undectable else 'spvl'

In [None]:
# With the option enabled, keep only the rows that have a value in 'spvl.ur'
# (presumably rows without one are the undetectable viral loads -- confirm
# against the input file).
if remove_undectable:
    df = df[df['spvl.ur'].notna()]

## Processing data

In [None]:
# A partner counts as "ever seroconverted" when their first positive date is
# anything other than +inf (inf appears to be the "never positive" sentinel).
df['partner.ever.seroconverted'] = df['partner.firstPosDate'].ne(np.inf)

# Number of partners flagged as ever seroconverted.
df['partner.ever.seroconverted'].sum()

In [None]:
# Flag partners whose first positive date is strictly later than the index's
# first ART date. Rows where the comparison involves NaN evaluate to False.
# Open question: do this later with the inferred seroconversion date instead?
df['partner.first.pos.after.art'] = (
    df['partner.firstPosDate'].gt(df['index.first.art.date'])
    & df['partner.ever.seroconverted']
)

# Number of partners whose first positive date falls after the index's ART start.
df['partner.first.pos.after.art'].sum()

In [None]:
# Partners who seroconverted but were NOT first positive after the index's
# ART start; this flag is used as the outcome in the exported fitting table.
df['partner.seroconverted.before.art'] = (
    ~df['partner.first.pos.after.art']
    & df['partner.ever.seroconverted']
)

# Number of partners counted as seroconverting before ART.
df['partner.seroconverted.before.art'].sum()

In [None]:
# Start of the exposure window: the index's first positive date shifted
# forward by 0.5 (in the date column's units -- presumably years; TODO confirm).
df['index.inferred.spvl.start.date'] = df['index.firstPosDate'].add(0.5)

In [None]:
# Inferred seroconversion date for the partner: the midpoint between the last
# negative and the first positive date when the partner ever seroconverted,
# +inf otherwise. Using the midpoint is a choice that can be changed here.
df['partner.inferred.seroconversion.date'] = np.where(
    df['partner.ever.seroconverted'],
    df['partner.firstPosDate'].add(df['partner.lastNegDate']).div(2),
    np.inf,
)

In [None]:
# End of the infectious-contact period, chosen row by row:
#   * partner seroconverted before ART -> the earlier of the index's first ART
#     date and the partner's inferred seroconversion date;
#   * otherwise -> the earlier of the index's first ART date and the partner's
#     last negative date.
# The row-wise `min` skips NaN, so a missing date does not blank the result
# unless both candidates are missing.
df['infectious.contact.period.end'] = np.where(
    df['partner.seroconverted.before.art'],
    df.loc[:, ['index.first.art.date', 'partner.inferred.seroconversion.date']].min(axis='columns'),
    df.loc[:, ['index.first.art.date', 'partner.lastNegDate']].min(axis='columns'),
)

In [None]:
# Length of the exposure window: contact-period end minus the inferred start.
df['duration'] = df['infectious.contact.period.end'].sub(
    df['index.inferred.spvl.start.date']
)

In [None]:
# Couples whose exposure window is zero or negative.
df.loc[df['duration'].le(0)].index

In [None]:
# Split off the couples with a non-positive exposure duration; they are saved
# to their own file below and excluded from the fitting input.
# NOTE: rows whose duration is NaN satisfy neither comparison, so they end up
# in neither `df_bad` nor `df`.
# NOTE: `df` is rebound to the filtered frame, so running this cell a second
# time without re-running the cells above finds no non-positive durations and
# overwrites the dropped-couples file with an empty table.
df_bad = df[df['duration'] <= 0]
df = df[df['duration'] > 0]

# Paths are handed straight to `to_csv` rather than a handle from
# `open(path, 'w')`: pandas requires text handles to be opened with
# newline='' (otherwise Windows output gets doubled carriage returns), and
# letting pandas open the file also gives a fixed UTF-8 encoding.
df.to_csv('couples.extra.columns.csv')

# Table consumed by the fitting step, one row per retained couple.
df_new = pd.DataFrame({
    'couple': df.index,
    # presumably the dose column holds log10 viral load, so this converts it
    # back to a linear scale -- TODO confirm
    'dose': 10**df[dose_column],
    'number': 1,
    # boolean outcome flag stored as 0/1
    'success': df['partner.seroconverted.before.art'].astype(int),
    'duration': df['duration'],
    # 108.0; presumably 9 per month x 12 months, in the units the fitting
    # code expects -- TODO confirm
    'dose_frequency': 9*12.
}).set_index('couple')

df_new.to_csv(base_output_name + file_suffix + '.csv')

# Keep a record of the couples that were dropped for having duration <= 0.
df_bad.to_csv('dropped.blanquart.couples.csv')

In [None]:
# Among the dropped couples, show those whose index started ART before the
# index's own first positive date.
df_bad.loc[df_bad['index.first.art.date'].lt(df_bad['index.firstPosDate'])]

In [None]:
# Display the working frame as it stands at this point in the notebook
# (after the earlier cell rebinds `df` to the rows with duration > 0).
df