In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyreadr

%load_ext memory_profiler

In [None]:
# Location of the RData file to load, given relative to the working directory.
data_src = "./data/Data_20130610.RData"

In [None]:
# Read the RData file; the result is looked up by R object name
# ("Data1", "Data2") in the next cell.
r_data = pyreadr.read_r(data_src)

In [None]:
# Work on copies. The cells below modify data1 and data2 in place; without a
# copy those edits would also land in the frames held by r_data, and re-running
# this cell would then no longer give back the data as loaded.
data1 = r_data["Data1"].copy()
data2 = r_data["Data2"].copy()

## `Data1` -- misclassifications

In [None]:
# Preview the first rows of Data1 as loaded, before any cleaning.
data1.head()

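* Change STATUS to float.
* Split IdTooth (formatted as `IDNR_TOOTH`) into IDNR (patient ID) and TOOTH (tooth ID).
* Change their dtypes from str to int, and shift IDNR down by 1.
* Add TOOTH_RANK, which maps the tooth codes 16, 26, 36, 46 to 0, 1, 2, 3.
* Change EXAMINER to the default int dtype (it was already an integer, but int32; this makes it int64) and shift it down by 1.
* Drop IdTooth.
* Add VISIT_RANK, the rank (starting at 0) of each VISIT within its (IDNR, TOOTH) group.
* Reorder so that IDNR and TOOTH are the first two columns.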

In [None]:
# Start again from the frame as loaded so this cell can be re-run on its own.
# (Editing data1 in place and dropping IdTooth made a second run fail with a
# KeyError on "IdTooth".)
data1 = r_data["Data1"].copy()

# IdTooth is split on "_" into the patient ID (first part) and the tooth ID
# (second part); both parts arrive as strings.
id_tooth = data1["IdTooth"].str.split("_", expand=True)
data1 = data1.assign(
    STATUS=data1["STATUS"].astype(float),
    IDNR=id_tooth[0].astype(int) - 1,  # shift IDs down by one
    TOOTH=id_tooth[1].astype(int),
    EXAMINER=data1["EXAMINER"].astype(int) - 1,  # shift IDs down by one
)

# Position of the tooth among the four tooth codes used in this data set.
data1["TOOTH_RANK"] = data1["TOOTH"].replace({16: 0, 26: 1, 36: 2, 46: 3})

# Rank of each visit within its (patient, tooth) group, starting at 0.
# Only VISIT is ranked; ranking every column and then selecting VISIT gives the
# same values but does needless work.
# NOTE(review): rank() defaults to method="average", so tied VISIT values would
# get fractional ranks that astype(int) truncates -- confirm VISIT is unique
# within each (IDNR, TOOTH) group, or switch to method="dense".
data1["VISIT_RANK"] = (
    data1.groupby(["IDNR", "TOOTH"])["VISIT"].rank().astype(int) - 1
)

# Keep only the columns of interest (this also discards IdTooth), with IDNR and
# TOOTH first. The explicit copy makes data1 an independent frame, so the later
# in-place edits do not operate on a slice (SettingWithCopyWarning).
data1 = data1[
    ["IDNR", "TOOTH", "TOOTH_RANK", "VISIT", "VISIT_RANK", "EXAMINER", "STATUS"]
].copy()

In [None]:
# Column dtypes of data1 after the conversions in the previous cell.
data1.dtypes

In [None]:
# First rows of data1 after cleaning and column reordering.
data1.head()

In [None]:
# True when the rows of data1 are already ordered by IDNR and then TOOTH,
# i.e. sorting on those two keys leaves the frame unchanged.
data1.equals(data1.sort_values(by=["IDNR", "TOOTH"]))

## `Data2` -- regressors

In [None]:
# Preview the first rows of Data2 as loaded, before any cleaning.
data2.head()

In [None]:
# Number of missing values in each column of Data2 as loaded.
data2.isna().sum()

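* Change IDNR and TOOTH to int, and shift IDNR down by 1.
* Add TOOTH_RANK, which maps the tooth codes 16, 26, 36, 46 to 0, 1, 2, 3.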

In [None]:
# Start again from the frame as loaded so this cell can be re-run on its own.
# (Editing data2 in place shifted IDNR down by one more on every re-run.)
data2 = r_data["Data2"].copy()

data2 = data2.assign(
    IDNR=data2["IDNR"].astype(int) - 1,  # shift IDs down by one
    TOOTH=data2["TOOTH"].astype(int),
)

# Position of the tooth among the four tooth codes used in this data set.
data2["TOOTH_RANK"] = data2["TOOTH"].replace({16: 0, 26: 1, 36: 2, 46: 3})

* Change FBEG NA to $0$.
* Change FEND NA to $+\infty$.

In [None]:
# Missing FBEG becomes 0 and missing FEND becomes +infinity.
data2["FBEG"] = data2["FBEG"].fillna(0.0)
data2["FEND"] = data2["FEND"].fillna(np.inf)

In [None]:
# Column dtypes of data2 after the conversions above.
data2.dtypes

In [None]:
# First rows of data2 after cleaning.
data2.head()

In [None]:
# Number of missing values per column after FBEG and FEND were filled.
data2.isna().sum()

In [None]:
# True when the rows of data2 are already ordered by IDNR and then TOOTH,
# i.e. sorting on those two keys leaves the frame unchanged.
data2.equals(data2.sort_values(by=["IDNR", "TOOTH"]))

In [None]:
# Subjects whose number of TOOTH entries in data2 is not exactly 4.
teeth_per_subject = data2.groupby("IDNR")["TOOTH"].count()
not_4_teeth = teeth_per_subject[teeth_per_subject != 4].index

# Remove every row belonging to those subjects from both frames, in place.
for frame in (data1, data2):
    rows_to_drop = frame[frame["IDNR"].isin(not_4_teeth)].index
    frame.drop(rows_to_drop, inplace=True)

In [None]:
J = 4  # number of rows (teeth) each remaining subject has in data2

# Relabel the remaining subjects as 0..N-1. The mapping is built from the ID
# values themselves rather than from row positions, so it does not rely on
# either frame being sorted by IDNR, and a subject always receives the same new
# ID in data1 and in data2.
subject_ids = np.sort(data2["IDNR"].unique())
N = len(subject_ids)
new_idnr = pd.Series(np.arange(N, dtype=int), index=subject_ids)

# Rows of data1 per subject, indexed by the IDNR values in use before relabelling.
visits_per_subject = data1.groupby("IDNR")["VISIT"].count()

if len(data2) != N * J:
    raise ValueError(
        f"data2 has {len(data2)} rows, expected N * J = {N * J}"
    )
unknown_subjects = ~data1["IDNR"].isin(subject_ids)
if unknown_subjects.any():
    raise ValueError(
        f"{unknown_subjects.sum()} rows of data1 refer to subjects absent from data2"
    )

data2["IDNR"] = data2["IDNR"].map(new_idnr).astype(int)
data1["IDNR"] = data1["IDNR"].map(new_idnr).astype(int)

## Join Data1 and Data2

In [None]:
%memit df = pd.merge(data1, data2, how="inner", on=["IDNR", "TOOTH"], validate="many_to_one")

In [None]:
# Total memory footprint of the merged frame in MiB (deep=True also counts the
# contents of object columns).
df.memory_usage(deep=True).sum() / 1024**2

In [None]:
# True when the merged rows are still ordered by IDNR and then TOOTH,
# i.e. sorting on those two keys leaves the frame unchanged.
df.equals(df.sort_values(by=["IDNR", "TOOTH"]))