# Merge data from Zooniverse with data from TPMP DB

## Imports

In [1]:
from pathlib import Path

import pandas as pd

## Constants

In [2]:
# Directory holding the input CSVs and the merged output, relative to the notebook's working directory.
data_path = Path("..") / "data_in"

## Load CSVs

### Load Zooniverse data

In [3]:
# Zooniverse box table: the "filename" column is re-exposed as "hash" so it
# shares a key name with the TPMP table, then rows are ordered by that key.
df_z = (
    pd.read_csv(data_path / "boxes_final.csv")
    .assign(hash=lambda boxes: boxes["filename"])
    .drop(columns="filename")
    .sort_values(by="hash")
)
df_z


Unnamed: 0,x,y,width,height,hash
0,619.566864,756.123444,65.635284,63.824951,b-1HoJ-Hqz5STrwrZHGBYdjAE3Q.jpg
1,756.218201,591.466797,46.376831,38.321350,b-1HoJ-Hqz5STrwrZHGBYdjAE3Q.jpg
2,900.525146,730.371765,56.363098,56.363037,b-1HoJ-Hqz5STrwrZHGBYdjAE3Q.jpg
13,939.846252,695.875488,25.022858,28.276367,b-38OOnRNVd8OdKdmNYZXXy83E.jpg
12,914.323547,613.455078,32.049561,40.362610,b-38OOnRNVd8OdKdmNYZXXy83E.jpg
...,...,...,...,...,...
15847,613.711975,739.365784,62.688782,76.064880,bzyUXItGRt98Cjh2dk1KeIyqjga8.jpg
15850,874.374115,922.859985,65.104553,77.646912,bzyUXItGRt98Cjh2dk1KeIyqjga8.jpg
15852,765.876777,568.327881,90.995261,83.236267,bzywiC3csPQ0738i9JSahLgzD9wE.jpg
15851,927.864258,701.034180,87.356779,74.697632,bzywiC3csPQ0738i9JSahLgzD9wE.jpg


### Load TPMP data

In [4]:
# TPMP lookup table; it carries the "hash" column used as the merge key below.
df_t = pd.read_csv(data_path / "filename_to_hash_v2.csv")
df_t

Unnamed: 0,experiment,plant,date_time,camera,view_option,hash,date,time
0,10ac_mpo1_1904,10ac100_ca_mock_xx_100,2019-05-04 05:59:55,msp,sw755,bBxnW-VJguTHrR1heyox3ydBbfpE.jpg,2019-05-04,05:59:55
1,10ac_mpo1_1904,10ac100_ca_mock_xx_100,2019-05-05 06:00:37,msp,sw755,bpW70Td5eV4xkUIym9XRJnu8Acho.jpg,2019-05-05,06:00:37
2,10ac_mpo1_1904,10ac100_ca_mock_xx_100,2019-05-07 06:00:54,msp,sw755,bPM-aOQctLNKzYGiZ9ZdTBmthavI.jpg,2019-05-07,06:00:54
3,10ac_mpo1_1904,10ac100_ca_mock_xx_100,2019-05-08 12:28:21,msp,sw755,bsZSULAoEEENMYtElvh67OgXeCnw.jpg,2019-05-08,12:28:21
4,10ac_mpo1_1904,10ac100_ca_mock_xx_100,2019-05-09 05:59:48,msp,sw755,bd6qIKeyu9HuV3oo2xyuJ-6vSHo0.jpg,2019-05-09,05:59:48
...,...,...,...,...,...,...,...,...
3764,10ac_mpo1_1904,10ac98_16_mock_xx_98,2019-05-14 11:42:27,msp,sw755,bTk4nXOeRW4WdfYRscx2yYKZyApA.jpg,2019-05-14,11:42:27
3765,10ac_mpo1_1904,10ac98_16_mock_xx_98,2019-05-15 05:58:10,msp,sw755,blLmjsjUYVawSvMq8XBs4Kwqab8.jpg,2019-05-15,05:58:10
3766,10ac_mpo1_1904,10ac98_16_mock_xx_98,2019-05-16 05:58:13,msp,sw755,bn8VJkrD8MOvOMpmvFNVS04Usr8.jpg,2019-05-16,05:58:13
3767,10ac_mpo1_1904,10ac98_16_mock_xx_98,2019-05-17 05:58:11,msp,sw755,b6-esQJlMKhaeTjsHLg9Z4D7qBw.jpg,2019-05-17,05:58:11


## Merge data

In [8]:
# Join the Zooniverse boxes to the TPMP rows on "hash" (pandas' default inner
# join), order the result, keep a fixed column layout and append corner columns.
df = (
    df_z.merge(df_t, on="hash")
    .sort_values(by=["experiment", "plant", "date", "time"])
    # Publish the join key as "filename"; "hash" itself is left out by the
    # column selection that follows.
    .assign(filename=lambda merged: merged["hash"])
    .loc[
        :,
        [
            "experiment",
            "plant",
            "camera",
            "view_option",
            "date_time",
            "date",
            "time",
            "filename",
            "x",
            "y",
            "width",
            "height",
        ],
    ]
    # (x1, y1) copies (x, y); (x2, y2) adds width and height to it.
    .assign(
        x1=lambda box: box["x"],
        y1=lambda box: box["y"],
        x2=lambda box: box["x"] + box["width"],
        y2=lambda box: box["y"] + box["height"],
    )
)
df


Unnamed: 0,experiment,plant,camera,view_option,date_time,date,time,filename,x,y,width,height,x1,y1,x2,y2
3368,10ac_mpo1_1904,10ac100_ca_mock_xx_100,msp,sw755,2019-05-04 05:59:55,2019-05-04,05:59:55,bBxnW-VJguTHrR1heyox3ydBbfpE.jpg,,,,,,,,
13240,10ac_mpo1_1904,10ac100_ca_mock_xx_100,msp,sw755,2019-05-05 06:00:37,2019-05-05,06:00:37,bpW70Td5eV4xkUIym9XRJnu8Acho.jpg,,,,,,,,
6748,10ac_mpo1_1904,10ac100_ca_mock_xx_100,msp,sw755,2019-05-07 06:00:54,2019-05-07,06:00:54,bPM-aOQctLNKzYGiZ9ZdTBmthavI.jpg,811.757904,774.484711,24.912323,27.238861,811.757904,774.484711,836.670227,801.723572
13979,10ac_mpo1_1904,10ac100_ca_mock_xx_100,msp,sw755,2019-05-08 12:28:21,2019-05-08,12:28:21,bsZSULAoEEENMYtElvh67OgXeCnw.jpg,678.094482,709.526489,47.968628,45.099426,678.094482,709.526489,726.063110,754.625916
13980,10ac_mpo1_1904,10ac100_ca_mock_xx_100,msp,sw755,2019-05-08 12:28:21,2019-05-08,12:28:21,bsZSULAoEEENMYtElvh67OgXeCnw.jpg,814.060791,767.004517,42.440308,49.254456,814.060791,767.004517,856.501099,816.258972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7887,10ac_mpo1_1904,10ac98_16_mock_xx_98,msp,sw755,2019-05-14 11:42:27,2019-05-14,11:42:27,bTk4nXOeRW4WdfYRscx2yYKZyApA.jpg,,,,,,,,
12354,10ac_mpo1_1904,10ac98_16_mock_xx_98,msp,sw755,2019-05-15 05:58:10,2019-05-15,05:58:10,blLmjsjUYVawSvMq8XBs4Kwqab8.jpg,1159.625732,681.139709,31.703491,41.302795,1159.625732,681.139709,1191.329224,722.442505
12710,10ac_mpo1_1904,10ac98_16_mock_xx_98,msp,sw755,2019-05-16 05:58:13,2019-05-16,05:58:13,bn8VJkrD8MOvOMpmvFNVS04Usr8.jpg,1148.169373,649.089752,50.336182,46.785217,1148.169373,649.089752,1198.505554,695.874969
1882,10ac_mpo1_1904,10ac98_16_mock_xx_98,msp,sw755,2019-05-17 05:58:11,2019-05-17,05:58:11,b6-esQJlMKhaeTjsHLg9Z4D7qBw.jpg,1136.784790,634.862610,62.473633,66.035828,1136.784790,634.862610,1199.258423,700.898438


## Save data

In [9]:
# Write the merged table next to the inputs; index=False keeps the row index out of the file.
df.to_csv(data_path / "zooniverse_tpmp_data.csv", index=False)