Skip to content

Commit

Permalink
Added mf::outerjoin()
Browse files Browse the repository at this point in the history
  • Loading branch information
tedmiddleton committed May 4, 2023
1 parent 0c3f1db commit 7d2a64a
Show file tree
Hide file tree
Showing 2 changed files with 321 additions and 0 deletions.
78 changes: 78 additions & 0 deletions mainframe/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,84 @@ leftjoin(frame<Ts...> left, columnindex<Ind1>, frame<Us...> right, columnindex<I
// hcat fleft and fright into out
frame<Ts..., Us...> out = fleft.hcat(fright);

return out;
}

// Full outer join of `left` and `right`, matching column Ind1 of `left`
// against column Ind2 of `right`.
//
// The result holds every column of `left` followed by every column of `right`:
//   * every pair of rows with equal keys yields one output row, so duplicated
//     keys produce the cross product of their rows;
//   * a left row whose key is absent from `right` is kept, and the right-hand
//     side is grown by one row with resize() instead of receiving a copy;
//   * a right row whose key is absent from `left` is kept the same way, with
//     the left-hand side grown by one row with resize().
// Rows are emitted in left-index key order first, followed by the unmatched
// right keys in right-index order.
// NOTE(review): the rows added by resize() are presumably "missing" cells -
// confirm against frame::resize.
template<typename... Ts, size_t Ind1, typename... Us, size_t Ind2>
frame<Ts..., Us...>
outerjoin(frame<Ts...> left, columnindex<Ind1>, frame<Us...> right, columnindex<Ind2>)
{
    // The two key columns must be comparable with each other.
    using left_key_type = typename detail::pack_element<Ind1, Ts...>::type;
    using right_key_type = typename detail::pack_element<Ind2, Us...>::type;
    static_assert(detail::is_equality_comparable<left_key_type, right_key_type>::value,
        "Column types to join on must be equality comparable");

    // Index both inputs on their join column.
    const frame_indexer<index_defn<Ind1>, Ts...> left_index{ left };
    const frame_indexer<index_defn<Ind2>, Us...> right_index{ right };
    left_index.build_index();
    right_index.build_index();

    // The two halves of the result are built row-for-row in lockstep and
    // glued together at the end.
    frame<Ts...> left_half;
    frame<Us...> right_half;
    left_half.set_column_names(left.column_names());
    right_half.set_column_names(right.column_names());

    // Pass 1: every left key, whether or not it has a partner on the right.
    for (auto lkey = left_index.begin_index(); lkey != left_index.end_index(); ++lkey) {

        auto rkey = right_index.find_index(lkey->first);
        const bool has_partner = (rkey != right_index.end_index());

        for (auto lpos = left_index.begin_index_row(lkey);
             lpos != left_index.end_index_row(lkey);
             ++lpos) {
            const size_t left_row_number = *lpos;
            auto left_row = *(left_index.begin() + left_row_number);

            if (!has_partner) {
                // Unmatched left row: keep it and pad the right half.
                left_half.push_back(left_row);
                right_half.resize(right_half.size() + 1);
                continue;
            }

            // Matched key: pair this left row with every right row of the key.
            for (auto rpos = right_index.begin_index_row(rkey);
                 rpos != right_index.end_index_row(rkey);
                 ++rpos) {
                const size_t right_row_number = *rpos;
                auto right_row = *(right_index.begin() + right_row_number);

                left_half.push_back(left_row);
                right_half.push_back(right_row);
            }
        }
    }

    // Pass 2: right keys that never appeared on the left (matched keys were
    // already emitted by pass 1).
    for (auto rkey = right_index.begin_index(); rkey != right_index.end_index(); ++rkey) {

        auto lkey = left_index.find_index(rkey->first);
        const bool left_has_key = !(lkey == left_index.end_index());
        if (left_has_key) {
            continue;
        }

        for (auto rpos = right_index.begin_index_row(rkey);
             rpos != right_index.end_index_row(rkey);
             ++rpos) {
            const size_t right_row_number = *rpos;
            auto right_row = *(right_index.begin() + right_row_number);

            // Unmatched right row: keep it and pad the left half.
            right_half.push_back(right_row);
            left_half.resize(left_half.size() + 1);
        }
    }

    // Horizontally concatenate the two halves into the joined frame.
    frame<Ts..., Us...> joined = left_half.hcat(right_half);
    return joined;
}
Expand Down
243 changes: 243 additions & 0 deletions tests/mainframe_test_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2968,6 +2968,249 @@ TEST_CASE("leftjoin", "[frame]")
}
}

// Exercises mf::outerjoin() on two small frames keyed by date.
// Each section joins f1 (date, rain) with f2 (date, temperature) on column 0
// of both frames, sorts the result so the row order is deterministic, and then
// checks every cell, including the cells expected to compare equal to
// `missing` where one side has no row for a key.
TEST_CASE("outerjoin", "[frame]")
{
    // Same four keys on both sides: a plain one-to-one match, nothing missing.
    SECTION("no duplicates")
    {
        frame<mi<year_month_day>, mi<bool>> f1;
        f1.set_column_names("date", "rain");
        f1.push_back(2022_y / January / 1, false);
        f1.push_back(2022_y / January / 2, false);
        f1.push_back(2022_y / January / 3, true);
        f1.push_back(2022_y / January / 4, true);

        frame<mi<year_month_day>, mi<double>> f2;
        f2.set_column_names("date", "temperature");
        f2.push_back(2022_y / January / 1, 9.0);
        f2.push_back(2022_y / January / 2, 10.0);
        f2.push_back(2022_y / January / 3, 11.0);
        f2.push_back(2022_y / January / 4, 12.2);

        auto res = outerjoin(f1, _0, f2, _0);
        res.sort(_0, _1);
        dout << res;
        REQUIRE(res.size() == 4);

        // 0| 2022-01-01 | false | 2022-01-01 | 9
        // 1| 2022-01-02 | false | 2022-01-02 | 10
        // 2| 2022-01-03 | true  | 2022-01-03 | 11
        // 3| 2022-01-04 | true  | 2022-01-04 | 12.2

        auto it = res.cbegin();
        REQUIRE((it + 0)->at(_0) == 2022_y / 1 / 1);
        REQUIRE((it + 1)->at(_0) == 2022_y / 1 / 2);
        REQUIRE((it + 2)->at(_0) == 2022_y / 1 / 3);
        REQUIRE((it + 3)->at(_0) == 2022_y / 1 / 4);

        REQUIRE((it + 0)->at(_1) == false);
        REQUIRE((it + 1)->at(_1) == false);
        REQUIRE((it + 2)->at(_1) == true);
        REQUIRE((it + 3)->at(_1) == true);

        REQUIRE((it + 0)->at(_2) == 2022_y / 1 / 1);
        REQUIRE((it + 1)->at(_2) == 2022_y / 1 / 2);
        REQUIRE((it + 2)->at(_2) == 2022_y / 1 / 3);
        REQUIRE((it + 3)->at(_2) == 2022_y / 1 / 4);

        REQUIRE((it + 0)->at(_3) == 9.0);
        REQUIRE((it + 1)->at(_3) == 10.0);
        REQUIRE((it + 2)->at(_3) == 11.0);
        REQUIRE((it + 3)->at(_3) == 12.2);
    }

    // The left frame has 2022-01-01 twice and no 2022-01-02: the right row for
    // 01-01 is paired with both left rows, and the right-only 01-02 row shows
    // up with missing left cells (expected first after sorting).
    SECTION("left duplicates")
    {
        frame<mi<year_month_day>, mi<bool>> f1;
        f1.set_column_names("date", "rain");
        f1.push_back(2022_y / January / 1, false);
        f1.push_back(2022_y / January / 1, true);
        f1.push_back(2022_y / January / 3, true);
        f1.push_back(2022_y / January / 4, true);

        frame<mi<year_month_day>, mi<double>> f2;
        f2.set_column_names("date", "temperature");
        f2.push_back(2022_y / January / 1, 9.0);
        f2.push_back(2022_y / January / 2, 10.0);
        f2.push_back(2022_y / January / 3, 11.0);
        f2.push_back(2022_y / January / 4, 12.2);

        auto res = outerjoin(f1, _0, f2, _0);
        res.sort(_0, _1);
        dout << res;
        REQUIRE(res.size() == 5);

        // 0| missing    | missing | 2022-01-02 | 10
        // 1| 2022-01-01 | false   | 2022-01-01 | 9
        // 2| 2022-01-01 | true    | 2022-01-01 | 9
        // 3| 2022-01-03 | true    | 2022-01-03 | 11
        // 4| 2022-01-04 | true    | 2022-01-04 | 12.2

        auto it = res.cbegin();
        REQUIRE((it + 0)->at(_0) == missing);
        REQUIRE((it + 1)->at(_0) == 2022_y / 1 / 1);
        REQUIRE((it + 2)->at(_0) == 2022_y / 1 / 1);
        REQUIRE((it + 3)->at(_0) == 2022_y / 1 / 3);
        REQUIRE((it + 4)->at(_0) == 2022_y / 1 / 4);

        REQUIRE((it + 0)->at(_1) == missing);
        REQUIRE((it + 1)->at(_1) == false);
        REQUIRE((it + 2)->at(_1) == true);
        REQUIRE((it + 3)->at(_1) == true);
        REQUIRE((it + 4)->at(_1) == true);

        REQUIRE((it + 0)->at(_2) == 2022_y / 1 / 2);
        REQUIRE((it + 1)->at(_2) == 2022_y / 1 / 1);
        REQUIRE((it + 2)->at(_2) == 2022_y / 1 / 1);
        REQUIRE((it + 3)->at(_2) == 2022_y / 1 / 3);
        REQUIRE((it + 4)->at(_2) == 2022_y / 1 / 4);

        REQUIRE((it + 0)->at(_3) == 10.0);
        REQUIRE((it + 1)->at(_3) == 9.0);
        REQUIRE((it + 2)->at(_3) == 9.0);
        REQUIRE((it + 3)->at(_3) == 11.0);
        REQUIRE((it + 4)->at(_3) == 12.2);
    }

    // The right frame has 2022-01-01 twice and no 2022-01-02: the left row for
    // 01-01 is paired with both right rows, and the left-only 01-02 row shows
    // up with missing right cells.
    SECTION("right duplicates")
    {
        frame<mi<year_month_day>, mi<bool>> f1;
        f1.set_column_names("date", "rain");
        f1.push_back(2022_y / January / 1, false);
        f1.push_back(2022_y / January / 2, true);
        f1.push_back(2022_y / January / 3, true);
        f1.push_back(2022_y / January / 4, true);

        frame<mi<year_month_day>, mi<double>> f2;
        f2.set_column_names("date", "temperature");
        f2.push_back(2022_y / January / 1, 9.0);
        f2.push_back(2022_y / January / 1, 10.0);
        f2.push_back(2022_y / January / 3, 11.0);
        f2.push_back(2022_y / January / 4, 12.2);

        auto res = outerjoin(f1, _0, f2, _0);
        // Rows 0 and 1 tie on both (_0, _1), so the temperature column is
        // added as a tie-breaker; without it the 9 / 10 order asserted below
        // would depend on how sort() happens to order equal keys.
        res.sort(_0, _1, _3);
        dout << res;
        REQUIRE(res.size() == 5);

        // 0| 2022-01-01 | false | 2022-01-01 | 9
        // 1| 2022-01-01 | false | 2022-01-01 | 10
        // 2| 2022-01-02 | true  | missing    | missing
        // 3| 2022-01-03 | true  | 2022-01-03 | 11
        // 4| 2022-01-04 | true  | 2022-01-04 | 12.2

        auto it = res.cbegin();
        REQUIRE((it + 0)->at(_0) == 2022_y / 1 / 1);
        REQUIRE((it + 1)->at(_0) == 2022_y / 1 / 1);
        REQUIRE((it + 2)->at(_0) == 2022_y / 1 / 2);
        REQUIRE((it + 3)->at(_0) == 2022_y / 1 / 3);
        REQUIRE((it + 4)->at(_0) == 2022_y / 1 / 4);

        REQUIRE((it + 0)->at(_1) == false);
        REQUIRE((it + 1)->at(_1) == false);
        REQUIRE((it + 2)->at(_1) == true);
        REQUIRE((it + 3)->at(_1) == true);
        REQUIRE((it + 4)->at(_1) == true);

        REQUIRE((it + 0)->at(_2) == 2022_y / 1 / 1);
        REQUIRE((it + 1)->at(_2) == 2022_y / 1 / 1);
        REQUIRE((it + 2)->at(_2) == missing);
        REQUIRE((it + 3)->at(_2) == 2022_y / 1 / 3);
        REQUIRE((it + 4)->at(_2) == 2022_y / 1 / 4);

        REQUIRE((it + 0)->at(_3) == 9.0);
        REQUIRE((it + 1)->at(_3) == 10.0);
        REQUIRE((it + 2)->at(_3) == missing);
        REQUIRE((it + 3)->at(_3) == 11.0);
        REQUIRE((it + 4)->at(_3) == 12.2);
    }

    // Empty left frame: every right row survives with missing left cells.
    // The sort includes _2 because _0 and _1 are missing in every row.
    SECTION("left empty")
    {
        frame<mi<year_month_day>, mi<bool>> f1;
        f1.set_column_names("date", "rain");

        frame<mi<year_month_day>, mi<double>> f2;
        f2.set_column_names("date", "temperature");
        f2.push_back(2022_y / January / 1, 9.0);
        f2.push_back(2022_y / January / 2, 10.0);
        f2.push_back(2022_y / January / 3, 11.0);
        f2.push_back(2022_y / January / 4, 12.2);

        auto res = outerjoin(f1, _0, f2, _0);
        res.sort(_0, _1, _2);
        dout << res;
        REQUIRE(res.size() == 4);

        // 0| missing | missing | 2022-01-01 | 9.0
        // 1| missing | missing | 2022-01-02 | 10.0
        // 2| missing | missing | 2022-01-03 | 11.0
        // 3| missing | missing | 2022-01-04 | 12.2

        auto it = res.cbegin();
        REQUIRE((it + 0)->at(_0) == missing);
        REQUIRE((it + 1)->at(_0) == missing);
        REQUIRE((it + 2)->at(_0) == missing);
        REQUIRE((it + 3)->at(_0) == missing);

        REQUIRE((it + 0)->at(_1) == missing);
        REQUIRE((it + 1)->at(_1) == missing);
        REQUIRE((it + 2)->at(_1) == missing);
        REQUIRE((it + 3)->at(_1) == missing);

        REQUIRE((it + 0)->at(_2) == 2022_y / 1 / 1);
        REQUIRE((it + 1)->at(_2) == 2022_y / 1 / 2);
        REQUIRE((it + 2)->at(_2) == 2022_y / 1 / 3);
        REQUIRE((it + 3)->at(_2) == 2022_y / 1 / 4);

        REQUIRE((it + 0)->at(_3) == 9.0);
        REQUIRE((it + 1)->at(_3) == 10.0);
        REQUIRE((it + 2)->at(_3) == 11.0);
        REQUIRE((it + 3)->at(_3) == 12.2);
    }

    // Empty right frame: every left row survives with missing right cells.
    SECTION("right empty")
    {
        frame<mi<year_month_day>, mi<bool>> f1;
        f1.set_column_names("date", "rain");
        f1.push_back(2022_y / January / 1, false);
        f1.push_back(2022_y / January / 2, true);
        f1.push_back(2022_y / January / 3, true);
        f1.push_back(2022_y / January / 4, true);

        frame<mi<year_month_day>, mi<double>> f2;
        f2.set_column_names("date", "temperature");

        auto res = outerjoin(f1, _0, f2, _0);
        res.sort(_0, _1);
        dout << res;
        REQUIRE(res.size() == 4);

        // 0| 2022-01-01 | false | missing | missing
        // 1| 2022-01-02 | true  | missing | missing
        // 2| 2022-01-03 | true  | missing | missing
        // 3| 2022-01-04 | true  | missing | missing

        auto it = res.cbegin();
        REQUIRE((it + 0)->at(_0) == 2022_y / 1 / 1);
        REQUIRE((it + 1)->at(_0) == 2022_y / 1 / 2);
        REQUIRE((it + 2)->at(_0) == 2022_y / 1 / 3);
        REQUIRE((it + 3)->at(_0) == 2022_y / 1 / 4);

        REQUIRE((it + 0)->at(_1) == false);
        REQUIRE((it + 1)->at(_1) == true);
        REQUIRE((it + 2)->at(_1) == true);
        REQUIRE((it + 3)->at(_1) == true);

        REQUIRE((it + 0)->at(_2) == missing);
        REQUIRE((it + 1)->at(_2) == missing);
        REQUIRE((it + 2)->at(_2) == missing);
        REQUIRE((it + 3)->at(_2) == missing);

        REQUIRE((it + 0)->at(_3) == missing);
        REQUIRE((it + 1)->at(_3) == missing);
        REQUIRE((it + 2)->at(_3) == missing);
        REQUIRE((it + 3)->at(_3) == missing);
    }
}

TEST_CASE("replace_missing", "[frame]")
{
frame<mi<year_month_day>, mi<double>, mi<bool>> f1;
Expand Down

0 comments on commit 7d2a64a

Please sign in to comment.