In [1]:
import sys, os
import pandas as pd
import json

# Append the directory two levels above the current working directory to the
# module search path before importing the joiner.
# NOTE(review): presumably that directory is where the `datamart` package
# lives; the result depends on where the kernel was started — confirm.
sys.path.append(os.path.join(os.getcwd(), '../..'))
from datamart.joiners.rltk_joiner import RLTKJoiner



### Load the supply dataset - taxi example:

In [2]:
# Load the taxi table; it is passed as the left side of the join later on.
taxi_df = pd.read_csv('example/rltk_joiner_example/taxi/left.csv')
# Read the metadata that accompanies the table. The encoding is pinned because
# JSON text is UTF-8, whereas open() otherwise falls back to the platform's
# locale encoding and could mis-decode non-ASCII characters.
with open('example/rltk_joiner_example/taxi/left.json', encoding='utf-8') as f:
    taxi_meta = json.load(f)
# Preview the first rows.
print(taxi_df.head())

   d3mIndex tpep_pickup_datetime  num_pickups      city
0         0  2018-01-01 00:00:00           67  New York
1         1  2018-01-01 01:00:00            8  New York
2         2  2018-01-01 02:00:00            0  New York
3         3  2018-01-01 03:00:00            0  New York
4         4  2018-01-01 04:00:00            7  New York


### Load the dataset to join - from NOAA:

In [3]:
# Load the table to join against; it is passed as the right side of the join.
taxi_df_join = pd.read_csv('example/rltk_joiner_example/taxi/right.csv')
# Read the metadata that accompanies the table. The encoding is pinned because
# JSON text is UTF-8, whereas open() otherwise falls back to the platform's
# locale encoding and could mis-decode non-ASCII characters.
with open('example/rltk_joiner_example/taxi/right.json', encoding='utf-8') as f:
    taxi_meta_join = json.load(f)
# Preview the first rows.
print(taxi_df_join.head())

                  date          stationid      city  AWND
0  2018-01-01T00:00:00  GHCND:USW00014732  new york    76
1  2018-01-01T00:00:00  GHCND:USW00014734  new york    57
2  2018-01-01T00:00:00  GHCND:USW00054743  new york    29
3  2018-01-01T00:00:00  GHCND:USW00054787  new york    38
4  2018-01-01T00:00:00  GHCND:USW00094728  new york    35


### Join the two datasets:

In [4]:
joiner = RLTKJoiner()
# Widen the console rendering so the joined rows are not wrapped when printed.
pd.set_option("display.width", 200)

# Each inner list is one join key. Going by the previews printed earlier in
# this notebook, positions 1 and 3 on the left are the pickup datetime and the
# city, and positions 0 and 2 on the right are the date and the city.
taxi_join_args = {
    "left_df": taxi_df,
    "right_df": taxi_df_join,
    "left_columns": [[1], [3]],
    "right_columns": [[0], [2]],
    "left_metadata": taxi_meta,
    "right_metadata": taxi_meta_join,
}
taxi_res = joiner.join(**taxi_join_args)

# Show the first 30 joined rows.
taxi_preview = taxi_res.head(30)
print(taxi_preview)

    d3mIndex tpep_pickup_datetime  num_pickups      city          stationid  AWND
0          0  2018-01-01 00:00:00           67  New York  GHCND:USW00094789    79
1          1  2018-01-01 01:00:00            8  New York  GHCND:USW00094789    79
2          2  2018-01-01 02:00:00            0  New York  GHCND:USW00094789    79
3          3  2018-01-01 03:00:00            0  New York  GHCND:USW00094789    79
4          4  2018-01-01 04:00:00            7  New York  GHCND:USW00094789    79
5          5  2018-01-01 05:00:00           10  New York  GHCND:USW00094789    79
6          6  2018-01-01 06:00:00            9  New York  GHCND:USW00094789    79
7          7  2018-01-01 07:00:00           28  New York  GHCND:USW00094789    79
8          8  2018-01-01 08:00:00          157  New York  GHCND:USW00094789    79
9          9  2018-01-01 09:00:00          259  New York  GHCND:USW00094789    79
10        10  2018-01-01 10:00:00          301  New York  GHCND:USW00094789    79
11        11  20

### Load the supply dataset - fifa example:
(Only a few columns are shown because the dataset has too many to display.)

In [5]:
# Load the FIFA table; it is passed as the left side of the join later on.
fifa_df = pd.read_csv('example/rltk_joiner_example/fifa/left.csv')
# Read the metadata that accompanies the table. The encoding is pinned because
# JSON text is UTF-8, whereas open() otherwise falls back to the platform's
# locale encoding and could mis-decode non-ASCII characters.
with open('example/rltk_joiner_example/fifa/left.json', encoding='utf-8') as f:
    fifa_meta = json.load(f)
# Preview a handful of columns only; the full table is too wide to print.
print(fifa_df[['GameID', 'Date', 'Team', 'Opponent', '1st Goal']].head())

   GameID        Date          Team      Opponent  1st Goal
0       0  14-06-2018        Russia  Saudi Arabia      12.0
1       1  14-06-2018  Saudi Arabia        Russia       NaN
2       2  15-06-2018         Egypt       Uruguay       NaN
3       3  15-06-2018       Uruguay         Egypt      89.0
4       4  15-06-2018       Morocco          Iran       NaN


### Load the dataset to join:
(Only a few columns are shown because the dataset has too many to display.)

In [6]:
# Load the table to join against; it is passed as the right side of the join.
fifa_df_join = pd.read_csv('example/rltk_joiner_example/fifa/right.csv')
# Read the metadata that accompanies the table. The encoding is pinned because
# JSON text is UTF-8, whereas open() otherwise falls back to the platform's
# locale encoding and could mis-decode non-ASCII characters.
with open('example/rltk_joiner_example/fifa/right.json', encoding='utf-8') as f:
    fifa_meta_join = json.load(f)
# Preview a handful of columns only; the full table is too wide to print.
print(fifa_df_join[['matchday', 'stage', 'homeTeam_name', 'homeTeam_id', 'awayTeam_name', 'awayTeam_id']].head(20))

    matchday        stage homeTeam_name  homeTeam_id   awayTeam_name  awayTeam_id
0        1.0  GROUP_STAGE        Russia          808    Saudi Arabia          801
1        1.0  GROUP_STAGE         Egypt          825         Uruguay          758
2        1.0  GROUP_STAGE       Morocco          815            Iran          840
3        1.0  GROUP_STAGE      Portugal          765           Spain          760
4        1.0  GROUP_STAGE        France          773       Australia          779
5        1.0  GROUP_STAGE     Argentina          762         Iceland         1066
6        1.0  GROUP_STAGE          Peru          832         Denmark          782
7        1.0  GROUP_STAGE       Croatia          799         Nigeria          776
8        1.0  GROUP_STAGE    Costa Rica          793          Serbia          780
9        1.0  GROUP_STAGE       Germany          759          Mexico          769
10       1.0  GROUP_STAGE        Brazil          764     Switzerland          788
11       1.0  GR

### Join the datasets:
(Only a few columns are shown because the dataset has too many to display.)
- By checking the ids and names, we can see whether the results are correct.

In [8]:
# A single join key made of two columns on each side.
# NOTE(review): positions 3/4 (left) and 22/24 (right) presumably pick the
# team-name columns of each table — confirm against the CSV headers.
fifa_join_args = {
    "left_df": fifa_df,
    "right_df": fifa_df_join,
    "left_columns": [[3, 4]],
    "right_columns": [[22, 24]],
    "left_metadata": fifa_meta,
    "right_metadata": fifa_meta_join,
}
fifa_res = joiner.join(**fifa_join_args)

# Print a subset of the joined columns for the first 30 rows.
fifa_preview_columns = [
    'Date',
    'Team',
    'Opponent',
    '1st Goal',
    'score_fullTime_homeTeam',
    'score_fullTime_awayTeam',
]
print(fifa_res[fifa_preview_columns].head(30))

          Date            Team        Opponent  1st Goal  score_fullTime_homeTeam  score_fullTime_awayTeam
0   14-06-2018          Russia    Saudi Arabia      12.0                        5                        0
1   14-06-2018    Saudi Arabia          Russia       NaN                        5                        0
2   15-06-2018           Egypt         Uruguay       NaN                        0                        1
3   15-06-2018         Uruguay           Egypt      89.0                        0                        1
4   15-06-2018         Morocco            Iran       NaN                        0                        1
5   15-06-2018            Iran         Morocco      90.0                        0                        1
6   15-06-2018        Portugal           Spain       4.0                        3                        3
7   15-06-2018           Spain        Portugal      24.0                        3                        3
8   16-06-2018          France       