In [1]:
from datamart import search, augment
from datamart.utilities.utils import Utils
import pandas as pd
import json

In [2]:
old_df = pd.read_csv("./example/fifa_example/fifa.csv")
print("- READ THE SUPPLY DATASET -\n")
print(old_df.iloc[:10, :])

- READ THE SUPPLY DATASET -

   d3mIndex  GameID        Date          Team      Opponent  \
0         0       0  14-06-2018        Russia  Saudi Arabia   
1         1       1  14-06-2018  Saudi Arabia        Russia   
2         2       2  15-06-2018         Egypt       Uruguay   
3         3       3  15-06-2018       Uruguay         Egypt   
4         4       4  15-06-2018       Morocco          Iran   
5         5       5  15-06-2018          Iran       Morocco   
6         6       6  15-06-2018      Portugal         Spain   
7         7       7  15-06-2018         Spain      Portugal   
8         8       8  16-06-2018        France     Australia   
9         9       9  16-06-2018     Australia        France   

   Ball Possession %  Off-Target  Blocked  Offsides  Saves  Pass Accuracy %  \
0                 40           3        3         3      0               78   
1                 60           3        3         1      2               86   
2                 43           3        

In [3]:
query_json = {
    "dataset": {
        "about": "Man of Match, Soccer game, FIFA, best player, team, MVP, Goal, opponent",
        "description": ["FIFA", "worldcup", "Soccer game", "European Cup"]
    },
    "required_variables": [
        {
            "type": "dataframe_columns",
            "name": [
              "Team"
            ]
        },
        {
            "type": "dataframe_columns",
            "name": [
              "Opponent"
            ]
        }
    ],
    "desired_variables": [
        {
            "type": "generic_entity",
            "about": "score_winner",
            "variable_syntactic_type": [
                "http://schema.org/Text"
            ]
        }
    ]
}
results = search(query_json, old_df)
print("- SEARCH DATAMART BY A DESCRIPTION JSON OBJECT -\n")
print("Returned %d Datasets" % len(results))
for res in results:
    print(res.id)
    print('\t' + '\n\t'.join(json.dumps(res.metadata, indent=2).split('\n', 50)[:50]))
    print('\t... ...')

- SEARCH DATAMART BY A DESCRIPTION JSON OBJECT -

Returned 1 Datasets
127860000
	{
	  "datamart_id": 127860000,
	  "title": "FIFA World Cup",
	  "description": "FIFA World Cup",
	  "url": "https://www.football-data.org",
	  "keywords": [
	    "football",
	    "competition"
	  ],
	  "provenance": {
	    "source": "www.football-data.org"
	  },
	  "materialization": {
	    "python_path": "football_match_materializer",
	    "arguments": {
	      "uri": "/v2/competitions/2000/matches?limit=999",
	      "token": "d019bc4541c9490fabcba6806cbcc42b"
	    }
	  },
	  "variables": [
	    {
	      "datamart_id": 127860001,
	      "name": "id",
	      "semantic_type": [
	        "http://schema.org/Integer"
	      ],
	      "description": "column name: id, dtype: int64"
	    },
	    {
	      "datamart_id": 127860002,
	      "name": "season_id",
	      "semantic_type": [
	        "http://schema.org/Integer"
	      ],
	      "description": "column name: season_id, dtype: int64"
	    },
	    {
	      "d

In [4]:
new_df = Utils.get_dataset(metadata=results[0].metadata)
print("- MATERIALIZE SEARCH RESULTS -\n")
print(new_df.iloc[:10, :])

- MATERIALIZE SEARCH RESULTS -

       id  season_id season_startDate season_endDate  season_currentMatchday  \
0  200000          1       2018-06-14     2018-07-15                       3   
1  200001          1       2018-06-14     2018-07-15                       3   
2  200006          1       2018-06-14     2018-07-15                       3   
3  200007          1       2018-06-14     2018-07-15                       3   
4  200012          1       2018-06-14     2018-07-15                       3   
5  200018          1       2018-06-14     2018-07-15                       3   
6  200013          1       2018-06-14     2018-07-15                       3   
7  200019          1       2018-06-14     2018-07-15                       3   
8  200024          1       2018-06-14     2018-07-15                       3   
9  200030          1       2018-06-14     2018-07-15                       3   

                utcDate    status  matchday        stage    group  \
0  2018-06-14T15:0

In [5]:
target_dataset = results[0]
teams_col = [[old_df.columns.tolist().index(x)] for x in ["Team", "Opponent"]]
new_teams_col = [[new_df.columns.tolist().index(x)] for x in ["homeTeam_name", "awayTeam_name"]]
target_dataset.set_match(teams_col, new_teams_col)

print("- AUGMENT BY THE SEARCHED RESULT -\n")
result = augment(old_df, target_dataset)
print(result.iloc[:10, :])

- AUGMENT BY THE SEARCHED RESULT -



   d3mIndex  GameID        Date          Team      Opponent  \
0         0       0  14-06-2018        Russia  Saudi Arabia   
1         1       1  14-06-2018  Saudi Arabia        Russia   
2         2       2  15-06-2018         Egypt       Uruguay   
3         3       3  15-06-2018       Uruguay         Egypt   
4         4       4  15-06-2018       Morocco          Iran   
5         5       5  15-06-2018          Iran       Morocco   
6         6       6  15-06-2018      Portugal         Spain   
7         7       7  15-06-2018         Spain      Portugal   
8         8       8  16-06-2018        France     Australia   
9         9       9  16-06-2018     Australia        France   

   Ball Possession %  Off-Target  Blocked  Offsides  Saves  \
0                 40           3        3         3      0   
1                 60           3        3         1      2   
2                 43           3        2         1      3   
3                 57           6        4         1      3