# Ingestion from a Jupyter Notebook

When working in a Jupyter Notebook you can send data to Tinybird Data Sources using the full range of ingestion options.

This notebook walks through the options using the example of data from recent changes to Wikipedia.

**Options for ingesting data:**

1. REST API
2. UI
3. CLI
4. High-frequency ingestion

**Example using pandas DataFrames for options 1-3.**

- create a Data Source from 5 minutes of data in `df_wiki`

- append 5 minutes of data to the Data Source from `df_wiki_new`

Based on
https://wikitech.wikimedia.org/wiki/Event_Platform/EventStreams



## Create pandas DataFrames

In [2]:
# Install the SSE client library; `SSEClient` is imported from it in the next cell.
!pip install sseclient



In [3]:
import json
import time
import pandas as pd

from sseclient import SSEClient as EventSource
from google.colab import files

In [4]:
def create_df_wiki(url='https://stream.wikimedia.org/v2/stream/recentchange', n=5):
  """Collect about `n` minutes of recent-change events from an SSE stream.

  Every 'message' event whose JSON payload is not of type 'log' is turned into
  a DataFrame with `pd.DataFrame.from_dict`, and only the row labelled
  'domain' is kept, so each kept event contributes one row.

  Args:
    url: Address of the server-sent-events stream to read.
    n: Number of minutes of data to collect. The loop stops once the
      'timestamp' of the most recently parsed event exceeds start time + n
      minutes, so it only ends when a late-enough event arrives (or the
      stream itself ends).

  Returns:
    A DataFrame with one row per kept event (index label 'domain'), or an
    empty DataFrame when no event was kept.
  """
  # Collect the per-event frames in a list and concatenate once at the end:
  # DataFrame.append was removed in pandas 2.0 and growing a frame inside the
  # loop copies all previous rows on every event.
  frames = []
  t_end = time.time() + 60 * n
  change = {'timestamp': time.time()}
  for event in EventSource(url):
    if change['timestamp'] > t_end:
      break
    if event.event != 'message':
      continue
    try:
      change = json.loads(event.data)
    except ValueError:
      # Payload is not valid JSON: keep the previous `change` and move on.
      continue
    if change['type'] != 'log':
      df = pd.DataFrame.from_dict(change)
      frames.append(df[df.index == 'domain'])
  if not frames:
    return pd.DataFrame()
  return pd.concat(frames)

DataFrame of 5 minutes of data (`n=5`) used to create the Data Source

In [5]:
# Collect five minutes of events, then leave out the columns that are not ingested.
df_wiki = create_df_wiki(n=5)
df_wiki = df_wiki.drop(columns=['$schema', 'length', 'revision'])

In [6]:
# Summarise the collected frame: columns, non-null counts and dtypes.
df_wiki.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7937 entries, domain to domain
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   meta                7937 non-null   object
 1   id                  7937 non-null   int64 
 2   type                7937 non-null   object
 3   namespace           7937 non-null   int64 
 4   title               7937 non-null   object
 5   comment             7937 non-null   object
 6   timestamp           7937 non-null   int64 
 7   user                7937 non-null   object
 8   bot                 7937 non-null   bool  
 9   server_url          7937 non-null   object
 10  server_name         7937 non-null   object
 11  server_script_path  7937 non-null   object
 12  wiki                7937 non-null   object
 13  parsedcomment       7937 non-null   object
 14  minor               4315 non-null   object
 15  patrolled           2678 non-null   object
dtypes: bool(1), int64(3), 

DataFrame of 5 minutes of data to append to Data Source

In [7]:
# Second five-minute batch, reduced to the same columns as `df_wiki`.
df_wiki_new = create_df_wiki(n=5)
df_wiki_new = df_wiki_new.drop(columns=['$schema', 'length', 'revision'])

## Option 1: Ingest to Tinybird using the REST API

In [8]:
import os

# Make sure `token` exists before it is used: keep a value already set in the
# kernel, otherwise read the TB_TOKEN environment variable (empty string when
# unset). Without this, a fresh kernel fails here with NameError.
token = globals().get('token') or os.environ.get('TB_TOKEN', '')
if token == '':
   print("Get your token from your Tinybird workspace.")

### Create Data Source from a CSV File

Column names are read from the first row, column data types inferred.

In [11]:
header = f'"Authorization: Bearer {token}"'

name = 'wiki_api_csv'
mode = 'create'
url = 'https://api.tinybird.co/v0/datasources'
endpoint = f'"{url}?mode={mode}&name={name}"'

filename = name + 'csv'
df_wiki.to_csv(filename, index=False)

!curl -H $header -X POST $endpoint -F csv=@{filename}

{
    "import_id": "588ce36f-765e-4f29-b1dc-b88b4558eb40",
    "datasource": {
        "id": "t_66ca1998b5e247fa8a0409c5cc2ba4b9",
        "name": "wiki_api_csv",
        "cluster": null,
        "tags": {},
        "created_at": "2022-02-17 11:30:57.517727",
        "updated_at": "2022-02-17 11:30:57.823016",
        "replicated": false,
        "version": 0,
        "project": null,
        "headers": {
            "dialect": {
                "header": "['meta', 'id', 'type', 'namespace', 'title', 'comment', 'timestamp', 'user', 'bot', 'server_url', 'server_name', 'server_script_path', 'wiki', 'parsedcomment', 'minor', 'patrolled']",
                "header_hash": -2164950873547256806
            }
        },
        "shared_with": [],
        "engine": {
            "engine": "MergeTree",
            "partition_key": "substring(meta, 1, 1)",
            "sorting_key": "meta, intHash32(id)",
            "sampling_key": "intHash32(id)"
        },
        "used_by": [],
        "type"

### Append to Data Source from a CSV File

In [12]:
filename = 'wiki_new_api_csv.csv'
df_wiki_new.to_csv(filename, index=False)

mode = 'append'
endpoint = f'"{url}?mode={mode}&name={name}"' 

!curl -H $header -X POST $endpoint -F csv=@{filename}

{
    "import_id": "c99bee4b-7850-4fc1-ae42-52db2e265355",
    "datasource": {
        "id": "t_66ca1998b5e247fa8a0409c5cc2ba4b9",
        "name": "wiki_api_csv",
        "cluster": null,
        "tags": {},
        "created_at": "2022-02-17 11:30:57.517727",
        "updated_at": "2022-02-17 11:30:58.815457",
        "replicated": false,
        "version": 0,
        "project": null,
        "headers": {
            "dialect": {
                "header": "['meta', 'id', 'type', 'namespace', 'title', 'comment', 'timestamp', 'user', 'bot', 'server_url', 'server_name', 'server_script_path', 'wiki', 'parsedcomment', 'minor', 'patrolled']",
                "header_hash": -2164950873547256806
            },
            "cached_delimiter": ","
        },
        "shared_with": [],
        "engine": {
            "engine": "MergeTree",
            "partition_key": "substring(meta, 1, 1)",
            "sorting_key": "meta, intHash32(id)",
            "sampling_key": "intHash32(id)"
        },


### Create Data Source from Data in Memory
based on https://gist.github.com/alrocar/9b1b860cf74ac6f2ad115c3cb2945e93

In [13]:
import csv
import requests

from io import StringIO
from requests.adapters import HTTPAdapter

from urllib3.util.retry import Retry
from urllib.parse import urlencode

In [14]:
def ingest_from_array(rows,
                      datasource,
                      token, mode='append',
                      endpoint='https://api.tinybird.co'):
  """Send `rows` to a Tinybird Data Source as a single CSV upload.

  All rows are serialised to CSV in memory (non-numeric values quoted) and
  posted once to `{endpoint}/v0/datasources` as the multipart field `csv`.
  The outcome is reported with `print`; nothing is returned.

  Args:
    rows: Iterable of row sequences. Any iterable works, including
      generators; it is consumed exactly once.
    datasource: Name of the target Data Source (sent as `name`).
    token: Token sent in the `Authorization: Bearer` header.
    mode: Value of the `mode` query parameter, e.g. 'create' or 'append'.
    endpoint: Base URL of the API host.
  """
  url = f'{endpoint}/v0/datasources'
  # Passed as `params` so that requests URL-encodes the values.
  params = {'mode': mode, 'name': datasource}
  headers = {
      'Authorization': f'Bearer {token}',
      'X-TB-Client': 'pltx-0.1',
  }

  csv_chunk = StringIO()
  writer = csv.writer(csv_chunk, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
  records = 0
  for row in rows:
    writer.writerow(row)
    records += 1

  # Flush once after the loop instead of testing `len(rows) == records` on
  # every row; this also removes the need for `rows` to support len().
  if records:
    data = csv_chunk.getvalue()
    retry = Retry(total=5, backoff_factor=0.2)
    adapter = HTTPAdapter(max_retries=retry)
    _session = requests.Session()
    try:
      _session.mount('http://', adapter)
      _session.mount('https://', adapter)
      response = _session.post(url, params=params, headers=headers, files=dict(csv=data))
      try:
        result = response.json()
      except ValueError:
        # Body is not JSON; fall back to the raw text in the failure report.
        result = {}

      if response.status_code < 400:
        print(f"Flushed {len(data)} bytes, datasource={datasource}, response={response.status_code}")
        print(f"Result id={result.get('import_id', None)}, error={result.get('error', False)}")
      else:
        # Previously a rejected upload produced no output at all.
        print(f"Ingestion failed, datasource={datasource}, response={response.status_code}, "
              f"error={result.get('error', response.text)}")
    except Exception as e:
      # Best effort, as before: report the problem and carry on.
      print(e)
    finally:
      _session.close()

  print('Done')

Column names are read from the first row of 'rows'; column data types are inferred from 'rows'.

In [16]:
# Column names go in the first row, followed by the data rows.
rows = [df_wiki.columns.tolist()] + df_wiki.values.tolist()

endpoint = 'https://api.tinybird.co'
datasource = 'wiki_api_mem'
mode = 'create'

ingest_from_array(rows, datasource, token, mode, endpoint)

Flushed 3524852 bytes, datasource=wiki_api_mem, response=200
Result id=bfab81ca-959a-4c0a-bc84-4fb1f0552bdf, error=False
Done


### Append to Data Source from Data in Memory

In [17]:
# Data rows only this time: no header row is added for the append.
rows = df_wiki_new.values.tolist()
mode = 'append'

ingest_from_array(rows, datasource, token, mode=mode, endpoint=endpoint)

Flushed 3557152 bytes, datasource=wiki_api_mem, response=200
Result id=2bfac483-21f3-41b9-aeaa-095400d0dace, error=False
Done


## Option 2: Download Local File then ingest to Tinybird through the UI

- CSV
- NDJSON

The inferred column names and types can be changed in the preview in the UI, for example, the column `type` can be changed to `LowCardinality(String)`.

### Format CSV

In [18]:
# Write the frame to a CSV file and pass it to `files.download`. `files` is
# imported from google.colab, so this step presumably needs a Colab runtime.
df_wiki.to_csv("wiki_ui_csv.csv", index=False)
files.download('wiki_ui_csv.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Format NDJSON

In [19]:
# One JSON record per line (NDJSON), with non-ASCII characters left unescaped.
df_wiki.to_json("wiki_ui_ndjson.ndjson", orient="records", lines=True, force_ascii=False)
files.download("wiki_ui_ndjson.ndjson")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Option 3: Ingest to Tinybird from the CLI
- CSV
- NDJSON

In [20]:
# Install the Tinybird CLI; the `tb` commands in the cells below come from it.
!pip install tinybird-cli

Collecting tinybird-cli
  Downloading tinybird_cli-1.0.0b96-py3-none-any.whl (84 kB)
[?25l[K     |███▉                            | 10 kB 24.2 MB/s eta 0:00:01[K     |███████▊                        | 20 kB 12.7 MB/s eta 0:00:01[K     |███████████▋                    | 30 kB 6.8 MB/s eta 0:00:01[K     |███████████████▌                | 40 kB 6.4 MB/s eta 0:00:01[K     |███████████████████▍            | 51 kB 2.8 MB/s eta 0:00:01[K     |███████████████████████▎        | 61 kB 3.3 MB/s eta 0:00:01[K     |███████████████████████████▏    | 71 kB 3.5 MB/s eta 0:00:01[K     |███████████████████████████████ | 81 kB 3.8 MB/s eta 0:00:01[K     |████████████████████████████████| 84 kB 1.8 MB/s 
Collecting toposort==1.5
  Downloading toposort-1.5-py2.py3-none-any.whl (7.6 kB)
Collecting click==7.0
  Downloading Click-7.0-py2.py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.1 MB/s 
[?25hCollecting requests-toolbelt==0.9.1
  Downloading requests_toolbe

In [21]:
import os

# Make sure `token` exists before it is used: keep a value already set in the
# kernel, otherwise read the TB_TOKEN environment variable (empty string when
# unset). Without this, a fresh kernel fails here with NameError.
token = globals().get('token') or os.environ.get('TB_TOKEN', '')
if token == '':
   print("Get your token from your Tinybird workspace.")

In [22]:
def write_text_to_file(filename, text):
  """Write `text` to the file `filename`, replacing any existing content."""
  with open(filename, 'w') as handle:
    handle.write(text)

### Format CSV

In [23]:
# CSV file that the `tb datasource generate` and `tb datasource append` cells below read.
df_wiki.to_csv("wiki_cli_csv.csv", index=False)

The schema for the Data Source can be generated from the CSV file or written from code. 

In [24]:
# Generate the file wiki_cli_csv.datasource from the CSV file.
!tb --token=$token datasource generate wiki_cli_csv.csv

[92m** Generated wiki_cli_csv.datasource
** => Create it on the server running: $ tb push wiki_cli_csv.datasource
** => Append data using: $ tb datasource append wiki_cli_csv wiki_cli_csv.csv
[0m


In [25]:
# Or write the file wiki_cli_csv.datasource by hand, choosing data types, sorting key etc.
# This overwrites the file of the same name produced by `tb datasource generate`.
# NOTE(review): the columns below are listed in a different order than the columns of
# `df_wiki` (in the `df_wiki.info()` output, `minor` and `patrolled` come last), and the
# CSV is written from `df_wiki`. Confirm whether `tb datasource append` matches CSV
# columns by name or by position.
filename = 'wiki_cli_csv.datasource'
text='''
SCHEMA >
    `meta` LowCardinality(String),
    `id` Int64,
    `type` String,
    `namespace` Int16,
    `title` String,
    `comment` Nullable(String),
    `timestamp` Int64,
    `user` String,
    `bot` String,
    `minor` Nullable(String),
    `patrolled` Nullable(String),
    `server_url` LowCardinality(String),
    `server_name` LowCardinality(String),
    `server_script_path` String,
    `wiki` LowCardinality(String),
    `parsedcomment` Nullable(String)

ENGINE "MergeTree"
ENGINE_SORTING_KEY "timestamp"
'''

write_text_to_file(filename, text)

In [26]:
# Create the Data Source on the server from the .datasource file, then append the CSV rows to it.
!tb --token=$token push wiki_cli_csv.datasource
!tb --token=$token datasource append wiki_cli_csv wiki_cli_csv.csv

[0m** Processing wiki_cli_csv.datasource[0m
[0m** Building dependencies[0m
[0m** Running wiki_cli_csv [0m
[92m** 'wiki_cli_csv' created[0m
[0m** Not pushing fixtures[0m
[0m** 🥚 starting import process[0m
[92m** 🐥 done[0m
[92m** Total rows in wiki_cli_csv: 7937[0m
[92m** Data appended to Data Source 'wiki_cli_csv' successfully![0m
[0m** Data pushed to wiki_cli_csv[0m


### Format NDJSON

In [27]:
# One JSON record per line (NDJSON), with non-ASCII characters left unescaped.
df_wiki.to_json("wiki_cli_ndjson.ndjson", orient="records", lines=True, force_ascii=False)

The schema for the Data Source can be generated from the NDJSON file or written from code. 

In [28]:
# Generate the file wiki_cli_ndjson.datasource from the NDJSON file.
!tb --token=$token datasource generate wiki_cli_ndjson.ndjson

[92m** Generated wiki_cli_ndjson.datasource
** => Create it on the server running: $ tb push wiki_cli_ndjson.datasource
** => Append data using: $ tb datasource append wiki_cli_ndjson wiki_cli_ndjson.ndjson
[0m


In [29]:
# Or write the file wiki_cli_ndjson.datasource by hand, choosing data types, sorting key etc.
# This overwrites the file of the same name produced by `tb datasource generate`.
# Each column is bound to a field of the NDJSON record through its `json:$.<field>` path.
filename = 'wiki_cli_ndjson.datasource'
text='''
SCHEMA >

    bot UInt8 `json:$.bot`,
    comment Nullable(String) `json:$.comment`,
    id Int64 `json:$.id`,
    meta LowCardinality(String) `json:$.meta`,
    minor Nullable(UInt8) `json:$.minor`,
    namespace Int16 `json:$.namespace`,
    parsedcomment Nullable(String) `json:$.parsedcomment`,
    patrolled Nullable(UInt8) `json:$.patrolled`,
    server_name String `json:$.server_name`,
    server_script_path String `json:$.server_script_path`,
    server_url String `json:$.server_url`,
    timestamp Int64 `json:$.timestamp`,
    title String `json:$.title`,
    type String `json:$.type`,
    user String `json:$.user`,
    wiki LowCardinality(String) `json:$.wiki`
    
ENGINE "MergeTree"
ENGINE_SORTING_KEY "timestamp"
'''

write_text_to_file(filename, text)

In [30]:
# Create the Data Source on the server from the .datasource file, then append the NDJSON rows to it.
!tb --token=$token push wiki_cli_ndjson.datasource
!tb --token=$token datasource append wiki_cli_ndjson wiki_cli_ndjson.ndjson

[0m** Processing wiki_cli_ndjson.datasource[0m
[0m** Building dependencies[0m
[0m** Running wiki_cli_ndjson [0m
[92m** 'wiki_cli_ndjson' created[0m
[0m** Not pushing fixtures[0m
[0m** 🥚 starting import process[0m
[92m** 🐥 done[0m
[92m** Appended 0 new rows[0m
[92m** Total rows in wiki_cli_ndjson: 7937[0m
[92m** Data appended to Data Source 'wiki_cli_ndjson' successfully![0m
[0m** Data pushed to wiki_cli_ndjson[0m


## Option 4: Stream to Tinybird using High-Frequency Ingestion
Here events are streamed directly to the Data Source from the Wikipedia stream using [high-frequency ingestion](https://guides.tinybird.co/guide/high-frequency-ingestion). The data is not first written to a pandas DataFrame.

With `mode='create'` the data types are inferred. To avoid rows going into quarantine, a few more columns need to be `Nullable` than are inferred. Directly defining the schema after exploring the automatically created Data Source in the UI solves this issue. 

In [31]:
# Dependencies for this section: the SSE client for the event stream and the
# Tinybird CLI (`tb`) used to push the Data Source definition below.
!pip install sseclient
!pip install tinybird-cli -q -U



In [32]:
import json
import requests
import time

import pandas as pd
from sseclient import SSEClient as EventSource
from urllib3.util.retry import Retry

from requests.adapters import HTTPAdapter

In [33]:
import os

# Make sure `token` exists before it is used: keep a value already set in the
# kernel, otherwise read the TB_TOKEN environment variable (empty string when
# unset). Without this, a fresh kernel fails here with NameError.
token = globals().get('token') or os.environ.get('TB_TOKEN', '')
if token == '':
   print("Get your token from your Tinybird workspace.")

In [34]:
def write_text_to_file(filename, text):
  """Create or overwrite `filename` so that it contains exactly `text`."""
  with open(filename, 'w') as out_file:
    out_file.write(text)

In [35]:
# Explicit schema for the streamed events. Nested JSON fields (meta.*, length.*,
# revision.*) each get their own column through a json path, and `minor`,
# `patrolled`, `length_*` and `revision_*` are declared Nullable.
filename = 'wiki_hfi.datasource'
text='''
SCHEMA >
    `DOLLAR_SIGN_schema` String `json:$.['$schema']`,
    `bot` UInt8 `json:$.bot`,
    `comment` String `json:$.comment`,
    `id` Int64 `json:$.id`,
    `length_new` Nullable(Int32) `json:$.length.new`,
    `length_old` Nullable(Int32) `json:$.length.old`,
    `meta_domain` String `json:$.meta.domain`,
    `meta_dt` DateTime `json:$.meta.dt`,
    `meta_id` String `json:$.meta.id`,
    `meta_offset` Int64 `json:$.meta.offset`,
    `meta_partition` Int16 `json:$.meta.partition`,
    `meta_request_id` String `json:$.meta.request_id`,
    `meta_stream` String `json:$.meta.stream`,
    `meta_topic` String `json:$.meta.topic`,
    `meta_uri` String `json:$.meta.uri`,
    `minor` Nullable(UInt8) `json:$.minor`,
    `namespace` Int16 `json:$.namespace`,
    `parsedcomment` String `json:$.parsedcomment`,
    `patrolled` Nullable(UInt8) `json:$.patrolled`,
    `revision_new` Nullable(Int64) `json:$.revision.new`,
    `revision_old` Nullable(Int64) `json:$.revision.old`,
    `server_name` String `json:$.server_name`,
    `server_script_path` String `json:$.server_script_path`,
    `server_url` String `json:$.server_url`,
    `timestamp` Int64 `json:$.timestamp`,
    `title` String `json:$.title`,
    `type` String `json:$.type`,
    `user` String `json:$.user`,
    `wiki` String `json:$.wiki`

ENGINE "MergeTree"
ENGINE_SORTING_KEY "timestamp"
'''

write_text_to_file(filename, text)
# Create the (still empty) Data Source on the server before streaming into it.
!tb --token=$token push wiki_hfi.datasource

[0m** Processing wiki_hfi.datasource[0m
[0m** Building dependencies[0m
[0m** Running wiki_hfi [0m
[92m** 'wiki_hfi' created[0m
[0m** Not pushing fixtures[0m


In [36]:
# Stream about `n` minutes of recent-change events straight to the Data Source,
# one POST per event, without going through a DataFrame.
url = 'https://api.tinybird.co/v0/events'
mode = 'append'
datasource = 'wiki_hfi'
n = 1 # minutes of data from stream

retry = Retry(total=5, backoff_factor=0.2)
adapter = HTTPAdapter(max_retries=retry)
_session = requests.Session()
_session.mount('http://', adapter)
_session.mount('https://', adapter)

params = {
        'mode': mode,
        'name': datasource,
        'token': token
        }
t_end = time.time() + n*60
change = {'timestamp': time.time()}
print('Start time:', pd.Timestamp(change['timestamp'], unit='s'))

# Count outcomes so that rejected events no longer go unnoticed; the response
# of each POST used to be discarded.
sent = 0
rejected = 0
try:
    for event in EventSource('https://stream.wikimedia.org/v2/stream/recentchange'):
        # Stop once the last parsed event is stamped later than the time limit.
        if change['timestamp'] > t_end:
            break
        if event.event != 'message':
            continue
        try:
            change = json.loads(event.data)
        except ValueError:
            # Payload is not valid JSON: keep the previous `change` and move on.
            continue
        if change['type'] != 'log':
            r = _session.post(url,
                              params=params,
                              data=json.dumps(change))
            if r.status_code < 400:
                sent += 1
            else:
                rejected += 1
                if rejected == 1:
                    # Show the first failure only, to avoid flooding the output.
                    print(f'First rejected event: response={r.status_code}, body={r.text}')
finally:
    _session.close()
print('Final timestamp:', pd.Timestamp(change['timestamp'], unit='s'))
print(f'Events sent: {sent}, rejected: {rejected}')

Start time: 2022-02-17 11:39:26.494986534
Final timestamp: 2022-02-17 11:40:27
