# Perform Textual Embedding

## Loading Credentials

In [7]:
import json

# Read the local secrets file that holds the ClickHouse login details.
with open('credentials.json', 'r') as file:
    credentials = json.load(file)

# Unpack the three ClickHouse connection settings in one step.
db_server_name, db_username, db_password = (
    credentials['clickhouse'][field]
    for field in ('server_name', 'username', 'password')
)

## Test Connection

In [12]:
from clickhouse_driver import Client

# Build a client from the connection settings loaded from the credentials file.
client = Client(
    host=db_server_name,
    user=db_username,
    password=db_password,
)

# Smoke test: ask the server for its version and report success or the failure reason.
try:
    result = client.execute('SELECT version()')
    server_version = result[0][0]
    print("Connected to ClickHouse server. Version:", server_version)
except Exception as exc:
    print("Failed to connect to ClickHouse server:", exc)

Connected to ClickHouse server. Version: 25.4.1.1926


In [16]:
# Spot-check a single article by id to see what a raw row of tiingo.news looks like.
client.execute("SELECT * FROM tiingo.news where id=432557 LIMIT 2")

[('2016-01-24T11:15:34.502758+00:00',
  'Shares of Ethan Allen Interiors have slumped along with furniture sales, but pent-up demand from homebuyers and some more contemporary styles should help jumpstart the stock.',
  432557.0,
  '2004-09-29T03:59:00+00:00',
  'barrons.com',
  'Consumer Discretionary',
  'eth',
  'eth',
  'Trendy Styles May Lift Ethan Allen',
  'barrons.com',
  'http://www.barrons.com/articles/SB109639418009030197')]

In [54]:
# List the column names and ClickHouse types of the news table.
client.execute('describe tiingo.news')

[('crawlDate', 'Nullable(String)', '', '', '', '', ''),
 ('description', 'Nullable(String)', '', '', '', '', ''),
 ('id', 'Nullable(Float64)', '', '', '', '', ''),
 ('publishedDate', 'Nullable(String)', '', '', '', '', ''),
 ('source', 'Nullable(String)', '', '', '', '', ''),
 ('tags', 'Nullable(String)', '', '', '', '', ''),
 ('ticker', 'Nullable(String)', '', '', '', '', ''),
 ('tickers_all', 'Nullable(String)', '', '', '', '', ''),
 ('title', 'Nullable(String)', '', '', '', '', ''),
 ('tld', 'Nullable(String)', '', '', '', '', ''),
 ('url', 'Nullable(String)', '', '', '', '', '')]

## Filter Tiingo Data

In [17]:
import pandas as pd
from clickhouse_driver import Client

# New client for this cell, built from the connection settings loaded earlier.
client = Client(host=db_server_name, user=db_username, password=db_password)

# TODO: other filters?
# NOTE(review): ticker 'eth' also matches Ethan Allen Interiors (NYSE:ETH) articles,
# as the rows displayed for this query show — a stricter filter is probably needed; confirm.
# NOTE(review): `distinct(id)` presumably applies DISTINCT to the whole selected row,
# not to `id` alone, so duplicate ids may still be returned — confirm.

# Query news whose ticker is btc, eth, doge or sol, dated on/after 2016-01-01,
# keeping only rows whose description LENGTH is at most 8192.
query = f"""
SELECT  distinct(id) as id,
        description,
        ticker,
        tickers_all,
        title,
        parseDateTimeBestEffort(publishedDate) AS date 
FROM tiingo.news
WHERE ticker in ('btc', 'eth', 'doge', 'sol')
AND date >= '2016-01-01'
AND LENGTH(description) <= 8192
"""

# Fetch the result directly as a pandas DataFrame.
news_data_df = client.query_dataframe(query)

In [19]:
# Display the fetched news rows (the rendered frame is truncated by pandas).
news_data_df

Unnamed: 0,id,description,ticker,tickers_all,title,date
0,7942224.0,Sentiment for Ethan Allen Interiors Inc (NYSE:...,eth,eth,Ethan Allen Interiors Inc (NYSE:ETH) Instituti...,2018-01-02 05:31:02
1,7943602.0,,btc,btc/cssi/inc,COSTAS INC. (CSSI) Completes Full Acquisition ...,2018-01-02 08:00:00
2,22921263.0,Ethan Allen Interiors (NYSE:ETH) is scheduled ...,eth,eth,Ethan Allen Interiors Q2 2020 Earnings Preview,2020-02-04 06:35:00
3,22910463.0,"Today, we'll introduce the concept of the P/E ...",eth,eth,Is Ethan Allen Interiors Inc.’s (NYSE:ETH) Hig...,2020-02-03 20:36:59
4,22902421.0,Ethan Allen Interiors (NYSE:ETH) will issue it...,eth,eth,Ethan Allen Interiors (ETH) Set to Announce Qu...,2020-02-03 16:24:00
...,...,...,...,...,...,...
23224,22536649.0,Ethan Allen Interiors Inc. (NYSE:ETH) – Analys...,eth,eth/key,Ethan Allen Interiors Inc. (NYSE:ETH) to Post ...,2020-01-16 15:02:42
23225,23356431.0,The U.S. Department of Homeland Security has r...,btc,btc/cvc/msb,US Develops Cryptocurrency Intelligence Progra...,2020-02-25 01:27:27
23226,23360489.0,“I don’t have any bitcoin. I don’t own any cry...,btc,brk-a/brk-b/btc/tron,"Warren Buffett Slates Bitcoin, Denies Owning C...",2020-02-25 04:45:09
23227,23363822.0,"At 1.8% after the halving, bitcoin’s inflation...",btc,bch/btc,Bitcoin Halving Will Drop Inflation Rate Lower...,2020-02-25 07:45:53


## Mistral
https://docs.mistral.ai/capabilities/embeddings/

### Example

In [25]:
from mistralai import Mistral

# Minimal single-sentence request to try out the embeddings endpoint.
api_key = credentials['api_key']['mistral']
mistral_client = Mistral(api_key=api_key)

model = "mistral-embed"
text = "Embed this sentence."

embeddings_batch_response = mistral_client.embeddings.create(model=model, inputs=text)

# Take the first entry of the response, then show the vector and its dimensionality.
first_embedding = embeddings_batch_response.data[0].embedding
print(first_embedding)
print('length of embedding:', len(first_embedding))

[-0.01666259765625, 0.06982421875, 0.031494140625, 0.01284027099609375, 0.020660400390625, 0.0096435546875, 0.025787353515625, 0.0018548965454101562, -0.00867462158203125, -0.0087890625, -0.039703369140625, 0.058441162109375, -0.0255584716796875, 0.00775909423828125, -0.02886962890625, 0.0404052734375, 0.05499267578125, 0.0260162353515625, 0.03173828125, 0.023284912109375, -0.05682373046875, -0.0157470703125, -0.061614990234375, 0.01226806640625, -0.046112060546875, -0.0270538330078125, -0.00775909423828125, -0.03790283203125, -0.0401611328125, 0.0010061264038085938, 0.0238494873046875, -0.030120849609375, 0.0303497314453125, -0.002353668212890625, -0.0120391845703125, -0.036285400390625, -0.0330810546875, -0.044952392578125, 0.0133514404296875, 0.00186920166015625, 0.00969696044921875, -0.00034046173095703125, -0.0308074951171875, -0.0230560302734375, -0.024871826171875, -0.0296783447265625, 0.00447845458984375, -0.0279541015625, -0.0176849365234375, -0.0330810546875, 0.00987243652343

### Run Embeddings

In [23]:
from mistralai import Mistral
import time
from tqdm import tqdm

def run_mistral_embedding(df: pd.DataFrame, api_key, model: str = 'mistral-embed', batch_size: int = 10,
                          sleep_seconds: float = 2.0) -> None:
    """Embed ``df['description']`` with the Mistral API and store the vectors on ``df``.

    The frame is modified in place: a ``'mistral_embedding'`` column is (re)created and
    filled with the string form of each returned vector (``str(list_of_floats)``).
    Rows whose description is not a string (e.g. ``None``/``NaN``) are not sent to the
    API and keep ``None`` in the new column.

    Args:
        df: Frame with a ``'description'`` column. Any index is accepted; rows are
            addressed by position, not by index label.
        api_key: Mistral API key.
        model: Embedding model name passed to the API.
        batch_size: Number of rows per request; must be >= 1.
        sleep_seconds: Pause after each request (the API request rate is limited).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    mistral_client = Mistral(api_key=api_key)

    # Initialize new column
    df['mistral_embedding'] = None
    embedding_col = df.columns.get_loc('mistral_embedding')

    # Snapshot the texts once; positions in this list are row positions in df.
    descriptions = df['description'].tolist()

    # Process in batches
    for start_idx in tqdm(range(0, len(descriptions), batch_size)):
        batch = descriptions[start_idx:start_idx + batch_size]

        # Only real strings can be embedded; remember where each one lives in df.
        positions = [
            start_idx + offset
            for offset, text in enumerate(batch)
            if isinstance(text, str)
        ]
        if not positions:
            continue  # nothing to send for this batch, so no request and no pause

        embeddings_batch_response = mistral_client.embeddings.create(
            model=model,
            inputs=[descriptions[pos] for pos in positions],
        )

        # Write result embeddings to df by position, so a non-default index
        # (filtered, shuffled or duplicated labels) cannot misalign rows.
        # Assumes the response entries come back in input order — TODO confirm.
        for pos, embedding in zip(positions, embeddings_batch_response.data):
            df.iat[pos, embedding_col] = str(embedding.embedding)

        # request rate is limited, so pause between requests
        time.sleep(sleep_seconds)

In [None]:
# Embed every description in news_data_df. The function adds a
# 'mistral_embedding' column to the frame in place and returns nothing.
api_key = credentials['api_key']['mistral']
run_mistral_embedding(news_data_df, api_key=api_key, model='mistral-embed')

### Analysis

In [3]:
import os
import pandas as pd

data_path = '../data/'

# Switch the working directory to the data folder so later relative paths resolve there.
if not os.path.exists(data_path):
    print('Data path does not exist:', data_path)
else:
    os.chdir(data_path)
    print('Changed directory to data path:', os.getcwd())

Changed directory to data path: /Users/Mike_Home/Desktop/NLP_on_Crypto/data


In [None]:
# Load precomputed embeddings from a parquet file.
# Per the original author's note, column 'mistral_embedding' is of type np.array(float64).
# NOTE(review): run_mistral_embedding stores stringified lists, so this file was
# presumably produced by a different code path — confirm.
mistral_embed_df = pd.read_parquet('embeddings/mistral_embeddings_fast.parquet')
mistral_embed_df

Unnamed: 0,id,mistral_embedding
0,95902.0,"[0.01024627685546875, 0.04156494140625, 0.0374..."
1,1418593.0,"[-0.0192718505859375, 0.0411376953125, 0.01513..."
2,45908936.0,"[-0.0278472900390625, 0.022613525390625, 0.005..."
3,11895342.0,"[0.002902984619140625, -0.002254486083984375, ..."
4,20908581.0,"[-0.0149383544921875, 0.01438140869140625, 0.0..."
...,...,...
23529,13574707.0,"[-0.0312347412109375, 0.01654052734375, 0.0451..."
23530,43212285.0,"[-0.00792694091796875, 0.017547607421875, 0.07..."
23531,36337278.0,"[-0.026611328125, 0.02947998046875, 0.01325225..."
23532,12504007.0,"[-0.00945281982421875, -0.0015096664428710938,..."


In [47]:
import pandas as pd
from clickhouse_driver import Client
import gc
import json

# Re-read the credentials file and unpack the ClickHouse connection settings.
with open('credentials.json', 'r') as file:
    credentials = json.load(file)

db_server_name, db_username, db_password = (
    credentials['clickhouse'][field]
    for field in ('server_name', 'username', 'password')
)

# Embedding values that were calculated earlier and saved to disk.
mistral_embed_df = pd.read_parquet('embeddings/mistral_embeddings_fast.parquet')

# Pull the news rows for the four tickers from ClickHouse.
client = Client(host=db_server_name, user=db_username, password=db_password)

query = """
SELECT  distinct(id) as id,
        description,
        ticker,
        tickers_all,
        title,
        parseDateTimeBestEffort(publishedDate) AS date 
FROM tiingo.news
WHERE ticker in ('btc', 'eth', 'doge', 'sol')
AND LENGTH(description) <= 8192
"""
news_data_df = client.query_dataframe(query)

# Left join on 'id': every news row is kept, with its embedding attached when one exists.
news_mistral_embed_df = news_data_df.merge(mistral_embed_df, on='id', how='left')

# The two inputs are not needed once merged; drop the names and collect to release memory.
del mistral_embed_df, news_data_df
gc.collect()

news_mistral_embed_df

Unnamed: 0,id,description,ticker,tickers_all,title,date,mistral_embedding
0,24359014.0,The conviction of one prominent Bitcoin invest...,btc,btc/pypl/pyplv,Billionaire Thinks Bitcoin Can Reach a Price o...,2020-04-07 23:20:51,"[-0.007564544677734375, 0.03179931640625, 0.01..."
1,24353293.0,It seems that ETH jump was started by Bitcoin....,btc,btc/mfe,Ethereum Price Jumps 14% to Move above $175,2020-04-07 19:23:25,"[-0.005401611328125, 0.03314208984375, 0.0625,..."
2,24353297.0,"Bitcoin (BTC) price has risen above $7,350. A ...",btc,btc/dow/dow/dow/tall,"Bitcoin (BTC) Price Is above $7,350 as Traditi...",2020-04-07 19:16:51,"[-0.0021343231201171875, 0.0219879150390625, 0..."
3,24354027.0,As every crypto coin has beguns to gain moment...,btc,btc,XRP Price Hits $0.20 as Over 200M Coins Transf...,2020-04-07 20:36:37,"[-0.0233154296875, -0.0017375946044921875, 0.0..."
4,24361054.0,"Bitcoin price may increase significantly, $1 m...",btc,btc,"Bitcoin Price May Reach $1M, Believes Billiona...",2020-04-08 00:31:24,"[-0.02880859375, 0.037078857421875, 0.03054809..."
...,...,...,...,...,...,...,...
23878,43327330.0,"On Easter Sunday, the majority of cryptocurren...",btc,btc/pifi,Bitcoin Clings To $40K On Easter Sunday As Cry...,2022-04-17 23:04:22,"[-0.0041046142578125, 0.01114654541015625, 0.0..."
23879,43334340.0,"Bitcoin is still struggling below $41,500 agai...",btc,btc/pifi,"TA: Bitcoin Remains at Risk, Why 100 SMA Is Th...",2022-04-18 11:38:16,"[-0.0309295654296875, 0.0196380615234375, 0.01..."
23880,23041155.0,U.S. Securities and Exchange Commission (SEC) ...,btc,btc,”Decentralized Networks a Powerful Phenomenon”...,2020-02-09 19:29:39,"[-0.01557159423828125, 0.00727081298828125, 0...."
23881,15129425.0,SOILcoin (CURRENCY:SOIL) traded flat against t...,btc,bch/btc/btg/ltc/nano/soil,SOILcoin Price Hits $0.0016 on Top Exchanges (...,2019-02-03 09:52:42,"[-0.0163421630859375, -0.0022487640380859375, ..."


In [48]:
import duckdb
# Narrow the merged frame to the rows used for the BTC analysis:
#   1. ticker is 'btc'
#   2. dated on or after 2016-01-01
#   3. description is not the empty string
# The FROM clause names the pandas DataFrame `news_mistral_embed_df` directly.
btc_news_sql = """
    SELECT *
    FROM news_mistral_embed_df
    WHERE ticker = 'btc'
    AND date >= '2016-01-01'
    AND description != ''
    """
filter_df = duckdb.query(btc_news_sql).fetchdf()

filter_df

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,id,description,ticker,tickers_all,title,date,mistral_embedding
0,24359014.0,The conviction of one prominent Bitcoin invest...,btc,btc/pypl/pyplv,Billionaire Thinks Bitcoin Can Reach a Price o...,2020-04-07 23:20:51,"[-0.007564544677734375, 0.03179931640625, 0.01..."
1,24353293.0,It seems that ETH jump was started by Bitcoin....,btc,btc/mfe,Ethereum Price Jumps 14% to Move above $175,2020-04-07 19:23:25,"[-0.005401611328125, 0.03314208984375, 0.0625,..."
2,24353297.0,"Bitcoin (BTC) price has risen above $7,350. A ...",btc,btc/dow/dow/dow/tall,"Bitcoin (BTC) Price Is above $7,350 as Traditi...",2020-04-07 19:16:51,"[-0.0021343231201171875, 0.0219879150390625, 0..."
3,24354027.0,As every crypto coin has beguns to gain moment...,btc,btc,XRP Price Hits $0.20 as Over 200M Coins Transf...,2020-04-07 20:36:37,"[-0.0233154296875, -0.0017375946044921875, 0.0..."
4,24361054.0,"Bitcoin price may increase significantly, $1 m...",btc,btc,"Bitcoin Price May Reach $1M, Believes Billiona...",2020-04-08 00:31:24,"[-0.02880859375, 0.037078857421875, 0.03054809..."
...,...,...,...,...,...,...,...
17275,43324891.0,"Mexico's third-richest billionaire says ""save ...",btc,btc/pifi,"‘Save Your Skin’ From Inflation With BTC, The ...",2022-04-17 20:00:29,"[-0.0254974365234375, 0.0302886962890625, 0.03..."
17276,43327330.0,"On Easter Sunday, the majority of cryptocurren...",btc,btc/pifi,Bitcoin Clings To $40K On Easter Sunday As Cry...,2022-04-17 23:04:22,"[-0.0041046142578125, 0.01114654541015625, 0.0..."
17277,43334340.0,"Bitcoin is still struggling below $41,500 agai...",btc,btc/pifi,"TA: Bitcoin Remains at Risk, Why 100 SMA Is Th...",2022-04-18 11:38:16,"[-0.0309295654296875, 0.0196380615234375, 0.01..."
17278,23041155.0,U.S. Securities and Exchange Commission (SEC) ...,btc,btc,”Decentralized Networks a Powerful Phenomenon”...,2020-02-09 19:29:39,"[-0.01557159423828125, 0.00727081298828125, 0...."


In [54]:
# Load the daily BTC candles and tidy them in one chain:
# move the index into a column, rename 'Open time' to 'date', and sort chronologically.
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
btc_daily = (
    pd.read_pickle('btc_daily.pkl')
    .reset_index()
    .rename(columns={'Open time': 'date'})
    .sort_values(by='date')
)

# Day-over-day percentage change of the closing price (first row has no predecessor).
btc_daily['return'] = btc_daily['Close'].pct_change()
btc_daily

Unnamed: 0,date,Open,High,Low,Close,Volume,Close time,Quote volume,Trades,Taker buy base,Taker buy quote,Ignore,return
0,2017-08-17,4261.48,4485.39,4200.74,4285.08,795.150377,2017-08-17 23:59:59.999,3.454770e+06,3427,616.24854100,2678216.40060401,0,
1,2017-08-18,4285.08,4371.52,3938.77,4108.37,1199.888264,2017-08-18 23:59:59.999,5.086958e+06,5233,972.86871000,4129123.31651808,0,-0.041238
2,2017-08-19,4108.37,4184.69,3850.00,4139.98,381.309763,2017-08-19 23:59:59.999,1.549484e+06,2153,274.33604200,1118001.87008735,0,0.007694
3,2017-08-20,4120.98,4211.08,4032.62,4086.29,467.083022,2017-08-20 23:59:59.999,1.930364e+06,2321,376.79594700,1557401.33373730,0,-0.012969
4,2017-08-21,4069.13,4119.62,3911.79,4016.00,691.743060,2017-08-21 23:59:59.999,2.797232e+06,3972,557.35610700,2255662.55315837,0,-0.017201
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2753,2025-03-01,84349.95,86558.00,83824.78,86064.53,25785.054640,2025-03-01 23:59:59.999,2.194004e+09,3700728,11748.10769000,1000093822.53835250,0,0.020327
2754,2025-03-02,86064.54,95000.00,85050.60,94270.00,54889.090450,2025-03-02 23:59:59.999,4.972550e+09,7403218,29273.81031000,2652973441.15453900,0,0.095341
2755,2025-03-03,94269.99,94416.46,85117.11,86220.61,59171.102180,2025-03-03 23:59:59.999,5.321124e+09,9797860,27915.43650000,2513464571.35691100,0,-0.085387
2756,2025-03-04,86221.16,88967.52,81500.00,87281.98,55609.107060,2025-03-04 23:59:59.999,4.704901e+09,10326631,27584.71590000,2335087296.23928690,0,0.012310


In [55]:
# normalise date before merging
filter_df['date'] = filter_df['date'].dt.normalize()
btc_daily['date'] = btc_daily['date'].dt.normalize()

mistal_embedding_btc_df = pd.merge(
    filter_df,
    btc_daily,
    on='date',
    how='inner'
)

mistal_embedding_btc_df

Unnamed: 0,id,description,ticker,tickers_all,title,date,mistral_embedding,Open,High,Low,Close,Volume,Close time,Quote volume,Trades,Taker buy base,Taker buy quote,Ignore,return
0,24359014.0,The conviction of one prominent Bitcoin invest...,btc,btc/pypl/pyplv,Billionaire Thinks Bitcoin Can Reach a Price o...,2020-04-07,"[-0.007564544677734375, 0.03179931640625, 0.01...",7329.90,7459.69,7077.00,7197.32,103585.168918,2020-04-07 23:59:59.999,7.572018e+08,924436,48811.26778900,357008427.72553911,0,-0.018088
1,24353293.0,It seems that ETH jump was started by Bitcoin....,btc,btc/mfe,Ethereum Price Jumps 14% to Move above $175,2020-04-07,"[-0.005401611328125, 0.03314208984375, 0.0625,...",7329.90,7459.69,7077.00,7197.32,103585.168918,2020-04-07 23:59:59.999,7.572018e+08,924436,48811.26778900,357008427.72553911,0,-0.018088
2,24353297.0,"Bitcoin (BTC) price has risen above $7,350. A ...",btc,btc/dow/dow/dow/tall,"Bitcoin (BTC) Price Is above $7,350 as Traditi...",2020-04-07,"[-0.0021343231201171875, 0.0219879150390625, 0...",7329.90,7459.69,7077.00,7197.32,103585.168918,2020-04-07 23:59:59.999,7.572018e+08,924436,48811.26778900,357008427.72553911,0,-0.018088
3,24354027.0,As every crypto coin has beguns to gain moment...,btc,btc,XRP Price Hits $0.20 as Over 200M Coins Transf...,2020-04-07,"[-0.0233154296875, -0.0017375946044921875, 0.0...",7329.90,7459.69,7077.00,7197.32,103585.168918,2020-04-07 23:59:59.999,7.572018e+08,924436,48811.26778900,357008427.72553911,0,-0.018088
4,24361054.0,"Bitcoin price may increase significantly, $1 m...",btc,btc,"Bitcoin Price May Reach $1M, Believes Billiona...",2020-04-08,"[-0.02880859375, 0.037078857421875, 0.03054809...",7197.32,7420.00,7150.00,7361.28,76059.145838,2020-04-08 23:59:59.999,5.551226e+08,750516,39000.55256500,284685809.77746603,0,0.022781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17275,43324891.0,"Mexico's third-richest billionaire says ""save ...",btc,btc/pifi,"‘Save Your Skin’ From Inflation With BTC, The ...",2022-04-17,"[-0.0254974365234375, 0.0302886962890625, 0.03...",40378.70,40595.67,39546.17,39678.12,19988.492590,2022-04-17 23:59:59.999,8.034142e+08,590241,9578.91533000,385124871.46651260,0,-0.017350
17276,43327330.0,"On Easter Sunday, the majority of cryptocurren...",btc,btc/pifi,Bitcoin Clings To $40K On Easter Sunday As Cry...,2022-04-17,"[-0.0041046142578125, 0.01114654541015625, 0.0...",40378.70,40595.67,39546.17,39678.12,19988.492590,2022-04-17 23:59:59.999,8.034142e+08,590241,9578.91533000,385124871.46651260,0,-0.017350
17277,43334340.0,"Bitcoin is still struggling below $41,500 agai...",btc,btc/pifi,"TA: Bitcoin Remains at Risk, Why 100 SMA Is Th...",2022-04-18,"[-0.0309295654296875, 0.0196380615234375, 0.01...",39678.11,41116.73,38536.51,40801.13,54243.495750,2022-04-18 23:59:59.999,2.153575e+09,1157741,27097.19375000,1076513006.70581750,0,0.028303
17278,23041155.0,U.S. Securities and Exchange Commission (SEC) ...,btc,btc,”Decentralized Networks a Powerful Phenomenon”...,2020-02-09,"[-0.01557159423828125, 0.00727081298828125, 0....",9895.04,10166.00,9880.75,10151.75,43408.475616,2020-02-09 23:59:59.999,4.374417e+08,536848,22408.27732700,225799470.00274661,0,0.025942


In [69]:
import numpy as np  # np is used below but was never imported in this notebook

# Collapse the article-level frame to one row per calendar day.
# - Rows without an embedding are dropped first, because np.stack cannot combine
#   vectors with missing values.
# - Only the two aggregated columns are selected before .apply, so the grouping column
#   is not handed to the lambda (recent pandas emits a deprecation warning for that).
daily_embed_df = (
    mistal_embedding_btc_df
    .dropna(subset=['mistral_embedding'])
    .groupby('date')[['return', 'mistral_embedding']]
    .apply(
        lambda g: pd.Series(
            {
                # one value per day is needed from the group, so pick the last row's return
                'return': g['return'].iloc[-1],

                # element-wise mean of that day's embedding vectors
                'embedding': np.mean(np.stack(g['mistral_embedding'].values), axis=0)
            }
        )
    )
)

  mistal_embedding_btc_df


In [70]:
# Display the per-day frame: one row per date with its return and averaged embedding.
daily_embed_df

Unnamed: 0_level_0,return,embedding
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-12-05,0.010449,"[-0.01399993896484375, 0.03564453125, 0.026916..."
2017-12-06,0.158125,"[-0.0077728271484375, 0.038189697265625, 0.025..."
2017-12-07,0.225014,"[-0.016308148701985676, 0.017751057942708332, ..."
2017-12-08,-0.043316,"[-0.01314544677734375, 0.05572509765625, 0.053..."
2017-12-09,-0.077074,"[-0.003265380859375, 0.032379150390625, 0.0102..."
...,...,...
2022-09-04,0.008491,"[-0.0014928181966145833, 0.027737935384114582,..."
2022-09-05,-0.010173,"[-0.006416667591441761, 0.026627974076704544, ..."
2022-09-06,-0.050828,"[-0.007169519151960101, 0.01981707981654576, 0..."
2022-09-07,0.026728,"[-0.01027508576711019, 0.02695353825887044, 0...."
