# Perform Textual Embedding

## Loading Credentials

In [2]:
import json

# All secrets live in a local JSON file so that none are hardcoded here.
with open('credentials.json', 'r') as file:
    credentials = json.load(file)

# Pull out the three ClickHouse connection settings in one pass.
db_server_name, db_username, db_password = (
    credentials['clickhouse'][field]
    for field in ('server_name', 'username', 'password')
)

## Test Connection

In [51]:
from clickhouse_driver import Client

client = Client(
    host=db_server_name,
    user=db_username,
    password=db_password,
)

# A trivial query is enough to show whether the host and credentials work;
# any failure is reported as a message instead of a traceback.
try:
    result = client.execute('SELECT version()')
    # The reply is a list of row tuples; the version is the only cell.
    print("Connected to ClickHouse server. Version:", result[0][0])
except Exception as e:
    print("Failed to connect to ClickHouse server:", e)

Connected to ClickHouse server. Version: 25.4.1.1926


In [53]:
# Peek at two raw rows to see what a record of the news table looks like.
client.execute('SELECT * FROM tiingo.news LIMIT 2')

[('2018-05-02T12:14:50.841934+00:00',
  'Yamana Gold Inc. (NYSE:AUY) shares are down more than -8.65% this year and recently decreased -0.70% or -$0.02 to settle at $2.85. SM Energy Company (NYSE:SM), on the other hand, is up 6.61% year to date as of 05/01/2018. It currently trades at $23.54 and has returned 3.11% during the past week. Yamana Gold Inc.…',
  10471811.0,
  '2018-05-02T11:06:33+00:00',
  'stocknewsgazette.com',
  'Energy/Materials/Stock',
  'sm',
  'auy/sm',
  'Yamana Gold Inc. (AUY) vs. SM Energy Company (SM): Which is the Better Investment?',
  'stocknewsgazette.com',
  'https://stocknewsgazette.com/2018/05/02/yamana-gold-inc-auy-vs-sm-energy-company-sm-which-is-the-better-investment/'),
 ('2018-05-02T12:15:20.345795+00:00',
  'AGNC Investment Corp. (NASDAQ:AGNC) shares are down more than -6.59% this year and recently decreased -0.32% or -$0.06 to settle at $18.86. B2Gold Corp. (NYSE:BTG), on the other hand, is down -9.03% year to date as of 05/01/2018. It currently tra

In [54]:
# List the column names and column types of the news table.
client.execute('describe tiingo.news')

[('crawlDate', 'Nullable(String)', '', '', '', '', ''),
 ('description', 'Nullable(String)', '', '', '', '', ''),
 ('id', 'Nullable(Float64)', '', '', '', '', ''),
 ('publishedDate', 'Nullable(String)', '', '', '', '', ''),
 ('source', 'Nullable(String)', '', '', '', '', ''),
 ('tags', 'Nullable(String)', '', '', '', '', ''),
 ('ticker', 'Nullable(String)', '', '', '', '', ''),
 ('tickers_all', 'Nullable(String)', '', '', '', '', ''),
 ('title', 'Nullable(String)', '', '', '', '', ''),
 ('tld', 'Nullable(String)', '', '', '', '', ''),
 ('url', 'Nullable(String)', '', '', '', '', '')]

## Filter Tiingo Data

In [None]:
import pandas as pd
from clickhouse_driver import Client

# Fresh client for this section, built from the credentials loaded earlier.
client = Client(
    host=db_server_name,
    user=db_username,
    password=db_password,
)

# TODO: other filters?

# Bitcoin-tagged news only. publishedDate is stored as text, so it is parsed
# into a datetime on the server side and returned as `date`.
# NOTE(review): DISTINCT presumably de-duplicates on the whole selected row,
# not on id alone -- confirm that repeated ids cannot slip through.
query = """
SELECT  distinct(id) as id,
        description,
        tickers_all,
        title,
        parseDateTimeBestEffort(publishedDate) AS date 
FROM tiingo.news
WHERE ticker = 'btc'
"""

# Let the driver hand the rows back as a pandas DataFrame.
news_data_df = client.query_dataframe(query)

In [7]:
# Display the filtered BTC news frame returned by the query above.
news_data_df

Unnamed: 0,id,description,tickers_all,title,date
0,46671121.0,NFL legend and seven-time Super Bowl winner To...,btc/dis/dkng/nfl/pifi/twtr,Touchdown Or Interception? Here's How Much An ...,2022-08-16 21:21:55
1,12380290.0,Piracy could cost the television and movie ind...,btc,How Blockchain Can Fight Piracy,2018-08-10 18:00:00
2,26148838.0,"CryptoAltum, a popular MT5 platform, executes ...",bch/btc/gold/gold/ltc,"CryptoAltum, The CFD Trading Platform With 1:5...",2020-06-23 20:00:41
3,26168605.0,If an individual was to invest $10 a week into...,btc,Average Price of Bitcoin More Than Quadrupled ...,2020-06-24 01:05:58
4,26221910.0,"One ETH proponent noted, during the last 16 da...",bch/btc,Ethereum Network Fees Jump Above Bitcoin Trans...,2020-06-24 06:33:13
...,...,...,...,...,...
17336,19332185.0,Whoever believes that the crypto space is all ...,btc,Crypto Warning: Hackers Are Using Browser Exte...,2019-09-17 18:06:00
17337,20250757.0,"Mike Pompeo, United States Secretary of State,...",btc,BTC Should Be Regulated Like SWIFT And Other E...,2019-10-09 13:43:02
17338,33552285.0,"South Africa recently accepted Mercury FX, a U...",bsv/btc,Ripple Partner Mercury FX Accepted Into ‘First...,2021-04-04 20:30:43
17339,33547176.0,"Bitcoin price is correcting gains from $60,000...",btc,Why Bitcoin Price Could Restart Rally Unless I...,2021-04-04 12:28:50


## Mistral
Embeddings API documentation: https://docs.mistral.ai/capabilities/embeddings/

### Example

In [None]:
from mistralai import Mistral

# Smoke-test the embeddings endpoint with a single sentence.
api_key = credentials['api_key']['mistral']
mistral_client = Mistral(api_key=api_key)

model = "mistral-embed"
text = "Embed this sentence."

embeddings_batch_response = mistral_client.embeddings.create(model=model, inputs=text)

# Only one input was sent, so the vector of interest is the first entry.
first_embedding = embeddings_batch_response.data[0].embedding
print(first_embedding)
print('length of embedding:', len(first_embedding))

[-0.01666259765625, 0.06982421875, 0.031494140625, 0.01284027099609375, 0.020660400390625, 0.0096435546875, 0.025787353515625, 0.0018548965454101562, -0.00867462158203125, -0.0087890625, -0.039703369140625, 0.058441162109375, -0.0255584716796875, 0.00775909423828125, -0.02886962890625, 0.0404052734375, 0.05499267578125, 0.0260162353515625, 0.03173828125, 0.023284912109375, -0.05682373046875, -0.0157470703125, -0.061614990234375, 0.01226806640625, -0.046112060546875, -0.0270538330078125, -0.00775909423828125, -0.03790283203125, -0.0401611328125, 0.0010061264038085938, 0.0238494873046875, -0.030120849609375, 0.0303497314453125, -0.002353668212890625, -0.0120391845703125, -0.036285400390625, -0.0330810546875, -0.044952392578125, 0.0133514404296875, 0.00186920166015625, 0.00969696044921875, -0.00034046173095703125, -0.0308074951171875, -0.0230560302734375, -0.024871826171875, -0.0296783447265625, 0.00447845458984375, -0.0279541015625, -0.0176849365234375, -0.0330810546875, 0.00987243652343

In [23]:
from mistralai import Mistral
import time
from tqdm import tqdm

def run_mistral_embedding(df: pd.DataFrame, api_key, model: str = 'mistral-embed', batch_size: int = 10) -> None:
    """Embed each row's ``description`` with Mistral and store it on ``df``.

    The frame is modified in place: a ``mistral_embedding`` column is created
    (or reset to ``None``) and filled with the ``str()`` form of each
    embedding vector returned by the API.

    Rows are addressed by position rather than by index label, so the frame
    may carry any index (filtered, re-sorted, non-unique, ...) without rows
    being misread or new rows being appended.

    Rows whose ``description`` is not a string (for example ``None`` for a
    missing description) are not sent to the API and keep ``None``.

    Args:
        df: Frame with a ``description`` column; mutated in place.
        api_key: Key handed to ``Mistral(api_key=...)``.
        model: Name of the embedding model to request.
        batch_size: Number of texts sent per request; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Initialize new column
    df['mistral_embedding'] = None

    text_col = df.columns.get_loc('description')
    embedding_col = df.columns.get_loc('mistral_embedding')

    # Positions (not labels) of the rows that actually have text to embed.
    embeddable = [
        position
        for position, text in enumerate(df['description'])
        if isinstance(text, str)
    ]
    if not embeddable:
        # Nothing to send, so no client is created and no request is made.
        return

    mistral_client = Mistral(api_key=api_key)

    # Process in batches
    for batch_start in tqdm(range(0, len(embeddable), batch_size)):
        batch_positions = embeddable[batch_start:batch_start + batch_size]
        batch_texts = [df.iat[position, text_col] for position in batch_positions]

        embeddings_batch_response = mistral_client.embeddings.create(
            model=model,
            inputs=batch_texts,
        )

        # Write result embeddings back to the rows they were computed from.
        # NOTE(review): assumes the response lists embeddings in input order -- confirm.
        for position, item in zip(batch_positions, embeddings_batch_response.data):
            df.iat[position, embedding_col] = str(item.embedding)

        # The API request rate is limited; pause between batches.
        time.sleep(2)

In [None]:
# Embed every BTC news description; the results are written onto
# news_data_df itself, so there is no return value to capture.
api_key = credentials['api_key']['mistral']
run_mistral_embedding(
    news_data_df,
    api_key=api_key,
    model='mistral-embed',
)