In [1]:
import os
import pandas as pd
from datetime import datetime

# Path where CSV files are stored on Google Drive or locally
path = "./market_data"
path2 = "./tweets"

#1 Binance history

In [2]:
# Names of the attributes for the dataframe
column_names = ["OpenTime", "OpenPrice", "HighPrice", "LowPrice", "ClosePrice", "Volume", "CloseTime", "QuoteVolume", "TradesN", "TakerVol", "TakerQVol", "Ignore"]  # Replace with your actual column names

# Store all the dataframes from each CSV file
dfs = []

# Loop through all files in the directory
for filename in os.listdir(path):
    if filename.endswith(".csv"):
        df = pd.read_csv(os.path.join(path, filename), names=column_names)

        # Assuming "date" is the name of the column that has unix timestamps
        df['OpenTime'] = pd.to_datetime(df['OpenTime'], unit='ms')  # Convert Unix timestamp to datetime
        df['CloseTime'] = pd.to_datetime(df['CloseTime'], unit='ms')  # Convert Unix timestamp to datetime

        dfs.append(df)

# Concatenate all dataframes
result = pd.concat(dfs)

# Sort the dataframe by date
result = result.sort_values(by=['OpenTime'])

# Reset the index of the final dataframe
result = result.reset_index(drop=True)

In [3]:
print(result)

                  OpenTime  OpenPrice  HighPrice  LowPrice  ClosePrice  \
0      2023-04-01 00:00:00    1821.52    1822.38   1821.51     1822.30   
1      2023-04-01 00:01:00    1822.29    1822.30   1821.66     1821.67   
2      2023-04-01 00:02:00    1821.67    1822.16   1821.50     1821.86   
3      2023-04-01 00:03:00    1821.87    1821.87   1821.32     1821.32   
4      2023-04-01 00:04:00    1821.32    1821.33   1821.32     1821.33   
...                    ...        ...        ...       ...         ...   
115195 2023-06-19 23:55:00    1735.13    1735.36   1735.12     1735.36   
115196 2023-06-19 23:56:00    1735.35    1736.29   1735.35     1736.28   
115197 2023-06-19 23:57:00    1736.28    1736.76   1735.93     1736.76   
115198 2023-06-19 23:58:00    1736.75    1737.99   1736.73     1737.70   
115199 2023-06-19 23:59:00    1737.70    1737.73   1737.04     1737.11   

          Volume               CloseTime    QuoteVolume  TradesN  TakerVol  \
0       174.2847 2023-04-01 00:00

#2 Filtering rise / fall events

In [4]:
def find_rapid_price_changes(df, change_pct, interval_minutes):
    # Create an empty list to store the result rows
    result_rows = []

    # Iterate over the rows of the DataFrame
    for i in range(len(df) - interval_minutes):
        # Select the current row and the rows in the next interval_minutes
        window_df = df.iloc[i : i + interval_minutes]

        # Calculate the maximum percentage change in the interval
        max_change_fall = (window_df['HighPrice'].max() - window_df['LowPrice'].min()) / window_df['HighPrice'].max()
        max_change_rise = (window_df['HighPrice'].max() - window_df['LowPrice'].min()) / window_df['LowPrice'].min()

        # If the maximum percentage change is greater than change_pct, add the current row to the result
        if max_change_fall > change_pct / 100 or max_change_rise > change_pct / 100:
            result_row = df.iloc[i].copy()
            if window_df.iloc[0]['OpenPrice'] < window_df.iloc[-1]['ClosePrice']:
                result_row['Direction'] = 'rise'
            else:
                result_row['Direction'] = 'fall'
            result_rows.append(result_row)

    # Concatenate all result rows into a DataFrame
    result = pd.concat(result_rows, axis=1).transpose()

    return result

In [5]:
# More than X% in Y minutes (1,5% and 5 minutes in the code below)
priceDif = 1.5
timeFrame = 5
rapid_changes_df = find_rapid_price_changes(result, priceDif, timeFrame)

In [6]:
print(rapid_changes_df)

                  OpenTime OpenPrice HighPrice LowPrice ClosePrice  \
4983   2023-04-04 11:03:00   1831.96   1834.03   1830.0    1833.28   
4984   2023-04-04 11:04:00   1833.29   1836.34  1833.28    1835.72   
4985   2023-04-04 11:05:00   1835.72   1837.51  1835.54    1836.62   
4986   2023-04-04 11:06:00   1836.62    1838.0  1836.32    1837.75   
4987   2023-04-04 11:07:00   1837.75   1871.11  1837.43    1866.82   
...                    ...       ...       ...      ...        ...   
111217 2023-06-17 05:37:00   1725.68   1726.57  1725.68    1726.57   
111218 2023-06-17 05:38:00   1726.57   1726.57  1725.69    1725.98   
111219 2023-06-17 05:39:00   1725.98   1727.13  1725.97     1726.9   
111220 2023-06-17 05:40:00    1726.9   1728.84  1726.89    1728.39   
111221 2023-06-17 05:41:00   1728.38    1770.0  1728.38    1757.15   

            Volume               CloseTime      QuoteVolume TradesN  \
4983     1925.8777 2023-04-04 11:03:59.999   3526731.165971    1560   
4984     1832.773

In [7]:
fall_df = rapid_changes_df[rapid_changes_df['Direction'] == 'fall']
print(fall_df.count())

OpenTime       106
OpenPrice      106
HighPrice      106
LowPrice       106
ClosePrice     106
Volume         106
CloseTime      106
QuoteVolume    106
TradesN        106
TakerVol       106
TakerQVol      106
Ignore         106
Direction      106
dtype: int64


In [8]:
rise_df = rapid_changes_df[rapid_changes_df['Direction'] == 'rise']
print(rise_df.count())

OpenTime       66
OpenPrice      66
HighPrice      66
LowPrice       66
ClosePrice     66
Volume         66
CloseTime      66
QuoteVolume    66
TradesN        66
TakerVol       66
TakerQVol      66
Ignore         66
Direction      66
dtype: int64


Finding "Eventful" days and hours to plan the implementation of the model being developed.

In [12]:
# Extract hour and day of week from 'OpenTime' 
rapid_changes_df['hour'] = rapid_changes_df['OpenTime'].dt.hour
rapid_changes_df['day_of_week'] = rapid_changes_df['OpenTime'].dt.dayofweek

# Group by 'day_of_week' and 'hour' and calculate the count of events
weekday_hourly_events = rapid_changes_df.groupby(['day_of_week', 'hour']).size()

# Sort by day_of_week and then by the count of events within each hour
weekday_hourly_events_sorted = weekday_hourly_events.sort_index().groupby(level=0, group_keys=False).apply(lambda x: x.sort_values(ascending=False))

# Check the result
print(weekday_hourly_events_sorted)




day_of_week  hour
0            1        9
             15       4
             16       4
             17       4
1            11       5
2            19      19
             20      15
             8       10
             12      10
             17       9
             0        5
             23       4
             2        1
3            0        8
             1        5
             6        3
             16       3
             8        1
             15       1
4            0        8
             12       5
             19       5
             18       4
             16       3
             15       2
5            5        5
             14       5
             4        3
             7        3
             8        2
             18       2
6            22       3
             21       2
dtype: int64


We need a "no change" class as well, cuz "no change" is the most popular event to happen, and our classificator needs to be familiar with it.

In [13]:
from datetime import timedelta

#result_df - overall dataframe
#change_df - rapid changes dataframe
# N - desirable number of events to choose
# M - hours to exclude around the rapid change
def generate_no_change_events(result_df, change_df, N, M):
   
    # Create a list to store the dates to exclude
    exclude_dates = []

    # For each event in change_df, add all dates in a range of M hours around the event to exclude_dates
    for date in change_df['OpenTime']:
        start_date = date - timedelta(hours=M/2)
        end_date = date + timedelta(hours=M/2)
        date_range = pd.date_range(start=start_date, end=end_date, freq='min')
        exclude_dates.extend(date_range)

    # Exclude all rows from result_df that are in exclude_dates
    no_change_df = result_df.loc[~result_df['OpenTime'].isin(exclude_dates)]

    # Randomly select N rows from no_change_df
    no_change_df = no_change_df.sample(N)

    return no_change_df


In [14]:
# Using the function to generate "no change events"
no_change_df = generate_no_change_events(result, rapid_changes_df, 200, 20) 

print(no_change_df)

                  OpenTime  OpenPrice  HighPrice  LowPrice  ClosePrice  \
47917  2023-05-04 06:37:00    1898.31    1899.10   1898.31     1899.09   
2037   2023-04-02 09:57:00    1821.67    1821.68   1821.63     1821.64   
98991  2023-06-08 17:51:00    1844.01    1844.33   1844.00     1844.32   
105100 2023-06-12 23:40:00    1742.74    1742.99   1742.73     1742.99   
90854  2023-06-03 02:14:00    1901.69    1901.70   1901.69     1901.69   
...                    ...        ...        ...       ...         ...   
45867  2023-05-02 20:27:00    1871.34    1871.69   1869.98     1869.99   
108796 2023-06-15 13:16:00    1642.55    1643.05   1642.55     1642.95   
80175  2023-05-26 16:15:00    1829.41    1830.00   1829.41     1829.99   
57936  2023-05-11 05:36:00    1827.96    1827.96   1827.50     1827.79   
114474 2023-06-19 11:54:00    1725.95    1726.04   1725.74     1725.75   

          Volume               CloseTime    QuoteVolume  TradesN  TakerVol  \
47917   160.3148 2023-05-04 06:37

#3 Twitter preparation part

In [15]:
# Names of the attributes for the dataframe
column_names = ["ID", "Link", "datetime", "Account", "AccountName", "text", "Media", "Engage1", "Engage2", "Engage3", "Engage4"]  # Replace with your actual column names

# Store all the dataframes from each CSV file
dfs = []

# Loop through all files in the directory
for filename in os.listdir(path2):
    if filename.endswith(".csv"):
        df = pd.read_csv(os.path.join(path2, filename), names=column_names)

        # Assuming "date" is the name of the column that has unix timestamps
        df['datetime'] = pd.to_datetime(df['datetime'], format='%b %d, %Y · %I:%M %p %Z')

        dfs.append(df)

# Concatenate all dataframes
tweets_df = pd.concat(dfs)

# Sort the dataframe by date
tweets_df = tweets_df.sort_values(by=['datetime'])

# Reset the index of the final dataframe
tweets_df = tweets_df.reset_index(drop=True)

In [16]:
print(tweets_df)

                        ID                                               Link  \
0      1641954253114580994  https://twitter.com/financialjuice/status/1641...   
1      1641954189826826246  https://twitter.com/financialjuice/status/1641...   
2      1641954241953628161  https://twitter.com/financialjuice/status/1641...   
3      1642129051803348992  https://twitter.com/financialjuice/status/1642...   
4      1642129207781195776  https://twitter.com/financialjuice/status/1642...   
...                    ...                                                ...   
10718  1670869603843440643  https://twitter.com/financialjuice/status/1670...   
10719  1670875129067237384  https://twitter.com/financialjuice/status/1670...   
10720  1670893526664507397  https://twitter.com/financialjuice/status/1670...   
10721  1670904077268426754  https://twitter.com/financialjuice/status/1670...   
10722  1670929350613426177  https://twitter.com/financialjuice/status/1670...   

                       date

#4 Dataset preparation

In [17]:
def generate_combined_df(rise_fall_df, no_change_df, tweets_df):
    # Convert the 'datetime' column of tweets_df to datetime if it isn't already
    tweets_df['datetime'] = pd.to_datetime(tweets_df['datetime'])
    # Convert all tweet texts to strings
    tweets_df['text'] = tweets_df['text'].astype(str)

    # Concatenate rise_fall_df and no_change_df
    combined_df = pd.concat([rise_fall_df, no_change_df])

    # Sort combined_df by 'OpenTime'
    combined_df = combined_df.sort_values('OpenTime')

    # Create a 'tweets' column in combined_df
    combined_df['tweets'] = ''

    # Convert 'OpenTime' to UTC to match the timezone of 'datetime' in tweets_df
    combined_df['OpenTime'] = combined_df['OpenTime'].dt.tz_localize('UTC')

    # For each row in combined_df, find all tweets in tweets_df that occurred within the 8 hours before the event
    for i, row in combined_df.iterrows():
        try:
            start_time = row['OpenTime'] - timedelta(hours=8)
            relevant_tweets = tweets_df[(tweets_df['datetime'] >= start_time) & (tweets_df['datetime'] < row['OpenTime'])]

            # Sort the relevant tweets by 'datetime'
            relevant_tweets = relevant_tweets.sort_values('datetime')

            # Join the tweet texts together with ' // ' as the separator and add them to the 'tweets' column
            #combined_df.loc[i, 'tweets'] = ' // '.join(relevant_tweets['text'])
            combined_df.loc[i, 'tweets'] = '\n'.join(relevant_tweets['text'])
        except TypeError as e:
            print(f'Error at index {i}: {e}')
            print('Value causing error:', relevant_tweets['text'])

    return combined_df

In [18]:
# Use the function
combined_df = generate_combined_df(rapid_changes_df, no_change_df, tweets_df)

print(combined_df)

                        OpenTime OpenPrice HighPrice LowPrice ClosePrice  \
733    2023-04-01 12:13:00+00:00   1824.71   1824.87  1824.71    1824.79   
1101   2023-04-01 18:21:00+00:00   1814.32   1814.43  1813.78    1814.43   
1952   2023-04-02 08:32:00+00:00   1818.74   1818.91  1818.74     1818.9   
2037   2023-04-02 09:57:00+00:00   1821.67   1821.68  1821.63    1821.64   
2429   2023-04-02 16:29:00+00:00   1792.79    1794.1  1792.79    1792.99   
...                          ...       ...       ...      ...        ...   
112779 2023-06-18 07:39:00+00:00   1731.02   1731.03  1730.97    1730.98   
113085 2023-06-18 12:45:00+00:00   1735.71   1736.27  1734.83    1736.23   
113710 2023-06-18 23:10:00+00:00   1723.84   1724.34  1723.22    1724.33   
114400 2023-06-19 10:40:00+00:00   1725.08   1725.08  1724.69    1725.07   
114474 2023-06-19 11:54:00+00:00   1725.95   1726.04  1725.74    1725.75   

          Volume               CloseTime    QuoteVolume TradesN  TakerVol  \
733     20

In [19]:
#let's check random rows to be sure the format's OK
import random

# Set X to the number of random rows you want to view
X = 3

# Generate X random indices
random_indices = random.sample(range(len(combined_df)), X)

# Print the 'tweets' field of the random rows
for i in random_indices:
    print(f'Row {i} \n{combined_df.iloc[i]["OpenTime"]} tweets:\n{combined_df.iloc[i]["tweets"]}\n')

Row 33 
2023-04-09 02:25:00+00:00 tweets:
CHINA LAUNCHED MILITARY DRILLS AFTER TAIWAN PRESIDENT TSAI ING-WEN’S US TRIP - FT. ft.com/content/e25807de-16ed…

Row 177 
2023-05-05 12:29:00+00:00 tweets:
SINGAPORE RETAIL SALES YOY ACTUAL 4.5% (FORECAST -1.1%, PREVIOUS 12.7%) $MACRO
SINGAPORE RETAIL SALES MOM ACTUAL 2.2% (FORECAST -, PREVIOUS 3.9%) $MACRO
MORNING JUICE - EUROPEAN SESSION PREP financialjuice.com/News/7392…
SWISS UNEMPLOYMENT RATE UNADJUSTED ACTUAL 2.0% (FORECAST -, PREVIOUS 2.0%) $MACRO
SWISS UNEMPLOYMENT RATE ADJUSTED ACTUAL 1.9% (FORECAST 1.9%, PREVIOUS 1.9%) $MACRO
GERMAN INDUSTRIAL ORDERS MOM ACTUAL -10.7% (FORECAST -2.3%, PREVIOUS 4.8%) $MACRO
ECB'S VILLEROY: THE CHANGE IN THE RATE INCREASE RYTHM IS AN IMPORTANT SIGNAL.
ECB'S VILLEROY: I FAVOR SMALLER ECB RATE INCREASES.
ECB'S VILLEROY: THERE WILL LIKELY BE SEVERAL MORE HIKES, BUT WE HAVE DONE THE ESSENTIAL.
ECB'S VILLEROY: THE ECONOMIC SITUATION IS RESILIENT IN FRANCE AND IN THE EURO ZONE.
ECB'S VILLEROY: WE WILL BRING 

In [20]:
# Let's filter out empty rows (happens sometime)
# Create a boolean mask for rows where 'tweets' is empty
empty_tweets_mask = combined_df['tweets'] == ''

# Filter combined_df using the mask
empty_tweets_df = combined_df[empty_tweets_mask]

# Print the rows with empty 'tweets'
print(empty_tweets_df)

# Print the number of rows with empty 'tweets'
print(f'Number of rows with empty "tweets": {len(empty_tweets_df)}')

                        OpenTime OpenPrice HighPrice LowPrice ClosePrice  \
1952   2023-04-02 08:32:00+00:00   1818.74   1818.91  1818.74     1818.9   
2037   2023-04-02 09:57:00+00:00   1821.67   1821.68  1821.63    1821.64   
2429   2023-04-02 16:29:00+00:00   1792.79    1794.1  1792.79    1792.99   
10644  2023-04-08 09:24:00+00:00   1869.56   1869.57  1869.55    1869.56   
11885  2023-04-09 06:05:00+00:00   1828.84   1830.32  1828.47    1829.83   
12069  2023-04-09 09:09:00+00:00   1838.01   1838.01  1837.63    1837.64   
12074  2023-04-09 09:14:00+00:00   1838.19    1838.2  1838.19     1838.2   
12239  2023-04-09 11:59:00+00:00   1839.44   1839.44  1839.21    1839.22   
21149  2023-04-15 16:29:00+00:00   2103.01   2103.85   2103.0    2103.26   
22233  2023-04-16 10:33:00+00:00   2086.24   2086.54  2086.24    2086.53   
22368  2023-04-16 12:48:00+00:00   2088.35   2088.36  2088.34    2088.34   
22438  2023-04-16 13:58:00+00:00   2083.65   2087.44  2083.64    2085.29   
22569  2023-

In [21]:
# Negate the mask to get a mask for rows where 'tweets' is not empty
non_empty_tweets_mask = combined_df['tweets'] != ''

# Filter combined_df using the mask
combined_df = combined_df[non_empty_tweets_mask]

# Check the dataframe
print(combined_df)

                        OpenTime OpenPrice HighPrice LowPrice ClosePrice  \
733    2023-04-01 12:13:00+00:00   1824.71   1824.87  1824.71    1824.79   
1101   2023-04-01 18:21:00+00:00   1814.32   1814.43  1813.78    1814.43   
3078   2023-04-03 03:18:00+00:00   1774.72   1775.07  1774.52    1775.07   
3407   2023-04-03 08:47:00+00:00   1809.99   1811.11  1807.29    1809.38   
3677   2023-04-03 13:17:00+00:00   1803.59   1804.29  1803.59    1804.29   
...                          ...       ...       ...      ...        ...   
110525 2023-06-16 18:05:00+00:00   1700.84    1710.0  1700.73    1705.72   
113085 2023-06-18 12:45:00+00:00   1735.71   1736.27  1734.83    1736.23   
113710 2023-06-18 23:10:00+00:00   1723.84   1724.34  1723.22    1724.33   
114400 2023-06-19 10:40:00+00:00   1725.08   1725.08  1724.69    1725.07   
114474 2023-06-19 11:54:00+00:00   1725.95   1726.04  1725.74    1725.75   

           Volume               CloseTime     QuoteVolume TradesN   TakerVol  \
733    

#5 Dataset to OpenAI rules formatting

In [22]:
# Define the function to create 'completion' column based on 'Direction'
def generate_completion(direction):
    if direction == 'rise':
        return "Go long! I expect the eth price to rise!"
    elif direction == 'fall':
        return "Go short! I feel the eth price's gonna fall hard!"
    else:
        return "I don't see anything unusual, no prediction, sir."

In [23]:
# Replace NaN values in 'Direction' with 'no_change'
combined_df['Direction'].fillna('no_change', inplace=True)

# Create the 'prompt' column
combined_df['prompt'] = "Current time is " + combined_df['OpenTime'].dt.strftime('%Y-%m-%d %H:%M:%S') + "\nFinancial tweets for the last 8 hours: \n" + combined_df['tweets'] + "\nWhat is your prediction on ETH price movement? ->"

# Create the 'completion' column
combined_df['completion'] = "\n\n " + combined_df['Direction'].apply(generate_completion) + " ###"

# Create the new dataframe 'dataset' with only 'prompt' and 'completion' columns
dataset = combined_df[['prompt', 'completion', 'Direction']]

# Check the dataframe
print(dataset)

                                                   prompt  \
733     Current time is 2023-04-01 12:13:00\nFinancial...   
1101    Current time is 2023-04-01 18:21:00\nFinancial...   
3078    Current time is 2023-04-03 03:18:00\nFinancial...   
3407    Current time is 2023-04-03 08:47:00\nFinancial...   
3677    Current time is 2023-04-03 13:17:00\nFinancial...   
...                                                   ...   
110525  Current time is 2023-06-16 18:05:00\nFinancial...   
113085  Current time is 2023-06-18 12:45:00\nFinancial...   
113710  Current time is 2023-06-18 23:10:00\nFinancial...   
114400  Current time is 2023-06-19 10:40:00\nFinancial...   
114474  Current time is 2023-06-19 11:54:00\nFinancial...   

                                               completion  Direction  
733     \n\n I don't see anything unusual, no predicti...  no_change  
1101    \n\n I don't see anything unusual, no predicti...  no_change  
3078    \n\n I don't see anything unusual, no predicti

In [24]:
# let's check any row here
print(dataset.iloc[145]["completion"])



 I don't see anything unusual, no prediction, sir. ###


In [None]:
#from sklearn.model_selection import train_test_split

# Split the data into training and test sets, with stratification over the 'Direction' column
#train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset['Direction'])

# Check the lengths of the training and test sets
#len(train_df), len(test_df)
#train_df[['prompt', 'completion']].to_json(path+"training2.jsonl", orient='records', lines=True)

In [58]:
#migrating from Goole colab to local machine, so I will upload my dataframes here
#import pandas as pd

#test_df = pd.read_csv('./test_df.csv')
#train_df = pd.read_csv('./train_df.csv')


In [7]:
import openai

# OpenAI completion function with prompt, model, max_tokens, temperature, and top_p as inputs
def completion(prompt, model):
  openai.api_key = os.getenv("OPENAI_API_KEY")
  output = openai.Completion.create(
        model=model,
        prompt= str(prompt)
    )
  return output.choices[0]['text'].strip(" \n")


In [9]:
%env OPENAI_API_KEY=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

env: OPENAI_API_KEY=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


In [10]:
print(os.getenv("OPENAI_API_KEY"))

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


In [80]:
# service block to count tokens, you can skip it

def count_words(text):
    words = text.split()  # This splits the text into words at spaces
    return len(words)

# Apply function to every row in 'prompt'
test_df['word_count'] = test_df['prompt'].apply(count_words)

# Filter DataFrame where 'Direction' equals 'rise'
filtered_df = test_df[test_df['Direction'] == 'fall']

print(filtered_df[['Direction', 'word_count']])


   Direction  word_count
1       fall         656
10      fall        1207
12      fall         904
15      fall        2119
19      fall        1405
23      fall         656
24      fall        1794
26      fall         791
27      fall         357
29      fall        1418
32      fall        1182
33      fall         357
35      fall        1080
36      fall        1824
53      fall        1081
55      fall        1220
57      fall         904
60      fall         791
63      fall          39


In [81]:
print(test_df.iloc[1]["prompt"])

Current time is 2023-04-21 19:04:00
Financial tweets for the last 8 hours: 
GAZPROM: RUSSIAN GAS PRICE REPORTS FOR KAZAKHSTAN ARE INACCURATE.
MORNING JUICE - US SESSION PREP financialjuice.com/News/7354…
FITCH RATINGS: QT DECREASING US BANKS LIQUIDITY MAY WORSEN THE CREDIT SQUEEZE.
QT REDUCING US BANKING LIQUIDITY, COULD AMPLIFY CREDIT SQUEEZE - FITCH RATINGS fitchratings.com/research/so…
COINBASE CONSIDERS US FUTURE AS CRACKDOWN ROLLS ON - FT. ft.com/content/4ab352b5-5152…
CANADIAN RETAIL SALES MOM ACTUAL -0.2% (FORECAST -0.6%, PREVIOUS 1.4%) $MACRO
CANADIAN CORE RETAIL SALES MOM ACTUAL -0.7% (FORECAST 0%, PREVIOUS 0.9%) $MACRO
CANADIAN RETAIL SALES FEBRUARY 2023 FULL REPORT www150.statcan.gc.ca/n1/dail…
FANNIE MAE'S ESR GROUP: SEE ADDITIONAL, INCREMENTAL TIGHTENING IN CREDIT CONDITIONS TO CONTRIBUTE TO A MODEST RECESSION BEGINNING IN THE H2 OF 2023.
FANNIE MAE'S ESR GROUP: EXPECTS ONLY ONE ADDITIONAL 25 BASIS POINT HIKE FROM THE FED IN MAY.
FANNIE MAE'S ESR GROUP: DUE TO THE CONTINUA

In [89]:
print(completion(test_df.iloc[63]["prompt"], "davinci:ft-myself:gmx-2023-06-22-12-52-07"))

I don't see anything unusual, no prediction, sir. ### I


***
Finally!
Ok, works randomly)) but lots of fun, keep moving, let's build an autimated tool

In [3]:
import os
import pandas as pd
import subprocess
from datetime import datetime, timedelta
import openai

# OpenAI completion function with prompt, model, max_tokens, temperature, and top_p as inputs
def completion(prompt, model):
  openai.api_key = os.getenv("OPENAI_API_KEY")
  output = openai.Completion.create(
        model=model,
        prompt= str(prompt)
    )
  return output.choices[0]['text'].strip(" \n")

def generate_prompt(tweets_df):
    # Get current time
    current_time = datetime.now()

    # Format current time into string
    current_time_str = current_time.strftime('%Y-%m-%d %H:%M:%S')

    # Combine all tweets with ' // ' separator
    combined_tweets = "\n".join(tweets_df['text'].values)

    # Construct the prompt
    prompt = "Current time is " + current_time_str + "\nFinancial tweets for the last 8 hours: \n" + combined_tweets + "\nWhat is your prediction on ETH price movement? ->"

    return prompt

%env OPENAI_API_KEY=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

# Calculate the start and end dates
end_date = datetime.now()
start_date = end_date - timedelta(hours=8)

# Format the dates
start_date_str = start_date.strftime('%Y-%m-%d_%H:%M:%S')
end_date_str = end_date.strftime('%Y-%m-%d_%H:%M:%S')

# Define the Go command
cmd = [
    'go', 'run', 'main.go',
    '-Query', f'(from:financialjuice) since:{start_date_str}_UTC until:{end_date_str}_UTC',
    '-Instance', 'nitter.lacontrevoie.fr',
    '-Format', 'csv'
]

# Run the Go command and save the output to a file
with open('/Users/Vladimir/Desktop/actual_tweets.csv', 'w') as f:
    subprocess.run(cmd, stdout=f, cwd='/Users/Vladimir/DEV/twint-zero')

# Names of the attributes for the dataframe
column_names = ["ID", "Link", "datetime", "Account", "AccountName", "text", "Media", "Engage1", "Engage2", "Engage3", "Engage4"] 

# Load the CSV file into a DataFrame
df = pd.read_csv('/Users/Vladimir/Desktop/actual_tweets.csv', names=column_names)[['text']]


print(completion(generate_prompt(df), "davinci:ft-myself:gmx-2023-06-22-12-52-07"))


env: OPENAI_API_KEY=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
I don't see anything unusual, no prediction, sir. ### Go
