In [49]:
import os

import numpy as np
import pandas as pd

In [67]:
BACKTEST_CCY = 'USDRUB'
BACKTEST_DAYS = 5

PATH_TEXTS = 'data/telegram'
PATH_SPOT_PNL = 'data/moex'
PATH_OPT_PNL = 'data/pnl'

In [51]:
# Get all available data_sources
sources = os.listdir(PATH_TEXTS)
sources

['cbonds.csv',
 'themovchans.csv',
 'headlines_QUANTS.csv',
 'War_Wealth_Wisdom.csv',
 'mmi.csv',
 'vts.csv',
 'signal.csv',
 '.gitignore',
 'rshb_invest.csv',
 'Alfa_Wealth.csv',
 'sky_bond.csv',
 'bitkogan.csv']

In [52]:
# Generate dataframes
data_sources = []

for s in sources:
    if s != '.gitignore':
        source_data = pd.read_csv(f'{PATH_TEXTS}/{s}')
        data_sources.append((s.split('.')[0], source_data))

In [100]:
spot_pnl = pd.read_csv(f'{PATH_SPOT_PNL}/{BACKTEST_CCY}.csv')
spot_pnl.dropna(inplace=True)
spot_pnl['timestamp'] = pd.to_datetime(spot_pnl['timestamp']).dt.strftime('%Y-%m-%d')
spot_pnl.drop_duplicates(subset=['timestamp'], keep='first', inplace=True)
spot_pnl.reset_index(inplace=True, drop=True)
spot_pnl

Unnamed: 0,timestamp,Bid,Ask
0,2022-03-01,94.994073,96.879993
1,2022-03-02,95.803033,95.971040
2,2022-03-03,114.920925,114.999880
3,2022-03-04,105.000043,108.100490
4,2022-03-09,116.958480,117.096750
...,...,...,...
153,2022-10-18,61.484698,61.978255
154,2022-10-19,61.588507,62.014587
155,2022-10-20,61.600813,62.025917
156,2022-10-21,61.144888,61.667988


In [53]:
# Create target variable dataframe
pnl = pd.read_csv(f'{PATH_OPT_PNL}/Backtest_{BACKTEST_CCY}_{BACKTEST_DAYS}_days.txt')
pnl['date_start'] = pd.to_datetime(pnl['date_start']).dt.strftime('%Y-%m-%d')
pnl['pnl_sign'] = pnl['pnl'].apply(lambda x: 1 if x >= 0 else 0)
pnl

Unnamed: 0,date_start,pnl,pnl_sign
0,2021-01-04,291264.097914,1
1,2021-01-05,-411993.830320,0
2,2021-01-06,521491.686795,1
3,2021-01-08,62842.634116,1
4,2021-01-11,-537598.706217,0
...,...,...,...
245,2021-12-20,-407527.554561,0
246,2021-12-21,-176881.417077,0
247,2021-12-22,-206943.414418,0
248,2021-12-23,23073.596468,1


In [54]:
# Get balance of the sample
pnl['pnl_sign'].sum() / pnl.shape[0]

0.536

In [12]:
# df = df[(df['date'] >= '2022-03-01') & (df['date'] <= '2022-10-31')]

In [55]:
data_sources[0][1]

Unnamed: 0,id,date,text
0,9,2017-07-28T10:56:14,Cbonds.ru –∑–∞–ø—É—Å—Ç–∏–ª–æ –∫–∞–Ω–∞–ª –≤ Telegram. –ü–ª–∞–Ω–∏—Ä—É...
1,10,2017-08-03T15:28:17,–û–±–ª–∏–≥–∞—Ü–∏–æ–Ω–Ω—ã–π –±—é–ª–ª–µ—Ç–µ–Ω—å Cbonds ‚Äì –≤—Å–µ –¥–∞–Ω–Ω—ã–µ –æ ...
2,11,2017-08-03T15:41:02,Cbonds prepared monthly report: CBONDS GLOBAL ...
3,12,2017-08-03T18:11:32,–î–∞–π–¥–∂–µ—Å—Ç Cbonds –æ—Ç 3 –∞–≤–≥—É—Å—Ç–∞:¬´–ì—Ä—É–ø–ø–∞ –ö–æ–º–ø–∞–Ω–∏–π...
4,13,2017-08-04T12:42:41,–°–æ–≤–∫–æ–º–±–∞–Ω–∫ –æ—Ç–∫—Ä—ã–ª –∫–Ω–∏–≥—É –∑–∞—è–≤–æ–∫ –Ω–∞ –≤—Ç–æ—Ä–∏—á–Ω–æ–µ —Ä–∞...
...,...,...,...
11997,13124,2022-11-04T12:38:56,‚Äã‚Äã #–†–µ–π—Ç–∏–Ω–≥–∏–°–ù–ì üóÇ –ü—Ä–∏—Å–≤–æ–µ–Ω–∏–µ / –ò–∑–º–µ–Ω–µ–Ω–∏–µ —Ä–µ–π—Ç...
11998,13125,2022-11-04T13:17:23,#CbondsNewsletter üåéüì∞ –í –æ–∫—Ç—è–±—Ä–µ –æ–±—ä–µ–º –Ω–æ–≤—ã—Ö –≤—ã...
11999,13126,2022-11-04T14:30:00,"‚Äã‚Äã üè¢–û–Ω–ª–∞–π–Ω-—Å–µ–º–∏–Ω–∞—Ä ¬´–ö–ª—é—á–µ–≤—ã–µ —Ñ–∞–∫—Ç—ã, —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–µ..."
12000,13128,2022-11-05T11:30:11,#–ê–Ω–∞–ª–∏—Ç–∏–∫–∞ üèÜ Research Hub Cbonds: —Å–∞–º–æ–µ –ø–æ–ø—É...


In [14]:
# df = pnl
# for source_name, source_dataframe in data_sources:
#     df[f'{source_name}'] = None
    
#     i = 0
#     texts = []
#     for _, row in source_dataframe.iterrows():
#         if i >= df.shape[0]:
#             break

#         if pd.to_datetime(row['date']).strftime('%Y-%m-%d') <= df['date_start'][i]:
#             texts.append(row['text'])
#         else:
#             df.at[i, source_name] = texts
#             texts = []
#             i += 1
#             continue

In [56]:
df = pnl
for source_name, source_dataframe in data_sources:
    df[f'{source_name}'] = ''
    
    i = 0
    texts = ''
    for _, row in source_dataframe.iterrows():
        if i >= df.shape[0]:
            break

        if pd.to_datetime(row['date']).strftime('%Y-%m-%d') <= df['date_start'][i]:
            texts += ' ' + row['text']
        else:
            df.at[i, source_name] = texts
            texts = ''
            i += 1
            continue

In [73]:
df = df.iloc[1:]
df

Unnamed: 0,date_start,pnl,pnl_sign,cbonds,themovchans,mmi,signal,rshb_invest,Alfa_Wealth,sky_bond,bitkogan
2,2021-01-06,521491.686795,1,"‚ö°Ô∏è –†–æ—Å—Å–∏–π—Å–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏: –æ—Å–Ω–æ–≤–Ω—ã–µ —Å–æ–±—ã—Ç–∏—è, 6 —è...",,–í–°–ï–ú–ò–†–ù–´–ô –ë–ê–ù–ö: \n‚Ä¢ –†–ò–°–ö–ò –ù–ï–ì–ê–¢–ò–í–ù–û–ì–û –†–ê–ó–í–ò–¢...,"$=73,72—Ä. RTS +0,68% BRENT +0,71%üìå LIVE –ö–∞—Ä—Ç...",,,,–ù–µ—Ñ—Ç—å –º–∞—Ä–∫–∏ Brent. ¬´–ö–æ–º–º—É–Ω–∏—Å—Ç—ã –†–æ—Å—Å–∏–∏¬ª –∂–≥—É—Ç. ...
3,2021-01-08,62842.634116,1,ü§î –ö–ª—é—á–µ–≤–∞—è —Å—Ç–∞–≤–∫–∞ –¶–ë –†–§ –æ—Å—Ç–∞–Ω–µ—Ç—Å—è –Ω–∞ —Ç–µ–∫—É—â–µ–º...,,üü¢ - Risk-on –Ω–∞ —Ä—ã–Ω–∫–∞—Ö –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç—Å—è. –ë–µ—Å–ø–æ—Ä—è–¥–∫...,–£–∂–µ –¥–∞–∂–µ –ë—ç—Ç–º–µ–Ω –æ–∫–æ–ª–æ –ö–∞–ø–∏—Ç–æ–ª–∏—è #–ú–∞–∫—Ä–æ #–í—ã–±...,,,,–î—Ä—É–∑—å—è!\n–ù–∞—à —Ö—Ä—É–ø–∫–∏–π –º–∏—Ä –¥–µ—Ä–∂–∏—Ç—Å—è –Ω–∞ —Ü–µ–ª–æ–π —Å–∏...
4,2021-01-11,-537598.706217,0,"‚ö°Ô∏è –†–æ—Å—Å–∏–π—Å–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏: –æ—Å–Ω–æ–≤–Ω—ã–µ —Å–æ–±—ã—Ç–∏—è, 11 ...",,–ì–õ–û–ë–ê–õ–¨–ù–´–ô –†–û–°–¢ –°–¢–ê–í–û–ö –í –ù–ê–ß–ê–õ–ê –ì–û–î–ê –ü–û–ö–ê –ù–ï ...,#–í–∏—Ä—É—Å #–ú–∞–∫—Ä–æ \n‚ö°Ô∏è –ö–û–õ–ò–ß–ï–°–¢–í–û –ó–ê–†–ï–ì–ò–°–¢–†–ò–†–û...,üá∫üá∏ –í –º–æ–º–µ–Ω—Ç–µ –¥–æ—Ö–æ–¥–Ω–æ—Å—Ç—å –ø–æ –¥–µ—Å—è—Ç–∏–ª–µ—Ç–Ω–µ–π —Ç—Ä–µ–∂–µ...,,,–í–æ –≤—Ä–µ–º—è –ø—Ä–æ—Ç–µ—Å—Ç–æ–≤ –∑–∞–¥–µ—Ä–∂–∞–Ω–∞ –∂–µ–Ω—â–∏–Ω–∞ –º–æ–ª–¥–∞–≤...
5,2021-01-12,-359059.057843,0,üèõ–ù–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –Ω–µ –±—É–¥—É—Ç –ø—Ä–æ–¥...,,–ú–ê–õ–ê–ô–ó–ò–Ø –í–í–û–î–ò–¢ –ü–û–õ–£–ì–û–î–ò–ß–ù–û–ï –ß–†–ï–ó–í–´–ß–ê–ô–ù–û–ï –ü–û–õ...,"$=74,14—Ä. RTS +0,90% BRENT +1,26%üìå LIVE –ö–∞—Ä—Ç...",–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò –°–®–ê –æ–±–≤–∏–Ω–∏–ª–∏ –†–æ—Å—Å–∏—é –≤ –ø–æ–ø—ã—Ç–∫...,,,"–ö—Å—Ç–∞—Ç–∏, –ø—Ä–æ —Ä—É–±–ª—å. \n \n –í–æ–ø—Ä–µ–∫–∏ –≤—Å–µ–º –æ–±–µ—â–∞–Ω–∏..."
6,2021-01-13,181132.689784,1,üÜô –û–±–∑–æ—Ä —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ —Ä—ã–Ω–∫–∞ –Ω–∞ 13 —è–Ω–≤–∞—Ä—è: —Ä—É–±–ª...,,üü¢ - —Ä–æ—Å—Ç —Ü–µ–Ω –Ω–∞ –Ω–µ—Ñ—Ç—å —É—Å–∏–ª–∏–≤–∞–µ—Ç—Å—è –Ω–∞ –Ω–æ–≤–æ—Å—Ç—è—Ö...,#WTI #–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ \nüí• –ò–∑–º–µ–Ω–µ–Ω–∏–µ –∑–∞–ø–∞—Å–æ–≤ —Å—ã—Ä–æ...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò –†–æ—Å—Å—Ç–∞—Ç –ø–æ–¥—Ç–≤–µ—Ä–¥–∏–ª –æ—Ü–µ–Ω–∫—É –∏–Ω...,,,–ü—Ä–æ–¥–æ–ª–∂–∞—è —Ç–µ–º—É —Å–≤–æ—Ä–∞—á–∏–≤–∞–Ω–∏—è –ø—Ä–æ–≥—Ä–∞–º–º—ã –ø–æ–∫—É–ø–∫–∏...
...,...,...,...,...,...,...,...,...,...,...,...
245,2021-12-20,-407527.554561,0,#–ò–Ω—Ç–µ—Ä–µ—Å–Ω–æ–µ üå± üìàESG: –ë–∞–Ω–∫–∏-–¥–µ–±—é—Ç–∞–Ω—Ç—ã –ï–° –∏ —Ä–æ—Å...,,–≠–ö–û–ù–û–ú–ò–ö–ê ‚Äì –ü–û–°–¢–ö–†–ò–ó–ò–°–ù–´–ï –¢–†–ï–ù–î–´ –†–æ—Å—Å—Ç–∞—Ç –Ω–∞ ...,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nüôã‚Äç‚ôÇÔ∏è –°–∞–º–æ–ª–µ—Ç ( ...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò \n¬†\nüìà –ö–ª—é—á–µ–≤–∞—è —Å—Ç–∞–≤–∫–∞ —É–≤–µ–ª–∏...,,,4Ô∏è‚É£ –£–∫—Ä–µ–ø–ª–µ–Ω–∏–µ –ø–æ–ª–æ–∂–µ–Ω–∏—è –Ω–∞ —Ä—ã–Ω–∫–µ . –ù–µ–∫–æ—Ç–æ—Ä—ã...
246,2021-12-21,-176881.417077,0,‚Äã‚Äã üìù‚ùì –£–∂–µ —Å–µ–≥–æ–¥–Ω—è –≤ 16:30 (–º—Å–∫): –æ–Ω–ª–∞–π–Ω-—Å–µ–º–∏–Ω...,,–£–†–û–í–ï–ù–¨ –ó–ê–ë–û–õ–ï–í–ê–ï–ú–û–°–¢–ò –í –ú–ò–†–ï –í–ù–û–í–¨ –†–ê–°–¢–ï–¢ –ó–∞...,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nü§∑‚Äç‚ôÇÔ∏è22 –¥–µ–∫–∞–±—Ä—è\...,"–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìä ""–†–µ–Ω–µ—Å—Å–∞–Ω—Å —Å—Ç—Ä–∞—Ö–æ–≤–∞–Ω–∏–µ"" —Ä–∞...",,,"Micron –ø–æ–º–æ–≥–∞–µ—Ç –æ–±—ã–≥—Ä—ã–≤–∞—Ç—å —Ä—ã–Ω–∫–∏ –í—á–µ—Ä–∞, –∫–æ–≥–¥–∞..."
247,2021-12-22,-206943.414418,0,#–ù–æ–≤–æ—Å—Ç–∏–ö–æ–º–ø–∞–Ω–∏–π ‚ö°Ô∏è–†–æ—Å—Å–∏–π—Å–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏: –æ—Å–Ω–æ...,,–†–û–°–¢ –ü–ê–ù–î–ï–ú–ò–ò –í –°–®–ê –£–°–ò–õ–ò–í–ê–ï–¢–°–Ø –ó–∞ –ø–æ—Å–ª–µ–¥–Ω–∏–µ ...,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nüí∞ –î–µ—Ç—Å–∫–∏–π –º–∏—Ä (...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìä Reuters: –≤ 2022 –≥–æ–¥—É ¬´–ê—Ç–æ–Ω...,,,China Mobile –≤ –®–∞–Ω—Ö–∞–µ ‚Äì –≤—Ç–æ—Ä–æ–π –ø–æ—Å–ª–µ Rivian –∫...
248,2021-12-23,23073.596468,1,‚Äã‚Äã üìùüí° –ú–∞–∫—Ä–æ—ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–∏–µ –∏–Ω–¥–∏–∫–∞—Ç–æ—Ä—ã –ø–æ –≤—Å–µ–º —Å...,,"‚Ä¢ –†—ã–Ω–æ–∫ –û–§–ó –≤–µ–¥—ë—Ç —Å–µ–±—è —Ç–∞–∫, –∫–∞–∫ –±—É–¥—Ç–æ —É–≤–µ—Ä–µ...",üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\n‚ùóÔ∏è –ï–∂–µ–≥–æ–¥–Ω–∞—è –ø—Ä...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìä –ò–Ω—Ñ–ª—è—Ü–∏—è –≤ –†–æ—Å—Å–∏–∏ —Å 14 –ø–æ ...,,,–£–∂–µ –Ω–µ —Ç–∞–∫–æ–π —Å—Ç—Ä–∞—à–Ω—ã–π –û–º–∏–∫—Ä–æ–Ω –Ω–∞–ø—É–≥–∞–ª –°–∏–∞–Ω—å ....


In [74]:
df = df[~df.isin([''])].dropna(axis=1, how='all')

In [85]:
# df_final = df.drop(['cbonds', 'signal', 'rshb_invest', 'Alfa_Wealth', 'bitkogan'], axis=1)
df_final = df
df_final

Unnamed: 0,date_start,pnl,pnl_sign,cbonds,mmi,signal,rshb_invest,Alfa_Wealth,bitkogan
2,2021-01-06,521491.686795,1,"‚ö°Ô∏è –†–æ—Å—Å–∏–π—Å–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏: –æ—Å–Ω–æ–≤–Ω—ã–µ —Å–æ–±—ã—Ç–∏—è, 6 —è...",–í–°–ï–ú–ò–†–ù–´–ô –ë–ê–ù–ö: \n‚Ä¢ –†–ò–°–ö–ò –ù–ï–ì–ê–¢–ò–í–ù–û–ì–û –†–ê–ó–í–ò–¢...,"$=73,72—Ä. RTS +0,68% BRENT +0,71%üìå LIVE –ö–∞—Ä—Ç...",,,–ù–µ—Ñ—Ç—å –º–∞—Ä–∫–∏ Brent. ¬´–ö–æ–º–º—É–Ω–∏—Å—Ç—ã –†–æ—Å—Å–∏–∏¬ª –∂–≥—É—Ç. ...
3,2021-01-08,62842.634116,1,ü§î –ö–ª—é—á–µ–≤–∞—è —Å—Ç–∞–≤–∫–∞ –¶–ë –†–§ –æ—Å—Ç–∞–Ω–µ—Ç—Å—è –Ω–∞ —Ç–µ–∫—É—â–µ–º...,üü¢ - Risk-on –Ω–∞ —Ä—ã–Ω–∫–∞—Ö –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç—Å—è. –ë–µ—Å–ø–æ—Ä—è–¥–∫...,–£–∂–µ –¥–∞–∂–µ –ë—ç—Ç–º–µ–Ω –æ–∫–æ–ª–æ –ö–∞–ø–∏—Ç–æ–ª–∏—è #–ú–∞–∫—Ä–æ #–í—ã–±...,,,–î—Ä—É–∑—å—è!\n–ù–∞—à —Ö—Ä—É–ø–∫–∏–π –º–∏—Ä –¥–µ—Ä–∂–∏—Ç—Å—è –Ω–∞ —Ü–µ–ª–æ–π —Å–∏...
4,2021-01-11,-537598.706217,0,"‚ö°Ô∏è –†–æ—Å—Å–∏–π—Å–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏: –æ—Å–Ω–æ–≤–Ω—ã–µ —Å–æ–±—ã—Ç–∏—è, 11 ...",–ì–õ–û–ë–ê–õ–¨–ù–´–ô –†–û–°–¢ –°–¢–ê–í–û–ö –í –ù–ê–ß–ê–õ–ê –ì–û–î–ê –ü–û–ö–ê –ù–ï ...,#–í–∏—Ä—É—Å #–ú–∞–∫—Ä–æ \n‚ö°Ô∏è –ö–û–õ–ò–ß–ï–°–¢–í–û –ó–ê–†–ï–ì–ò–°–¢–†–ò–†–û...,üá∫üá∏ –í –º–æ–º–µ–Ω—Ç–µ –¥–æ—Ö–æ–¥–Ω–æ—Å—Ç—å –ø–æ –¥–µ—Å—è—Ç–∏–ª–µ—Ç–Ω–µ–π —Ç—Ä–µ–∂–µ...,,–í–æ –≤—Ä–µ–º—è –ø—Ä–æ—Ç–µ—Å—Ç–æ–≤ –∑–∞–¥–µ—Ä–∂–∞–Ω–∞ –∂–µ–Ω—â–∏–Ω–∞ –º–æ–ª–¥–∞–≤...
5,2021-01-12,-359059.057843,0,üèõ–ù–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –Ω–µ –±—É–¥—É—Ç –ø—Ä–æ–¥...,–ú–ê–õ–ê–ô–ó–ò–Ø –í–í–û–î–ò–¢ –ü–û–õ–£–ì–û–î–ò–ß–ù–û–ï –ß–†–ï–ó–í–´–ß–ê–ô–ù–û–ï –ü–û–õ...,"$=74,14—Ä. RTS +0,90% BRENT +1,26%üìå LIVE –ö–∞—Ä—Ç...",–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò –°–®–ê –æ–±–≤–∏–Ω–∏–ª–∏ –†–æ—Å—Å–∏—é –≤ –ø–æ–ø—ã—Ç–∫...,,"–ö—Å—Ç–∞—Ç–∏, –ø—Ä–æ —Ä—É–±–ª—å. \n \n –í–æ–ø—Ä–µ–∫–∏ –≤—Å–µ–º –æ–±–µ—â–∞–Ω–∏..."
6,2021-01-13,181132.689784,1,üÜô –û–±–∑–æ—Ä —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ —Ä—ã–Ω–∫–∞ –Ω–∞ 13 —è–Ω–≤–∞—Ä—è: —Ä—É–±–ª...,üü¢ - —Ä–æ—Å—Ç —Ü–µ–Ω –Ω–∞ –Ω–µ—Ñ—Ç—å —É—Å–∏–ª–∏–≤–∞–µ—Ç—Å—è –Ω–∞ –Ω–æ–≤–æ—Å—Ç—è—Ö...,#WTI #–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ \nüí• –ò–∑–º–µ–Ω–µ–Ω–∏–µ –∑–∞–ø–∞—Å–æ–≤ —Å—ã—Ä–æ...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò –†–æ—Å—Å—Ç–∞—Ç –ø–æ–¥—Ç–≤–µ—Ä–¥–∏–ª –æ—Ü–µ–Ω–∫—É –∏–Ω...,,–ü—Ä–æ–¥–æ–ª–∂–∞—è —Ç–µ–º—É —Å–≤–æ—Ä–∞—á–∏–≤–∞–Ω–∏—è –ø—Ä–æ–≥—Ä–∞–º–º—ã –ø–æ–∫—É–ø–∫–∏...
...,...,...,...,...,...,...,...,...,...
245,2021-12-20,-407527.554561,0,#–ò–Ω—Ç–µ—Ä–µ—Å–Ω–æ–µ üå± üìàESG: –ë–∞–Ω–∫–∏-–¥–µ–±—é—Ç–∞–Ω—Ç—ã –ï–° –∏ —Ä–æ—Å...,–≠–ö–û–ù–û–ú–ò–ö–ê ‚Äì –ü–û–°–¢–ö–†–ò–ó–ò–°–ù–´–ï –¢–†–ï–ù–î–´ –†–æ—Å—Å—Ç–∞—Ç –Ω–∞ ...,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nüôã‚Äç‚ôÇÔ∏è –°–∞–º–æ–ª–µ—Ç ( ...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò \n¬†\nüìà –ö–ª—é—á–µ–≤–∞—è —Å—Ç–∞–≤–∫–∞ —É–≤–µ–ª–∏...,,4Ô∏è‚É£ –£–∫—Ä–µ–ø–ª–µ–Ω–∏–µ –ø–æ–ª–æ–∂–µ–Ω–∏—è –Ω–∞ —Ä—ã–Ω–∫–µ . –ù–µ–∫–æ—Ç–æ—Ä—ã...
246,2021-12-21,-176881.417077,0,‚Äã‚Äã üìù‚ùì –£–∂–µ —Å–µ–≥–æ–¥–Ω—è –≤ 16:30 (–º—Å–∫): –æ–Ω–ª–∞–π–Ω-—Å–µ–º–∏–Ω...,–£–†–û–í–ï–ù–¨ –ó–ê–ë–û–õ–ï–í–ê–ï–ú–û–°–¢–ò –í –ú–ò–†–ï –í–ù–û–í–¨ –†–ê–°–¢–ï–¢ –ó–∞...,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nü§∑‚Äç‚ôÇÔ∏è22 –¥–µ–∫–∞–±—Ä—è\...,"–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìä ""–†–µ–Ω–µ—Å—Å–∞–Ω—Å —Å—Ç—Ä–∞—Ö–æ–≤–∞–Ω–∏–µ"" —Ä–∞...",,"Micron –ø–æ–º–æ–≥–∞–µ—Ç –æ–±—ã–≥—Ä—ã–≤–∞—Ç—å —Ä—ã–Ω–∫–∏ –í—á–µ—Ä–∞, –∫–æ–≥–¥–∞..."
247,2021-12-22,-206943.414418,0,#–ù–æ–≤–æ—Å—Ç–∏–ö–æ–º–ø–∞–Ω–∏–π ‚ö°Ô∏è–†–æ—Å—Å–∏–π—Å–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏: –æ—Å–Ω–æ...,–†–û–°–¢ –ü–ê–ù–î–ï–ú–ò–ò –í –°–®–ê –£–°–ò–õ–ò–í–ê–ï–¢–°–Ø –ó–∞ –ø–æ—Å–ª–µ–¥–Ω–∏–µ ...,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nüí∞ –î–µ—Ç—Å–∫–∏–π –º–∏—Ä (...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìä Reuters: –≤ 2022 –≥–æ–¥—É ¬´–ê—Ç–æ–Ω...,,China Mobile –≤ –®–∞–Ω—Ö–∞–µ ‚Äì –≤—Ç–æ—Ä–æ–π –ø–æ—Å–ª–µ Rivian –∫...
248,2021-12-23,23073.596468,1,‚Äã‚Äã üìùüí° –ú–∞–∫—Ä–æ—ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–∏–µ –∏–Ω–¥–∏–∫–∞—Ç–æ—Ä—ã –ø–æ –≤—Å–µ–º —Å...,"‚Ä¢ –†—ã–Ω–æ–∫ –û–§–ó –≤–µ–¥—ë—Ç —Å–µ–±—è —Ç–∞–∫, –∫–∞–∫ –±—É–¥—Ç–æ —É–≤–µ—Ä–µ...",üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\n‚ùóÔ∏è –ï–∂–µ–≥–æ–¥–Ω–∞—è –ø—Ä...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìä –ò–Ω—Ñ–ª—è—Ü–∏—è –≤ –†–æ—Å—Å–∏–∏ —Å 14 –ø–æ ...,,–£–∂–µ –Ω–µ —Ç–∞–∫–æ–π —Å—Ç—Ä–∞—à–Ω—ã–π –û–º–∏–∫—Ä–æ–Ω –Ω–∞–ø—É–≥–∞–ª –°–∏–∞–Ω—å ....


In [86]:
X = df_final.drop(['date_start', 'pnl', 'pnl_sign'], axis=1)
y = df_final['pnl_sign']

In [87]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=121)

In [88]:
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords

russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/buchkovv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
from scipy.sparse import coo_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer

X_train_transformed = []
X_val_transformed = []
for i in range(len(X_train.columns)):
    tf_idf = TfidfVectorizer(stop_words=russian_stopwords, ngram_range=(1,1))
    
    X_train_transformed.append(coo_matrix(tf_idf.fit_transform(X_train.iloc[:,i].astype(str))))
    X_val_transformed.append(coo_matrix(tf_idf.transform(X_val.iloc[:,i].astype(str))))

In [90]:
for x in X_train_transformed:
    print(x.shape)

(198, 22962)
(198, 21692)
(198, 25564)
(198, 12621)
(198, 2315)
(198, 47666)


In [91]:
X_train_transformed = hstack(X_train_transformed)
X_val_transformed = hstack(X_val_transformed)
X_train_transformed

<198x132820 sparse matrix of type '<class 'numpy.float64'>'
	with 697493 stored elements in COOrdinate format>

In [92]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(multi_class='auto', solver='lbfgs', verbose=False)
log_reg.fit(X_train_transformed, y_train)

preds = log_reg.predict(X_val_transformed)
accuracy_score(preds, y_val)

0.66