In [49]:
import os

import numpy as np
import pandas as pd

In [67]:
BACKTEST_CCY = 'USDRUB'
BACKTEST_DAYS = 5

PATH_TEXTS = 'data/telegram'
PATH_SPOT_PNL = 'data/moex'
PATH_OPT_PNL = 'data/pnl'

In [51]:
# Get all available data_sources
sources = os.listdir(PATH_TEXTS)
sources

['cbonds.csv',
 'themovchans.csv',
 'headlines_QUANTS.csv',
 'War_Wealth_Wisdom.csv',
 'mmi.csv',
 'vts.csv',
 'signal.csv',
 '.gitignore',
 'rshb_invest.csv',
 'Alfa_Wealth.csv',
 'sky_bond.csv',
 'bitkogan.csv']

In [52]:
# Generate dataframes
data_sources = []

for s in sources:
    if s != '.gitignore':
        source_data = pd.read_csv(f'{PATH_TEXTS}/{s}')
        data_sources.append((s.split('.')[0], source_data))

In [128]:
spot_pnl = pd.read_csv(f'{PATH_SPOT_PNL}/{BACKTEST_CCY}.csv')
spot_pnl.dropna(inplace=True)
spot_pnl['date_start'] = pd.to_datetime(spot_pnl['timestamp']).dt.strftime('%Y-%m-%d')
spot_pnl.drop(['timestamp'], axis=1, inplace=True)

spot_pnl.drop_duplicates(subset=['date_start'], keep='first', inplace=True)
spot_pnl.reset_index(inplace=True, drop=True)

spot_pnl['pnl'] = spot_pnl['Bid'].div(spot_pnl['Bid'].shift(1)) - 1
spot_pnl['pnl_sign'] = spot_pnl['pnl'].apply(lambda x: 1 if x >= 0 else 0)
spot_pnl

Unnamed: 0,Bid,Ask,date_start,pnl,pnl_sign
0,94.994073,96.879993,2022-03-01,,0
1,95.803033,95.971040,2022-03-02,0.008516,1
2,114.920925,114.999880,2022-03-03,0.199554,1
3,105.000043,108.100490,2022-03-04,-0.086328,0
4,116.958480,117.096750,2022-03-09,0.113890,1
...,...,...,...,...,...
153,61.484698,61.978255,2022-10-18,-0.003440,0
154,61.588507,62.014587,2022-10-19,0.001688,1
155,61.600813,62.025917,2022-10-20,0.000200,1
156,61.144888,61.667988,2022-10-21,-0.007401,0


In [120]:
# Create target variable dataframe
pnl = pd.read_csv(f'{PATH_OPT_PNL}/Backtest_{BACKTEST_CCY}_{BACKTEST_DAYS}_days.txt')
pnl['date_start'] = pd.to_datetime(pnl['date_start']).dt.strftime('%Y-%m-%d')
pnl['pnl_sign'] = pnl['pnl'].apply(lambda x: 1 if x >= 0 else 0)
pnl

Unnamed: 0,date_start,pnl,pnl_sign
0,2021-01-04,291264.097914,1
1,2021-01-05,-411993.830320,0
2,2021-01-06,521491.686795,1
3,2021-01-08,62842.634116,1
4,2021-01-11,-537598.706217,0
...,...,...,...
245,2021-12-20,-407527.554561,0
246,2021-12-21,-176881.417077,0
247,2021-12-22,-206943.414418,0
248,2021-12-23,23073.596468,1


In [121]:
# Get balance of the sample
# pnl['pnl_sign'].sum() / pnl.shape[0]
spot_pnl['pnl_sign'].sum() / spot_pnl.shape[0]

0.4741847826086957

In [12]:
# df = df[(df['date'] >= '2022-03-01') & (df['date'] <= '2022-10-31')]

In [125]:
data_sources[0][1]

Unnamed: 0,id,date,text
0,9,2017-07-28T10:56:14,Cbonds.ru –∑–∞–ø—É—Å—Ç–∏–ª–æ –∫–∞–Ω–∞–ª –≤ Telegram. –ü–ª–∞–Ω–∏—Ä—É...
1,10,2017-08-03T15:28:17,–û–±–ª–∏–≥–∞—Ü–∏–æ–Ω–Ω—ã–π –±—é–ª–ª–µ—Ç–µ–Ω—å Cbonds ‚Äì –≤—Å–µ –¥–∞–Ω–Ω—ã–µ –æ ...
2,11,2017-08-03T15:41:02,Cbonds prepared monthly report: CBONDS GLOBAL ...
3,12,2017-08-03T18:11:32,–î–∞–π–¥–∂–µ—Å—Ç Cbonds –æ—Ç 3 –∞–≤–≥—É—Å—Ç–∞:¬´–ì—Ä—É–ø–ø–∞ –ö–æ–º–ø–∞–Ω–∏–π...
4,13,2017-08-04T12:42:41,–°–æ–≤–∫–æ–º–±–∞–Ω–∫ –æ—Ç–∫—Ä—ã–ª –∫–Ω–∏–≥—É –∑–∞—è–≤–æ–∫ –Ω–∞ –≤—Ç–æ—Ä–∏—á–Ω–æ–µ —Ä–∞...
...,...,...,...
11997,13124,2022-11-04T12:38:56,‚Äã‚Äã #–†–µ–π—Ç–∏–Ω–≥–∏–°–ù–ì üóÇ –ü—Ä–∏—Å–≤–æ–µ–Ω–∏–µ / –ò–∑–º–µ–Ω–µ–Ω–∏–µ —Ä–µ–π—Ç...
11998,13125,2022-11-04T13:17:23,#CbondsNewsletter üåéüì∞ –í –æ–∫—Ç—è–±—Ä–µ –æ–±—ä–µ–º –Ω–æ–≤—ã—Ö –≤—ã...
11999,13126,2022-11-04T14:30:00,"‚Äã‚Äã üè¢–û–Ω–ª–∞–π–Ω-—Å–µ–º–∏–Ω–∞—Ä ¬´–ö–ª—é—á–µ–≤—ã–µ —Ñ–∞–∫—Ç—ã, —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–µ..."
12000,13128,2022-11-05T11:30:11,#–ê–Ω–∞–ª–∏—Ç–∏–∫–∞ üèÜ Research Hub Cbonds: —Å–∞–º–æ–µ –ø–æ–ø—É...


In [14]:
# df = pnl
# for source_name, source_dataframe in data_sources:
#     df[f'{source_name}'] = None
    
#     i = 0
#     texts = []
#     for _, row in source_dataframe.iterrows():
#         if i >= df.shape[0]:
#             break

#         if pd.to_datetime(row['date']).strftime('%Y-%m-%d') <= df['date_start'][i]:
#             texts.append(row['text'])
#         else:
#             df.at[i, source_name] = texts
#             texts = []
#             i += 1
#             continue

In [129]:
df = spot_pnl
for source_name, source_dataframe in data_sources:
    df[f'{source_name}'] = ''
    
    i = 0
    texts = ''
    for _, row in source_dataframe.iterrows():
        if i >= df.shape[0]:
            break

        if pd.to_datetime(row['date']).strftime('%Y-%m-%d') <= df['date_start'][i]:
            texts += ' ' + row['text']
        else:
            df.at[i, source_name] = texts
            texts = ''
            i += 1
            continue

In [130]:
df = df.iloc[1:]
df

Unnamed: 0,Bid,Ask,date_start,pnl,pnl_sign,cbonds,themovchans,headlines_QUANTS,War_Wealth_Wisdom,mmi,vts,signal,rshb_invest,Alfa_Wealth,sky_bond,bitkogan
1,95.803033,95.971040,2022-03-02,0.008516,1,#–ù–æ–≤–æ—Å—Ç–∏–ö–æ–º–ø–∞–Ω–∏–π ‚ö°Ô∏è –†–æ—Å—Å–∏–π—Å–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏: –æ—Å–Ω...,,,,"üü¢üî¥ - –ó–∞–æ–∫–µ–∞–Ω—Å–∫–∏–µ –∏–Ω–¥–µ–∫—Å—ã –∑–∞–∫—Ä—ã–ª–∏—Å—å –≤ –º–∏–Ω—É—Å–µ, ...",,#–ú–∞–∫—Ä–æ \nüìä –†—ã–Ω–æ–∫ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–æ–≤ –≤ –†–§ –ø–æ –ø—Ä–æ–∏–∑–≤–æ–¥–∏...,–ù–û–í–û–°–¢–ò –î–ù–Ø üìä –°–®–ê –∏ —Å—Ç—Ä–∞–Ω—ã –ú–≠–ê –≤—ã—Å–≤–æ–±–æ–¥—è—Ç 60 ...,,‚Äã‚Äã PetroRio\n \n–ö—Ä—É–ø–Ω–µ–π—à–∞—è –±—Ä–∞–∑–∏–ª—å—Å–∫–∞—è –Ω–µ–∑–∞–≤–∏...,–î—Ä—É–∑—å—è! –°–æ–±—ã—Ç–∏—è –≤ –º–∏—Ä–µ —Ä–∞–∑–≤–æ—Ä–∞—á–∏–≤–∞—é—Ç—Å—è —Å —Ç–∞–∫–æ...
2,114.920925,114.999880,2022-03-03,0.199554,1,‚ö°Ô∏è–í–∞–∂–Ω–æ–µ –Ω–∞ —Ä—ã–Ω–∫–∞—Ö: üèõ–ë–∞–Ω–∫ –†–æ—Å—Å–∏–∏ –¥–æ –∫–æ–Ω—Ü–∞ –≥–æ–¥...,,,,‚ÄºÔ∏è ( Reuters ) –†–æ—Å—Å–∏–π—Å–∫–∏–π –¶–µ–Ω—Ç—Ä–æ–±–∞–Ω–∫ –≤–≤–µ–ª —Å 3...,‚Äã‚Äã –ë–æ—Ä—å–±–∞ –Ω–∞ –≤–∞–ª—é—Ç–Ω–æ–º —Ä—ã–Ω–∫–µ . \n–ì—Ä–∞—Ñ–∏–∫ —Ä—É–±–ª—å/...,#GAZP #ROSN #–°–∞–Ω–∫—Ü–∏–∏ \nüá∫üá∏ –†–µ—Å–ø—É–±–ª–∏–∫–∞–Ω—Ü—ã ...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìà –ü—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–æ –ø—Ä–∏–æ—Å—Ç–∞–Ω–æ–≤–∏—Ç...,,"‚Äã‚Äã Alibaba\n \n Alibaba Group , –Ω–∞–≤–µ—Ä–Ω–æ, —Å–∞–º–∞...","Fitch. –ß—Ç–æ –∑–∞ –¥–∏–≤–µ—Ä—Å–∏—è? –í Fitch –æ–ø–∞—Å–∞—é—Ç—Å—è ,..."
3,105.000043,108.100490,2022-03-04,-0.086328,0,"‚ö°Ô∏è–í–∞–∂–Ω–æ–µ –Ω–∞ —Ä—ã–Ω–∫–∞—Ö: üèõüíµ–ë–∞–Ω–∫ –†–æ—Å—Å–∏–∏, –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª...",,,,‚ÄºÔ∏è- –ë–∞–Ω–∫ –†–æ—Å—Å–∏–∏ –ø—Ä–∏–Ω—è–ª —Ä–µ—à–µ–Ω–∏–µ –Ω–µ –≤–æ–∑–æ–±–Ω–æ–≤–ª—è...,,"$=109,54—Ä. RTS 0,00% BRENT +3,35%üìå LIVE –ö–∞—Ä—Ç...",–ù–û–í–û–°–¢–ò –î–ù–Ø üìä –§–†–° –æ–∂–∏–¥–∞–µ—Ç —Ä–æ—Å—Ç–∞ –∏–Ω—Ñ–ª—è—Ü–∏–∏ –≤ –°–®...,,"‚Äã‚Äã –ù–µ —Ç–∞–∫ –¥–∞–≤–Ω–æ —è —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞–ª –æ —Ç–æ–º, —á—Ç–æ –æ–±...",–ö–∞–∫ –≤—Å–µ –≤—ã—à–µ–ø–µ—Ä–µ—á–∏—Å–ª–µ–Ω–Ω–æ–µ –≤–ª–∏—è–µ—Ç –Ω–∞ –∂–∏–∑–Ω—å –∫–æ–Ω...
4,116.958480,117.096750,2022-03-09,0.113890,1,#–†–µ–π—Ç–∏–Ω–≥–∏–ú–∏—Ä üìä –†–µ–π—Ç–∏–Ω–≥–æ–≤—ã–µ –¥–µ–π—Å—Ç–≤–∏—è –≤ –∏–Ω–æ—Å—Ç—Ä–∞...,,,–ï—â—ë –±–æ–ª–µ–µ –¥—Ä–∞–º–∞—Ç–∏—á–Ω–∞—è —Å–∏—Ç—É–∞—Ü–∏—è —Å –º–∏—Ä–æ–≤—ã–º–∏ —Ü–µ–Ω...,‚ÄºÔ∏è –í—Å–µ –∫–∞—Ä—Ç—ã –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã—Ö –ø–ª–∞—Ç–µ–∂–Ω—ã—Ö —Å–∏—Å—Ç–µ–º V...,–ù–µ–¥–æ—Ä—É–±–ª–∏\n –î–≤–µ –Ω–æ–≤–æ—Å—Ç–∏. Visa –∏ Mastercard —Ä–æ...,#–°–ú–ò \nüì∞ –ê–º–µ—Ä–∏–∫–∞–Ω—Å–∫–∏–µ –°–ú–ò Bloomberg –∏ CNN –æ–±—ä...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìà –¢–æ—Ä–≥–∏ –Ω–∞ –æ—Å–Ω–æ–≤–Ω–æ–π —Å–µ—Å—Å–∏–∏ –ú...,,‚Äã‚Äã –ù–∞ —Å—á–µ—Ç —Å—Ç–∞—Ç—å–∏ –†–ë–ö –æ–± –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏—è—Ö –∏ —Å–ª–æ...,–ñ—É—Ä–Ω–∞–ª–∏—Å—Ç—ã Bloomberg –ø—Ä–æ—â–∞—é—Ç—Å—è —Å –†–æ—Å—Å–∏–µ–π –ê–≥–µ–Ω...
5,120.600105,120.751750,2022-03-10,0.031136,1,üíª JPMorgan Chase & Co. –æ—Ç–∫–∞–∑–∞–ª—Å—è –æ—Ç —Ä–æ–ª–∏ –∫–æ–Ω...,,,,üü¢ - –ù–µ—Ñ—Ç—å –≤ –ø–æ—Å–ª–µ–¥–Ω–∏–µ —Å—É—Ç–∫–∏ –∫—Ä–∞–π–Ω–µ –≤–æ–ª–∞—Ç–∏–ª—å–Ω...,,#–°–ú–ò \nüì∞ Reuters —É–∑–Ω–∞–ª –æ –ø–ª–∞–Ω–∞—Ö –ï–° –æ—Ç–∫–∞–∑–∞—Ç—å—Å...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìä –ò–Ω—Ñ–ª—è—Ü–∏—è –≤ –†–æ—Å—Å–∏–∏ –∑–∞ –Ω–µ–¥–µ–ª...,–†—ã–Ω–æ–∫ –∂–¥—ë—Ç –æ—Ç–∫—Ä—ã—Ç–∏—è —Ç–æ—Ä–≥–æ–≤ –Ω–∞ —Å–ª–µ–¥—É—é—â–µ–π –Ω–µ–¥–µ–ª...,,"–ì–æ–¥–æ–≤–∞—è –∏–Ω—Ñ–ª—è—Ü–∏—è –≤ –†–æ—Å—Å–∏–∏ –¥–æ—Å—Ç–∏–≥–ª–∞ 10,4% . –†–æ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,61.484698,61.978255,2022-10-18,-0.003440,0,‚è∞ Last-call: –ù–ï–î–ï–õ–Ø –¥–æ XI –∫–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏–∏ ¬´–ò–ø–æ—Ç–µ...,,,,–†–´–ù–û–ö –ñ–ò–õ–û–ô –ù–ï–î–í–ò–ñ–ò–ú–û–°–¢–ò –í –ú–û–°–ö–í–ï: –î–ò–ù–ê–ú–ò–ö–ê –í...,,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\n‚ùóÔ∏è XX —Å—ä–µ–∑–¥ –ö–æ–º–ø...,üìà¬† –ú–µ—Ç–∞–ª–ª—É—Ä–≥–∏ –ø—Ä–æ—Ç–∏–≤ —Ä–æ—Å—Ç–∞ —Ç–∞—Ä–∏—Ñ–æ–≤ –û–ê–û ¬´–†–ñ–î¬ª....,–í –î—É–±–∞–π —á–µ—Ä–µ–∑ –ê–ª—å—Ñ–∞-–ö–∞–ø–∏—Ç–∞–ª ( –ø—Ä–æ–¥–æ–ª–∂–µ–Ω–∏–µ )–í...,,–ì—Ä–∞—Ñ–∏–∫ –∏–Ω–¥–µ–∫—Å–∞ DXY @bitkogan –î–æ—Ö–æ–¥–Ω–æ—Å—Ç—å 10-...
154,61.588507,62.014587,2022-10-19,0.001688,1,#–ï–≤—Ä–æ–æ–±–ª–∏–≥–∞—Ü–∏–∏ #–û–§–ó üìù –í—ã–ø–ª–∞—Ç—ã –∫—É–ø–æ–Ω–æ–≤ –ø–æ –û–§...,,,,–ò–ù–§–õ–Ø–¶–ò–Ø –í UK: –¶–ï–ù–û–í–û–ô –ö–û–®–ú–ê–† –í–ù–û–í–¨ –£–°–ò–õ–ò–í–ê–ï–¢...,,"$=61,87—Ä. RTS -2,61% BRENT +0,97%üìå LIVE –ö–∞—Ä—Ç...",üìà¬† ¬´–í–µ–¥–æ–º–æ—Å—Ç–∏¬ª: ¬´–†–æ—Å–Ω–∞–Ω–æ¬ª –º–æ–≥—É—Ç —Ä–∞—Å—Ñ–æ—Ä–º–∏—Ä–æ–≤–∞—Ç...,,,–ù–∞ —Ä—ã–Ω–∫–∞—Ö –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç—Å—è —Ñ–∏–µ—Å—Ç–∞ –í–µ–ª–∏–∫–æ–±—Ä–∏—Ç–∞–Ω–∏–∏...
155,61.600813,62.025917,2022-10-20,0.000200,1,#–ï–≤—Ä–æ–æ–±–ª–∏–≥–∞—Ü–∏–∏ #–û–§–ó üìù –í—ã–ø–ª–∞—Ç—ã –∫—É–ø–æ–Ω–æ–≤ –ø–æ –û–§...,,,–ê–∫—Ü–∏–∏ Tesla —É–ø–∞–ª–∏ –ø–æ—Å–ª–µ –ø—É–±–ª–∏–∫–∞—Ü–∏–∏ –æ—Ç—á–µ—Ç–Ω–æ—Å—Ç–∏...,üî¥üü¢ - –°–Ω–∏–∂–µ–Ω–∏–µ –≤ –ø—Ä–µ–¥–µ–ª–∞—Ö –ø—Ä–æ—Ü–µ–Ω—Ç–∞ –Ω–∞ WallStre...,,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nüá∑üá∫ –†–æ—Å—Å–∏—è: –∑–∞—Å–µ–¥...,üìà¬† –†–æ—Å—Å—Ç–∞—Ç —Å–æ–æ–±—â–∏–ª –æ —Ä–æ—Å—Ç–µ —Ü–µ–Ω –ø—Ä–æ–º–ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç...,,,–ê —Ç–µ–ø–µ—Ä—å –æ —Ä—ã–Ω–∫–∞—Ö. –ß—Ç–æ –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ–≥–æ —Ç–∞–º?‚ñ™Ô∏è–ò–Ω–¥–µ...
156,61.144888,61.667988,2022-10-21,-0.007401,0,‚Äã‚Äã #–†–µ–π—Ç–∏–Ω–≥–∏–°–ù–ì üóÇ –ü—Ä–∏—Å–≤–æ–µ–Ω–∏–µ / –ò–∑–º–µ–Ω–µ–Ω–∏–µ —Ä–µ–π...,,,,üî¥üü¢ - –î–æ—Ö–æ–¥–Ω–æ—Å—Ç–∏ –æ—Å–Ω–æ–≤–Ω—ã—Ö —Å—É–≤–µ—Ä–µ–Ω–Ω—ã—Ö –±–µ–Ω—á–º–∞—Ä–∫–æ...,,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nüá∑üá∫ –†–æ—Å—Å–∏—è: –∑–∞—Å–µ–¥...,üìà¬† Softline –æ–∫–æ–Ω—á–∞—Ç–µ–ª—å–Ω–æ —Ä–∞–∑–¥–µ–ª–∏–ª–∞—Å—å –Ω–∞ –¥–≤–µ –∫...,,,–ï—â–µ –Ω–µ–º–Ω–æ–≥–æ –æ —Ä—ã–Ω–∫–∞—Ö. –í—Å–µ–º–∏—Ä–Ω–æ –∏–∑–≤–µ—Å—Ç–Ω—ã–π —ç–∫–æ...


In [131]:
df = df[~df.isin([''])].dropna(axis=1, how='all')

In [132]:
# df_final = df.drop(['cbonds', 'signal', 'rshb_invest', 'Alfa_Wealth', 'bitkogan'], axis=1)
df_final = df
df_final

Unnamed: 0,Bid,Ask,date_start,pnl,pnl_sign,cbonds,War_Wealth_Wisdom,mmi,vts,signal,rshb_invest,Alfa_Wealth,sky_bond,bitkogan
1,95.803033,95.971040,2022-03-02,0.008516,1,#–ù–æ–≤–æ—Å—Ç–∏–ö–æ–º–ø–∞–Ω–∏–π ‚ö°Ô∏è –†–æ—Å—Å–∏–π—Å–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏: –æ—Å–Ω...,,"üü¢üî¥ - –ó–∞–æ–∫–µ–∞–Ω—Å–∫–∏–µ –∏–Ω–¥–µ–∫—Å—ã –∑–∞–∫—Ä—ã–ª–∏—Å—å –≤ –º–∏–Ω—É—Å–µ, ...",,#–ú–∞–∫—Ä–æ \nüìä –†—ã–Ω–æ–∫ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–æ–≤ –≤ –†–§ –ø–æ –ø—Ä–æ–∏–∑–≤–æ–¥–∏...,–ù–û–í–û–°–¢–ò –î–ù–Ø üìä –°–®–ê –∏ —Å—Ç—Ä–∞–Ω—ã –ú–≠–ê –≤—ã—Å–≤–æ–±–æ–¥—è—Ç 60 ...,,‚Äã‚Äã PetroRio\n \n–ö—Ä—É–ø–Ω–µ–π—à–∞—è –±—Ä–∞–∑–∏–ª—å—Å–∫–∞—è –Ω–µ–∑–∞–≤–∏...,–î—Ä—É–∑—å—è! –°–æ–±—ã—Ç–∏—è –≤ –º–∏—Ä–µ —Ä–∞–∑–≤–æ—Ä–∞—á–∏–≤–∞—é—Ç—Å—è —Å —Ç–∞–∫–æ...
2,114.920925,114.999880,2022-03-03,0.199554,1,‚ö°Ô∏è–í–∞–∂–Ω–æ–µ –Ω–∞ —Ä—ã–Ω–∫–∞—Ö: üèõ–ë–∞–Ω–∫ –†–æ—Å—Å–∏–∏ –¥–æ –∫–æ–Ω—Ü–∞ –≥–æ–¥...,,‚ÄºÔ∏è ( Reuters ) –†–æ—Å—Å–∏–π—Å–∫–∏–π –¶–µ–Ω—Ç—Ä–æ–±–∞–Ω–∫ –≤–≤–µ–ª —Å 3...,‚Äã‚Äã –ë–æ—Ä—å–±–∞ –Ω–∞ –≤–∞–ª—é—Ç–Ω–æ–º —Ä—ã–Ω–∫–µ . \n–ì—Ä–∞—Ñ–∏–∫ —Ä—É–±–ª—å/...,#GAZP #ROSN #–°–∞–Ω–∫—Ü–∏–∏ \nüá∫üá∏ –†–µ—Å–ø—É–±–ª–∏–∫–∞–Ω—Ü—ã ...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìà –ü—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–æ –ø—Ä–∏–æ—Å—Ç–∞–Ω–æ–≤–∏—Ç...,,"‚Äã‚Äã Alibaba\n \n Alibaba Group , –Ω–∞–≤–µ—Ä–Ω–æ, —Å–∞–º–∞...","Fitch. –ß—Ç–æ –∑–∞ –¥–∏–≤–µ—Ä—Å–∏—è? –í Fitch –æ–ø–∞—Å–∞—é—Ç—Å—è ,..."
3,105.000043,108.100490,2022-03-04,-0.086328,0,"‚ö°Ô∏è–í–∞–∂–Ω–æ–µ –Ω–∞ —Ä—ã–Ω–∫–∞—Ö: üèõüíµ–ë–∞–Ω–∫ –†–æ—Å—Å–∏–∏, –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª...",,‚ÄºÔ∏è- –ë–∞–Ω–∫ –†–æ—Å—Å–∏–∏ –ø—Ä–∏–Ω—è–ª —Ä–µ—à–µ–Ω–∏–µ –Ω–µ –≤–æ–∑–æ–±–Ω–æ–≤–ª—è...,,"$=109,54—Ä. RTS 0,00% BRENT +3,35%üìå LIVE –ö–∞—Ä—Ç...",–ù–û–í–û–°–¢–ò –î–ù–Ø üìä –§–†–° –æ–∂–∏–¥–∞–µ—Ç —Ä–æ—Å—Ç–∞ –∏–Ω—Ñ–ª—è—Ü–∏–∏ –≤ –°–®...,,"‚Äã‚Äã –ù–µ —Ç–∞–∫ –¥–∞–≤–Ω–æ —è —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞–ª –æ —Ç–æ–º, —á—Ç–æ –æ–±...",–ö–∞–∫ –≤—Å–µ –≤—ã—à–µ–ø–µ—Ä–µ—á–∏—Å–ª–µ–Ω–Ω–æ–µ –≤–ª–∏—è–µ—Ç –Ω–∞ –∂–∏–∑–Ω—å –∫–æ–Ω...
4,116.958480,117.096750,2022-03-09,0.113890,1,#–†–µ–π—Ç–∏–Ω–≥–∏–ú–∏—Ä üìä –†–µ–π—Ç–∏–Ω–≥–æ–≤—ã–µ –¥–µ–π—Å—Ç–≤–∏—è –≤ –∏–Ω–æ—Å—Ç—Ä–∞...,–ï—â—ë –±–æ–ª–µ–µ –¥—Ä–∞–º–∞—Ç–∏—á–Ω–∞—è —Å–∏—Ç—É–∞—Ü–∏—è —Å –º–∏—Ä–æ–≤—ã–º–∏ —Ü–µ–Ω...,‚ÄºÔ∏è –í—Å–µ –∫–∞—Ä—Ç—ã –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã—Ö –ø–ª–∞—Ç–µ–∂–Ω—ã—Ö —Å–∏—Å—Ç–µ–º V...,–ù–µ–¥–æ—Ä—É–±–ª–∏\n –î–≤–µ –Ω–æ–≤–æ—Å—Ç–∏. Visa –∏ Mastercard —Ä–æ...,#–°–ú–ò \nüì∞ –ê–º–µ—Ä–∏–∫–∞–Ω—Å–∫–∏–µ –°–ú–ò Bloomberg –∏ CNN –æ–±—ä...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìà –¢–æ—Ä–≥–∏ –Ω–∞ –æ—Å–Ω–æ–≤–Ω–æ–π —Å–µ—Å—Å–∏–∏ –ú...,,‚Äã‚Äã –ù–∞ —Å—á–µ—Ç —Å—Ç–∞—Ç—å–∏ –†–ë–ö –æ–± –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏—è—Ö –∏ —Å–ª–æ...,–ñ—É—Ä–Ω–∞–ª–∏—Å—Ç—ã Bloomberg –ø—Ä–æ—â–∞—é—Ç—Å—è —Å –†–æ—Å—Å–∏–µ–π –ê–≥–µ–Ω...
5,120.600105,120.751750,2022-03-10,0.031136,1,üíª JPMorgan Chase & Co. –æ—Ç–∫–∞–∑–∞–ª—Å—è –æ—Ç —Ä–æ–ª–∏ –∫–æ–Ω...,,üü¢ - –ù–µ—Ñ—Ç—å –≤ –ø–æ—Å–ª–µ–¥–Ω–∏–µ —Å—É—Ç–∫–∏ –∫—Ä–∞–π–Ω–µ –≤–æ–ª–∞—Ç–∏–ª—å–Ω...,,#–°–ú–ò \nüì∞ Reuters —É–∑–Ω–∞–ª –æ –ø–ª–∞–Ω–∞—Ö –ï–° –æ—Ç–∫–∞–∑–∞—Ç—å—Å...,–ì–õ–ê–í–ù–û–ï –í –†–û–°–°–ò–ò üìä –ò–Ω—Ñ–ª—è—Ü–∏—è –≤ –†–æ—Å—Å–∏–∏ –∑–∞ –Ω–µ–¥–µ–ª...,–†—ã–Ω–æ–∫ –∂–¥—ë—Ç –æ—Ç–∫—Ä—ã—Ç–∏—è —Ç–æ—Ä–≥–æ–≤ –Ω–∞ —Å–ª–µ–¥—É—é—â–µ–π –Ω–µ–¥–µ–ª...,,"–ì–æ–¥–æ–≤–∞—è –∏–Ω—Ñ–ª—è—Ü–∏—è –≤ –†–æ—Å—Å–∏–∏ –¥–æ—Å—Ç–∏–≥–ª–∞ 10,4% . –†–æ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,61.484698,61.978255,2022-10-18,-0.003440,0,‚è∞ Last-call: –ù–ï–î–ï–õ–Ø –¥–æ XI –∫–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏–∏ ¬´–ò–ø–æ—Ç–µ...,,–†–´–ù–û–ö –ñ–ò–õ–û–ô –ù–ï–î–í–ò–ñ–ò–ú–û–°–¢–ò –í –ú–û–°–ö–í–ï: –î–ò–ù–ê–ú–ò–ö–ê –í...,,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\n‚ùóÔ∏è XX —Å—ä–µ–∑–¥ –ö–æ–º–ø...,üìà¬† –ú–µ—Ç–∞–ª–ª—É—Ä–≥–∏ –ø—Ä–æ—Ç–∏–≤ —Ä–æ—Å—Ç–∞ —Ç–∞—Ä–∏—Ñ–æ–≤ –û–ê–û ¬´–†–ñ–î¬ª....,–í –î—É–±–∞–π —á–µ—Ä–µ–∑ –ê–ª—å—Ñ–∞-–ö–∞–ø–∏—Ç–∞–ª ( –ø—Ä–æ–¥–æ–ª–∂–µ–Ω–∏–µ )–í...,,–ì—Ä–∞—Ñ–∏–∫ –∏–Ω–¥–µ–∫—Å–∞ DXY @bitkogan –î–æ—Ö–æ–¥–Ω–æ—Å—Ç—å 10-...
154,61.588507,62.014587,2022-10-19,0.001688,1,#–ï–≤—Ä–æ–æ–±–ª–∏–≥–∞—Ü–∏–∏ #–û–§–ó üìù –í—ã–ø–ª–∞—Ç—ã –∫—É–ø–æ–Ω–æ–≤ –ø–æ –û–§...,,–ò–ù–§–õ–Ø–¶–ò–Ø –í UK: –¶–ï–ù–û–í–û–ô –ö–û–®–ú–ê–† –í–ù–û–í–¨ –£–°–ò–õ–ò–í–ê–ï–¢...,,"$=61,87—Ä. RTS -2,61% BRENT +0,97%üìå LIVE –ö–∞—Ä—Ç...",üìà¬† ¬´–í–µ–¥–æ–º–æ—Å—Ç–∏¬ª: ¬´–†–æ—Å–Ω–∞–Ω–æ¬ª –º–æ–≥—É—Ç —Ä–∞—Å—Ñ–æ—Ä–º–∏—Ä–æ–≤–∞—Ç...,,,–ù–∞ —Ä—ã–Ω–∫–∞—Ö –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç—Å—è —Ñ–∏–µ—Å—Ç–∞ –í–µ–ª–∏–∫–æ–±—Ä–∏—Ç–∞–Ω–∏–∏...
155,61.600813,62.025917,2022-10-20,0.000200,1,#–ï–≤—Ä–æ–æ–±–ª–∏–≥–∞—Ü–∏–∏ #–û–§–ó üìù –í—ã–ø–ª–∞—Ç—ã –∫—É–ø–æ–Ω–æ–≤ –ø–æ –û–§...,–ê–∫—Ü–∏–∏ Tesla —É–ø–∞–ª–∏ –ø–æ—Å–ª–µ –ø—É–±–ª–∏–∫–∞—Ü–∏–∏ –æ—Ç—á–µ—Ç–Ω–æ—Å—Ç–∏...,üî¥üü¢ - –°–Ω–∏–∂–µ–Ω–∏–µ –≤ –ø—Ä–µ–¥–µ–ª–∞—Ö –ø—Ä–æ—Ü–µ–Ω—Ç–∞ –Ω–∞ WallStre...,,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nüá∑üá∫ –†–æ—Å—Å–∏—è: –∑–∞—Å–µ–¥...,üìà¬† –†–æ—Å—Å—Ç–∞—Ç —Å–æ–æ–±—â–∏–ª –æ —Ä–æ—Å—Ç–µ —Ü–µ–Ω –ø—Ä–æ–º–ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç...,,,–ê —Ç–µ–ø–µ—Ä—å –æ —Ä—ã–Ω–∫–∞—Ö. –ß—Ç–æ –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ–≥–æ —Ç–∞–º?‚ñ™Ô∏è–ò–Ω–¥–µ...
156,61.144888,61.667988,2022-10-21,-0.007401,0,‚Äã‚Äã #–†–µ–π—Ç–∏–Ω–≥–∏–°–ù–ì üóÇ –ü—Ä–∏—Å–≤–æ–µ–Ω–∏–µ / –ò–∑–º–µ–Ω–µ–Ω–∏–µ —Ä–µ–π...,,üî¥üü¢ - –î–æ—Ö–æ–¥–Ω–æ—Å—Ç–∏ –æ—Å–Ω–æ–≤–Ω—ã—Ö —Å—É–≤–µ—Ä–µ–Ω–Ω—ã—Ö –±–µ–Ω—á–º–∞—Ä–∫–æ...,,üî¶ –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù –°–µ–≥–æ–¥–Ω—è\nüá∑üá∫ –†–æ—Å—Å–∏—è: –∑–∞—Å–µ–¥...,üìà¬† Softline –æ–∫–æ–Ω—á–∞—Ç–µ–ª—å–Ω–æ —Ä–∞–∑–¥–µ–ª–∏–ª–∞—Å—å –Ω–∞ –¥–≤–µ –∫...,,,–ï—â–µ –Ω–µ–º–Ω–æ–≥–æ –æ —Ä—ã–Ω–∫–∞—Ö. –í—Å–µ–º–∏—Ä–Ω–æ –∏–∑–≤–µ—Å—Ç–Ω—ã–π —ç–∫–æ...


In [133]:
X = df_final.drop(['date_start', 'pnl', 'pnl_sign'], axis=1)
y = df_final['pnl_sign']

In [134]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=121)

In [135]:
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords

russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/buchkovv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [136]:
from scipy.sparse import coo_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer

X_train_transformed = []
X_val_transformed = []
for i in range(len(X_train.columns)):
    tf_idf = TfidfVectorizer(stop_words=russian_stopwords, ngram_range=(1,1))
    
    X_train_transformed.append(coo_matrix(tf_idf.fit_transform(X_train.iloc[:,i].astype(str))))
    X_val_transformed.append(coo_matrix(tf_idf.transform(X_val.iloc[:,i].astype(str))))

In [137]:
for x in X_train_transformed:
    print(x.shape)

(125, 163)
(125, 166)
(125, 20789)
(125, 5791)
(125, 16115)
(125, 2569)
(125, 24279)
(125, 18334)
(125, 2627)
(125, 925)
(125, 41341)


In [138]:
X_train_transformed = hstack(X_train_transformed)
X_val_transformed = hstack(X_val_transformed)
X_train_transformed

<125x133099 sparse matrix of type '<class 'numpy.float64'>'
	with 555498 stored elements in COOrdinate format>

In [139]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(multi_class='auto', solver='lbfgs', verbose=False)
log_reg.fit(X_train_transformed, y_train)

preds = log_reg.predict(X_val_transformed)
accuracy_score(preds, y_val)

0.53125