<a href="https://colab.research.google.com/github/vasilyryabtsev/futures-price-prediction/blob/dev/ds/twitter/notebooks/ml_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, auc, precision_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, AutoTokenizer, AutoModelForSequenceClassification
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from nltk.stem import PorterStemmer
import re

RANDOM_STATE = 42

In [10]:
df = pd.read_csv('https://github.com/vasilyryabtsev/futures-price-prediction/blob/dev/ds/twitter/data/processed.csv?raw=true')
df

Unnamed: 0,text,is_quote_status,has_card,1_day_after,is_in_reply_to,is_urls,year,month,day_of_week
0,"['meta', 'zuckerberg', 'said', 'would', 'twice...",0,0,0,0,0,2024,10,5
1,"['follow', 'market', 'news', 'includ', 'appl',...",0,0,1,1,1,2024,9,5
2,"['appl', 'aapl', 'reportedli', 'longer', 'invo...",0,0,1,0,0,2024,9,5
3,"['appl', 'aapl', 'longer', 'growth', 'engin', ...",0,0,1,0,0,2024,9,3
4,"['hottest', 'activ', 'bullish', 'bearish', 'op...",0,0,1,0,1,2024,9,0
...,...,...,...,...,...,...,...,...,...
8393,"['probabl', 'time', 'go', 'overweight', 'sbux'...",0,0,0,0,0,2014,1,1
8394,"['problem', 'brew', 'wall', 'street', 'coffe',...",0,0,1,0,1,2014,1,1
8395,"['get', 'hedgeyehwp', 'nrnpowerlist', 'dri', '...",0,0,0,0,0,2014,1,0
8396,"['seriou', 'question', 'need', 'white', 'peopl...",0,0,0,0,0,2014,1,5


In [11]:
df.sample(1).text.values

array(["['hiddenpivot', 'warn', 'past', 'week', 'undisclos', 'sec', 'investig', 'netflix', 'report', 'earn', 'close', 'tuesday', '20jan', 'nflx']"],
      dtype=object)

In [12]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df[['text']], df['1_day_after'], test_size=0.2, random_state=RANDOM_STATE)

In [19]:

curves = {}

def add_curve(model, name):
    """
    Построение ROC кривой.
    """
    fpr, tpr, thresholds = roc_curve(ytest, model.predict_proba(Xtest)[:, 1])
    roc_auc = auc(fpr, tpr)

    curves[name] = {'fpr': fpr,
                    'tpr': tpr,
                    'thresholds': thresholds,
                    'roc_auc': roc_auc}

def print_metrics(model):
    print(f'ROC-AUC train: {roc_auc_score(ytrain, model.predict_proba(Xtrain)[:, 1])}')
    print(f'ROC-AUC test: {roc_auc_score(ytest, model.predict_proba(Xtest)[:, 1])}')
    print('Classification report for test:')
    print(classification_report(ytest, model.predict(Xtest)))

## Текстовые признаки

### Bag Of Words

In [20]:
ct = ColumnTransformer([
    ("vectorizer", CountVectorizer(), 'text')
])

pl = Pipeline([
    ("prepocessing",ct),
    ("classifier", LogisticRegression(random_state=42))
])

pl.fit(Xtrain, ytrain)

print(f"Number of words in vocabulary: {pl.named_steps['prepocessing']['vectorizer'].get_feature_names_out().shape[0]}")

print_metrics(pl)

Number of words in vocabulary: 15401
ROC-AUC train: 0.9894876421015903
ROC-AUC test: 0.5637237215909091
Classification report for test:
              precision    recall  f1-score   support

           0       0.52      0.50      0.51       800
           1       0.56      0.57      0.57       880

    accuracy                           0.54      1680
   macro avg       0.54      0.54      0.54      1680
weighted avg       0.54      0.54      0.54      1680



In [26]:
ct = ColumnTransformer([
    ("vectorizer", CountVectorizer(), 'text')
])

pl = Pipeline([
    ("prepocessing",ct),
    ("pca", PCA(n_components=10)),
    ("classifier", LogisticRegression(max_iter=500, random_state=42))
])

pl.fit(Xtrain, ytrain)

print(f"Number of words in vocabulary: {pl.named_steps['prepocessing']['vectorizer'].get_feature_names_out().shape[0]}")

print_metrics(pl)

Number of words in vocabulary: 15401
ROC-AUC train: 0.5253882982700366
ROC-AUC test: 0.5397088068181818
Classification report for test:
              precision    recall  f1-score   support

           0       0.53      0.09      0.16       800
           1       0.53      0.93      0.67       880

    accuracy                           0.53      1680
   macro avg       0.53      0.51      0.42      1680
weighted avg       0.53      0.53      0.43      1680



In [32]:
param_grid = {
    "prepocessing__vectorizer__max_features": [50, 500, 1000, None],
    "prepocessing__vectorizer__ngram_range": [(1, 1), (1, 2)],
    "classifier__C": [0.1, 1, 10]
}

gs = GridSearchCV(pl, param_grid, cv=3, scoring="roc_auc", verbose=2, n_jobs=-1)

gs.fit(Xtrain, ytrain)

gs.best_estimator_, gs.best_score_

Fitting 3 folds for each of 24 candidates, totalling 72 fits


(Pipeline(steps=[('prepocessing',
                  ColumnTransformer(transformers=[('vectorizer',
                                                   CountVectorizer(max_features=50,
                                                                   ngram_range=(1,
                                                                                2)),
                                                   'text')])),
                 ('pca', PCA(n_components=10)),
                 ('classifier',
                  LogisticRegression(C=0.1, max_iter=500, random_state=42))]),
 0.5165843530995408)

In [33]:
bow_lr = gs.best_estimator_

add_curve(bow_lr, 'bow_lr')

print_metrics(bow_lr)

ROC-AUC train: 0.5298306529997543
ROC-AUC test: 0.5177791193181818
Classification report for test:
              precision    recall  f1-score   support

           0       0.52      0.09      0.16       800
           1       0.53      0.92      0.67       880

    accuracy                           0.53      1680
   macro avg       0.53      0.51      0.42      1680
weighted avg       0.53      0.53      0.43      1680



In [34]:
score = cross_val_score(bow_lr, Xtrain, ytrain, scoring='roc_auc', cv=5, n_jobs=-1)
print(score, score.mean())

[0.49449569 0.52664195 0.52082073 0.50452096 0.51769743] 0.5128353526077524


## Tf-idf

In [37]:
ct = ColumnTransformer([
    ("vectorizer", TfidfVectorizer(), 'text')
])

pl = Pipeline([
    ("prepocessing",ct),
    ("classifier", LogisticRegression(max_iter=500, random_state=42))
])

pl.fit(Xtrain, ytrain)

pl.named_steps['prepocessing']['vectorizer'].get_feature_names_out().shape

(15401,)

In [38]:
param_grid = {
    "prepocessing__vectorizer__max_features": [50, 500, 1000, None],
    "prepocessing__vectorizer__ngram_range": [(1, 1), (1, 2)],
    "prepocessing__vectorizer__use_idf": [True, False],
    "prepocessing__vectorizer__smooth_idf": [True, False],
    "classifier__C": [0.1, 1, 10],
}

gs = GridSearchCV(pl, param_grid, cv=5, scoring="roc_auc", verbose=2, n_jobs=-1)

gs.fit(Xtrain, ytrain)

gs.best_estimator_, gs.best_score_

Fitting 5 folds for each of 96 candidates, totalling 480 fits


(Pipeline(steps=[('prepocessing',
                  ColumnTransformer(transformers=[('vectorizer',
                                                   TfidfVectorizer(ngram_range=(1,
                                                                                2)),
                                                   'text')])),
                 ('classifier',
                  LogisticRegression(C=10, max_iter=500, random_state=42))]),
 0.5928314878908232)

In [39]:
tfidf_lr = gs.best_estimator_

add_curve(tfidf_lr, 'tfidf_lr')

print_metrics(tfidf_lr)

ROC-AUC train: 0.9983188576331651
ROC-AUC test: 0.5772904829545454
Classification report for test:
              precision    recall  f1-score   support

           0       0.55      0.50      0.52       800
           1       0.58      0.62      0.60       880

    accuracy                           0.56      1680
   macro avg       0.56      0.56      0.56      1680
weighted avg       0.56      0.56      0.56      1680



In [40]:
score = cross_val_score(tfidf_lr, Xtrain, ytrain, scoring='roc_auc', cv=5, n_jobs=-1)
print(score, np.std(score))

[0.57807107 0.60967913 0.60050862 0.60536056 0.57053806] 0.015586057125138256


### [finbert-tone](https://huggingface.co/yiyanghkust/finbert-tone?text=growth+is+strong+and+we+have+plenty+of+liquidity)

In [41]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
vectorizer = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Device set to use cpu


In [None]:
pos_neg_ans = list(filter(lambda x: x[0] != 'Neutral',
                     zip([res['label'] for res in vectorizer(df['text'].tolist())], df['1_day_after'])))

len(pos_neg_ans) / df['1_day_after'].shape[0] * 100

In [None]:
pred_true_ans = list(map(lambda x: (int(x[0] == 'Positive'), x[1]), pos_neg_ans))

print(classification_report([i[1] for i in pred_true_ans], [i[0] for i in pred_true_ans]))

### [finbert ProcusAI](https://huggingface.co/ProsusAI/finbert)

In [None]:
nlp = pipeline("text-classification", model="ProsusAI/finbert")

In [None]:
pos_neg_ans = list(filter(lambda x: x[0] != 'neutral',
                     zip([res['label'] for res in nlp(df['text'].tolist())], df['1_day_after'])))

len(pos_neg_ans) / df['1_day_after'].shape[0] * 100