In [1]:
import re

import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, auc, precision_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, AutoTokenizer, AutoModelForSequenceClassification
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from nltk.stem import PorterStemmer

# Shared seed, passed as `random_state` (e.g. to train_test_split) so results are reproducible.
RANDOM_STATE = 42

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Location of the processed tweet dataset; `?raw=true` makes GitHub serve the CSV itself.
DATA_URL = 'https://github.com/vasilyryabtsev/futures-price-prediction/blob/dev/ds/twitter/data/processed.csv?raw=true'

# Load the whole file into memory and display it as the cell output.
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,text,is_quote_status,has_card,1_day_after,is_in_reply_to,is_urls,year,month,day_of_week
0,"['meta', 'zuckerberg', 'said', 'would', 'twice...",0,0,0,0,0,2024,10,5
1,"['follow', 'market', 'news', 'includ', 'appl',...",0,0,1,1,1,2024,9,5
2,"['appl', 'aapl', 'reportedli', 'longer', 'invo...",0,0,1,0,0,2024,9,5
3,"['appl', 'aapl', 'longer', 'growth', 'engin', ...",0,0,1,0,0,2024,9,3
4,"['hottest', 'activ', 'bullish', 'bearish', 'op...",0,0,1,0,1,2024,9,0
...,...,...,...,...,...,...,...,...,...
8393,"['probabl', 'time', 'go', 'overweight', 'sbux'...",0,0,0,0,0,2014,1,1
8394,"['problem', 'brew', 'wall', 'street', 'coffe',...",0,0,1,0,1,2014,1,1
8395,"['get', 'hedgeyehwp', 'nrnpowerlist', 'dri', '...",0,0,0,0,0,2014,1,0
8396,"['seriou', 'question', 'need', 'white', 'peopl...",0,0,0,0,0,2014,1,5


In [10]:
# Peek at one raw `text` value. The draw is seeded so the displayed row is the
# same after every Restart & Run All instead of changing on each execution.
df.sample(1, random_state=RANDOM_STATE)["text"].values

array(["['earn', 'tesla', 'q2', 'loss', '133', 'vs', '182', 'est', 'q2', 'rev', '279b', 'vs', '251b', 'est', 'tsla', 'httpstcokt3k21uy1a', 'httpstcobbdvocwnk8']"],
      dtype=object)

In [11]:
# Seeded 80/20 hold-out split: the `text` column (kept as a one-column DataFrame)
# is the only feature, `1_day_after` is the target.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df[['text']],
    df['1_day_after'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [12]:
# Registry of ROC curves keyed by model name; filled in by add_curve().
curves = {}


def add_curve(model, name):
    """
    Compute the ROC curve of `model` on the test split and store it in `curves`.

    Scores are taken from column 1 of ``model.predict_proba(Xtest)`` and compared
    against ``ytest``. The result is saved under ``curves[name]`` as a dict with
    the keys 'fpr', 'tpr', 'thresholds' and 'roc_auc'; an existing entry with the
    same name is overwritten. Returns nothing.
    """
    scores = model.predict_proba(Xtest)[:, 1]
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(ytest, scores)

    curves[name] = dict(
        fpr=false_pos_rate,
        tpr=true_pos_rate,
        thresholds=cutoffs,
        roc_auc=auc(false_pos_rate, true_pos_rate),
    )

def print_metrics(model):
    """
    Print ROC-AUC on the train and test splits plus the test classification report.

    ROC-AUC is computed from column 1 of ``model.predict_proba``; the report uses
    ``model.predict(Xtest)``. Reads the notebook-level Xtrain/ytrain/Xtest/ytest.
    Returns nothing.
    """
    train_auc = roc_auc_score(ytrain, model.predict_proba(Xtrain)[:, 1])
    print(f'ROC-AUC train: {train_auc}')

    test_auc = roc_auc_score(ytest, model.predict_proba(Xtest)[:, 1])
    print(f'ROC-AUC test: {test_auc}')

    print('Classification report for test:')
    test_predictions = model.predict(Xtest)
    print(classification_report(ytest, test_predictions))

# Текстовые признаки

In [18]:
# Baseline: TF-IDF over the `text` column followed by logistic regression.
ct = ColumnTransformer([
    ("vectorizer", TfidfVectorizer(), 'text')
])

pl = Pipeline([
    ("prepocessing", ct),
    # Use the notebook-wide seed rather than a repeated literal.
    ("classifier", LogisticRegression(random_state=RANDOM_STATE))
])

pl.fit(Xtrain, ytrain)

# Fetch the fitted vectorizer through `named_transformers_`, the documented accessor,
# instead of indexing the ColumnTransformer directly (only supported by recent
# scikit-learn releases).
fitted_vectorizer = pl.named_steps['prepocessing'].named_transformers_['vectorizer']
print(f"Number of words in vocabulary: {fitted_vectorizer.get_feature_names_out().shape[0]}")

print_metrics(pl)

Number of words in vocabulary: 15401
ROC-AUC train: 0.9287792436439245
ROC-AUC test: 0.5486129261363637
Classification report for test:
              precision    recall  f1-score   support

           0       0.52      0.43      0.47       800
           1       0.55      0.64      0.59       880

    accuracy                           0.54      1680
   macro avg       0.53      0.53      0.53      1680
weighted avg       0.53      0.54      0.53      1680



In [21]:
# TF-IDF -> 500-component linear projection -> logistic regression.
ct = ColumnTransformer([
    ("vectorizer", TfidfVectorizer(), 'text')
])

pl = Pipeline([
    ("prepocessing", ct),
    # TfidfVectorizer produces a scipy sparse matrix. TruncatedSVD does not centre
    # the data and therefore works on sparse input directly, whereas PCA is not a
    # dependable choice here (older scikit-learn releases reject sparse input).
    # The step keeps its original name "pca" so `named_steps['pca']` lookups still work.
    # Seeded because the default solver is randomized.
    ("pca", TruncatedSVD(n_components=500, random_state=RANDOM_STATE)),
    # Use the notebook-wide seed rather than a repeated literal.
    ("classifier", LogisticRegression(random_state=RANDOM_STATE))
])

pl.fit(Xtrain, ytrain)

# Fetch the fitted vectorizer through `named_transformers_`, the documented accessor,
# instead of indexing the ColumnTransformer directly (only supported by recent
# scikit-learn releases).
fitted_vectorizer = pl.named_steps['prepocessing'].named_transformers_['vectorizer']
print(f"Number of words in vocabulary: {fitted_vectorizer.get_feature_names_out().shape[0]}")

print_metrics(pl)

: 