In [2]:
%reload_ext autoreload
%autoreload 2

In [91]:
import ast
import numpy as np
import pandas as pd
from pathlib import Path
from nlp_dauphine.preprocess_texts import (
    load_data,
    categorical_other,
)
from nlp_dauphine.embeddings import (
    vocabulary,
    print_neighbors,
    sentence_representations,
    co_occurence_matrix
)
from nlp_dauphine.preprocess_time_series import compute_rolling
from nlp_dauphine.utils import set_working_dir, cosine, euclidean
from category_encoders import WOEEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

In [4]:
set_working_dir()

Current working dir: c:/Users/Hugo/Documents/Travail/A5/Dauphine_challenge/nlp_dauphine/notebooks
New working dir: c:/Users/Hugo/Documents/Travail/A5/Dauphine_challenge/nlp_dauphine


# Load Data

In [5]:
path = Path("data")
files_path = {
    "ecb": Path(path,"ecb_data_translate_cleaned.csv"),
    "fed": Path(path,"fed_data_translate_cleaned.csv"),
    "train_series": Path(path,"train_series_clean_no_0.csv"),
}

In [6]:
# LOAD TEXT DATA
df_ecb, df_fed, df_train_series = load_data(files_path)
print(df_ecb.shape, df_fed.shape, df_train_series.shape)

(1772, 6) (739, 6) (8240, 20)


In [None]:
df_train_series.list_speakers_ecb = df_train_series.list_speakers_ecb.apply(ast.literal_eval)
df_train_series.list_speakers_fed = df_train_series.list_speakers_fed.apply(ast.literal_eval)
df_train_series.id_ecb = df_train_series.id_ecb.apply(ast.literal_eval)
df_train_series.id_fed = df_train_series.id_fed.apply(ast.literal_eval)
df_train_series.list_languages_ecb = df_train_series.list_languages_ecb.apply(ast.literal_eval)

# Data Modelisation

## Train Test Split

In [96]:
# Train test split 
X=df_train_series.loc[:,df_train_series.columns!="Index + 1"]
y=np.sign(df_train_series[["Index + 1"]])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2, random_state=42)
print(X_train.shape,y_train.shape)

(6592, 19) (6592, 1)


# Categorical Encoding 

For the speakers WOEEncoder rather than One-hot encoding

In [97]:
col_speakers = ["speaker_"+str(i+1) for i in range(4)]

X_train = pd.concat([X_train, X_train['list_speakers_ecb'].apply(pd.Series)], axis=1)
X_train.rename({i:col for i, col in enumerate(col_speakers)},axis=1,inplace=True)

X_test = pd.concat([X_test, X_test['list_speakers_ecb'].apply(pd.Series)], axis=1)
X_test.rename({i:col for i, col in enumerate(col_speakers)},axis=1,inplace=True)

enc = WOEEncoder(cols=col_speakers)
X_train = enc.fit_transform(X_train, y_train)
X_test = enc.transform(X_test)

X_train.fillna(0,inplace=True)
X_test.fillna(0,inplace=True)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


for the index and the languages, one hot encoding 

In [98]:
X_train = pd.get_dummies(X_train, prefix ="",columns=["Index Name"], drop_first=False)
X_test = pd.get_dummies(X_test, prefix ="",columns=["Index Name"], drop_first=False)

In [99]:
df_lang_train = (
    pd.get_dummies(X_train.list_languages_ecb.apply(pd.Series).stack(), prefix="ecb")
    .groupby(level=0)
    .sum()
)
X_train = pd.concat([X_train, df_lang_train], axis=1)
df_lang_test = (
    pd.get_dummies(X_test.list_languages_ecb.apply(pd.Series).stack(), prefix="ecb")
    .groupby(level=0)
    .sum()
)
X_test = pd.concat([X_test, df_lang_test], axis=1)


In [100]:
cols_to_drop = [
    "id_ecb",
    "id_fed",
    "id_series",
    "list_speakers_fed",
    "list_speakers_ecb",
    "list_languages_ecb",
]
X_test.drop(cols_to_drop, axis=1, inplace=True)
X_train.drop(cols_to_drop, axis=1, inplace=True)


In [101]:
X_test.columns

Index(['Index - 9', 'Index - 8', 'Index - 7', 'Index - 6', 'Index - 5',
       'Index - 4', 'Index - 3', 'Index - 2', 'Index - 1', 'Index - 0',
       'text_concat_ecb', 'text_concat_fed', 'speaker_1', 'speaker_2',
       'speaker_3', 'speaker_4', '_CVIX Index', '_EURUSD Curncy',
       '_EURUSDV1M Curncy', '_MOVE Index', '_SPX Index', '_SRVIX Index',
       '_SX5E Index', '_V2X Index', '_VIX Index', 'ecb_de', 'ecb_en', 'ecb_es',
       'ecb_fr', 'ecb_it'],
      dtype='object')

# Text Embeddings

## Word Embeddings


### 1st Method: Vocabulary -> Co-Occurence Matrix -> SVD 

#### Vocabulary 

In [102]:
word2id_7500,id2word_7500 = vocabulary(X_train.text_concat_ecb,7500)

#### Co-occurence Matrix

In [103]:
M5dist_ecb = co_occurence_matrix(X_train.text_concat_ecb, word2id_7500, window=5, distance_weighting=True)
M5dist_fed = co_occurence_matrix(X_train.text_concat_fed, word2id_7500, window=5, distance_weighting=True)

co-occurence matrix: 100%|██████████| 6592/6592 [02:05<00:00, 52.62it/s]
co-occurence matrix: 100%|██████████| 6592/6592 [02:41<00:00, 40.81it/s]


#### Scaling

In [104]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
M5dist_ecb = scaler.fit_transform(M5dist_ecb)
M5dist_fed = scaler.fit_transform(M5dist_fed)

#### SVD

In [105]:
svd = TruncatedSVD(n_components=200)
SVDEmbeddings_ecb = svd.fit_transform(M5dist_ecb)
SVDEmbeddings_fed = svd.fit_transform(M5dist_fed)
print(SVDEmbeddings_ecb.shape, SVDEmbeddings_fed.shape)

(7501, 200) (7501, 200)


In [106]:
svd.explained_variance_ratio_.sum()

0.6076067283772807

#### Analysis

In [107]:
print_neighbors(cosine, word2id_7500, SVDEmbeddings_ecb, 'financial')
print_neighbors(cosine, word2id_7500, SVDEmbeddings_fed, 'financial')

Plus proches voisins de financial selon la distance 'cosine': 
[['turmoil', 'instability', 'turbulence', 'contagion', 'market', 'vulnerability', 'assess', 'various', 'crisis']]
Plus proches voisins de financial selon la distance 'cosine': 
[['pose', 'crisis', 'function', 'system', 'strengthen', 'severe', 'intermediary', 'market', 'UNK']]


In [108]:
print_neighbors(euclidean, word2id_7500, SVDEmbeddings_ecb, 'financial')
print_neighbors(euclidean, word2id_7500, SVDEmbeddings_fed, 'financial')

Plus proches voisins de financial selon la distance 'euclidean': 
[['system', 'stability', 'crisis', 'sector', 'global', 'institution', 'also', 'market', 'economy']]
Plus proches voisins de financial selon la distance 'euclidean': 
[['crisis', 'system', 'stability', 'institution', 'firm', 'large', 'condition', 'global', 'risk']]


## Document Representation

In [109]:
txt_ecb = X_train.text_concat_ecb
txt_fed = X_train.text_concat_fed
txt_ecb_val = X_test.text_concat_ecb
txt_fed_val = X_test.text_concat_fed

In [121]:
train_rep_ecb_svd = sentence_representations(txt_ecb, word2id_7500, SVDEmbeddings_ecb, np_func=np.mean)
train_rep_fed_svd = sentence_representations(txt_fed, word2id_7500, SVDEmbeddings_fed, np_func=np.mean)

100%|██████████| 6592/6592 [00:36<00:00, 178.55it/s]
100%|██████████| 6592/6592 [00:36<00:00, 181.82it/s]


In [122]:
val_rep_ecb_svd = sentence_representations(txt_ecb_val, word2id_7500, SVDEmbeddings_ecb, np_func=np.mean)
val_rep_fed_svd = sentence_representations(txt_fed_val, word2id_7500, SVDEmbeddings_fed, np_func=np.mean)

100%|██████████| 1648/1648 [00:06<00:00, 238.66it/s]
100%|██████████| 1648/1648 [00:08<00:00, 192.71it/s]


# Time Series Feature Engineering 

In [112]:
operations = ["std", "mean", "quantile_2", "quantile_8", "sum"]
nb_quantiles = [0.8, 0.2]
time_windows = [2, 5, 7]
params = []
for op in operations:
    for tw in time_windows:
        if "quantile" in op:
            for q in nb_quantiles:
                params.append(
                    {
                        "operation": op,
                        "time_window": tw,
                        "nb_quantile": q,
                    }
                )
        else:
            params.append(
                {
                    "operation": op,
                    "time_window": tw,
                }
            )

In [113]:
X_train_series = X_train.iloc[:,:10]
X_train_series = compute_rolling(X_train_series, params)

In [114]:
X_test_series = X_test.iloc[:,:10]
X_test_series = compute_rolling(X_test_series, params)

# Full Model 

In [123]:
X_train_concat = pd.concat(
    [
        X_train.iloc[:, 12:].reset_index(drop=True),# Speaker, Indices, languages
        X_train_series.reset_index(drop=True),# Time series FT
        pd.DataFrame(train_rep_ecb_svd), # Embeddings ecb
        pd.DataFrame(train_rep_fed_svd), # Embeddings fed
    ],
    axis=1,
    ignore_index=True,
)
X_train_concat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,433,434,435,436,437,438,439,440,441,442
0,0.666153,-0.495941,-0.04413,0.046529,1,0,0,0,0,0,...,-0.312653,0.276224,-0.203955,0.334315,-0.393745,-0.087759,0.037425,-0.070001,-0.310489,0.039887
1,0.000000,-0.495941,-0.04413,0.046529,0,0,0,0,0,1,...,-0.163149,-0.418829,-0.293803,0.257392,0.258844,0.269726,0.003460,-0.319064,-0.164829,0.007226
2,0.000000,-0.495941,-0.04413,0.046529,0,0,0,0,0,0,...,-0.466952,-0.413560,-0.198511,0.490700,-0.056497,-0.597316,-0.092702,-0.187958,-0.301901,0.177358
3,0.498468,-0.495941,-0.04413,0.046529,0,0,0,0,0,0,...,-0.454291,-0.109789,-0.158905,0.108117,-0.248746,0.249720,0.301472,-0.131956,-0.377513,0.207526
4,-0.099496,-0.495941,-0.04413,0.046529,0,0,0,0,0,0,...,-0.688525,-1.313616,0.649612,0.090788,-0.113689,0.006412,0.362838,0.176866,-0.303969,-0.413351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6587,0.763221,0.155216,-0.04413,0.046529,0,0,0,0,0,1,...,0.137478,0.195205,0.100220,0.060263,0.540283,0.143059,-0.101112,-0.185647,0.351528,0.540297
6588,0.413505,-0.495941,-0.04413,0.046529,0,0,0,0,1,0,...,-0.190972,-0.324409,-0.239854,0.096372,-0.069942,0.047084,0.377393,-0.392810,0.092898,0.173991
6589,0.551298,-0.495941,-0.04413,0.046529,0,0,0,0,0,0,...,-0.159309,-0.035907,-0.026614,0.021840,-0.291797,-0.043380,0.047435,0.181565,0.067074,-0.160170
6590,0.551298,-0.495941,-0.04413,0.046529,0,0,0,1,0,0,...,0.362621,-0.231037,-0.111379,0.037460,-0.132487,0.077561,-0.035158,0.264266,-0.140541,-0.277351


In [124]:
X_test_concat = pd.concat(
    [
        X_test.iloc[:, 12:].reset_index(drop=True),
        X_test_series.reset_index(drop=True),
        pd.DataFrame(val_rep_ecb_svd),
        pd.DataFrame(val_rep_fed_svd),
    ],
    axis=1,
    ignore_index=True,
)
X_test_concat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,433,434,435,436,437,438,439,440,441,442
0,0.000000,-0.495941,-0.04413,0.046529,0,1,0,0,0,0,...,-0.105280,0.933763,0.266603,0.343758,-0.150544,0.576630,0.799150,0.443434,0.583290,-0.049560
1,0.413505,-0.495941,-0.04413,0.046529,0,0,1,0,0,0,...,0.042194,0.140775,0.208525,0.217843,0.208869,0.309854,0.020556,0.237851,-0.116545,0.003218
2,0.000000,-0.495941,-0.04413,0.046529,0,0,0,0,0,0,...,0.029928,0.052541,-0.104819,0.444066,-0.249767,0.472856,0.142212,-0.365125,-0.461031,-0.013606
3,0.551298,-0.495941,-0.04413,0.046529,0,0,0,0,1,0,...,-0.006881,0.200419,0.172042,-0.203065,-0.162200,0.102636,-0.147460,0.086100,-0.202693,-0.122235
4,0.666153,-0.495941,-0.04413,0.046529,0,0,0,0,0,1,...,0.662184,-0.391552,0.584898,0.150066,-0.149922,0.227169,-0.126602,-0.126941,0.186169,0.203481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1643,0.551298,-0.495941,-0.04413,0.046529,0,0,0,0,0,0,...,-0.006881,0.200419,0.172042,-0.203065,-0.162200,0.102636,-0.147460,0.086100,-0.202693,-0.122235
1644,0.763221,0.983183,-0.04413,0.046529,0,0,0,0,0,0,...,0.259121,0.094952,-0.281573,0.494271,-0.502008,0.336785,-0.315178,-0.166258,-0.102352,0.223391
1645,0.000000,-0.495941,-0.04413,0.046529,0,0,0,0,0,1,...,0.108904,0.390455,0.264830,-0.043537,0.187119,0.270618,0.441176,0.192290,0.079865,0.043672
1646,0.666153,-0.495941,-0.04413,0.046529,0,0,0,0,1,0,...,0.662184,-0.391552,0.584898,0.150066,-0.149922,0.227169,-0.126602,-0.126941,0.186169,0.203481


In [125]:
X_train_concat.replace([np.inf, -np.inf], 0, inplace=True)
X_test_concat.replace([np.inf, -np.inf], 0, inplace=True)

## Logistic Regression

In [126]:
# Fit the classifier on the transformed training data and test it on the transformed validation data
clf = LogisticRegression(random_state=42,max_iter=20_000).fit(X_train_concat, np.array(y_train).ravel())
clf.score(X_train_concat, y_train)

0.6134708737864077

1st Method

In [127]:
clf.score(X_test_concat,y_test)

0.5242718446601942

## HistGradientBoosting

In [157]:
clf_boost = HistGradientBoostingClassifier(random_state=42).fit(X_train_concat, np.array(y_train).ravel())
clf_boost.score(X_train_concat, y_train)

0.9347694174757282

In [158]:
clf_boost.score(X_test_concat,y_test)

0.558252427184466

In [156]:
mask_categorical = [True if (x <=17) & (x>=4) else False for x in range(X_train_concat.shape[1])]

In [160]:
clf_boost2 = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_iter=200,
    categorical_features=mask_categorical,
    random_state=42,
).fit(X_train_concat, np.array(y_train).ravel())
clf_boost2.score(X_train_concat, y_train)


0.990746359223301

In [161]:
clf_boost2.score(X_test_concat,y_test)

0.5782766990291263