In [150]:
import pandas as pd, tensorflow as tf, numpy as np, spacy
from tensorflow import keras
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from scikeras.wrappers import KerasRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [36]:
# Read the raw headline table from the CSV in the working directory.
headlines_df = pd.read_csv('headlines.csv')

# Independent copy of the raw frame; `headlines` is rebound to the assembled
# feature table in a later cell, while `headlines_df` stays as loaded.
headlines = headlines_df.copy()

# Preview the first rows of the raw data.
headlines_df.head()

Unnamed: 0,date,fortnight,fortnight_day,section,title,diff
0,2010-01-03,0,5,business,China and the other Brics will rebuild a new w...,
1,2010-01-03,0,5,business,Five countries that crashed and burned in the ...,
2,2010-01-03,0,5,business,New Year's resolutions for bewildered business...,
3,2010-01-03,0,5,business,Amnesty for investors with offshore accounts e...,
4,2010-01-03,0,5,business,Credit collapse damages hopes of UK economic r...,


In [3]:
# Load the spaCy 'en_core_web_md' pipeline once; the embedding cell below calls
# `nlp(text).vector` on every headline to obtain its document vector.
nlp = spacy.load('en_core_web_md')

In [151]:
# Built once at definition time instead of once per token: reading the stop-word
# corpus and constructing a lemmatizer for every word dominated the runtime when
# this function is applied to tens of thousands of headlines.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def tokenize_stop(webTitle):
    """Normalise a headline into a space-joined string of lemmatized tokens.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``;
    tokens found in NLTK's English stop-word list are dropped and the
    remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    webTitle : str
        Raw headline text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string if every
        token was a stop word or the input had no tokens).
    """
    tokens = word_tokenize(webTitle.lower())
    return " ".join(
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS  # O(1) set lookup
    )

def _headline_vector(title):
    """Return the spaCy document vector of a headline after `tokenize_stop` cleaning."""
    cleaned_title = tokenize_stop(title)
    return nlp(cleaned_title).vector


# One embedding per headline, kept as an object Series aligned with headlines_df.
vectors = headlines_df['title'].apply(_headline_vector)

vectors

0        [0.2617054, 0.23225714, -1.7147286, 0.9521144,...
1        [-0.64733106, 0.94788337, -2.6637132, 0.536621...
2        [0.19291282, 2.211837, -2.4748442, -0.44703716...
3        [-1.8742359, -0.91912, -0.836394, -0.30892998,...
4        [1.0122558, -2.2971587, -3.4531572, -0.3152615...
                               ...                        
30497    [-0.54334146, -0.8787821, -3.6869075, -0.07091...
30498    [1.357052, 0.3505843, 1.3363856, -1.8190631, 2...
30499    [-0.5141576, -1.8938138, -9.037562, 1.7126626,...
30500    [-0.9433935, 0.9113064, -0.82893634, -2.138742...
30501    [0.11983752, 0.10852668, -0.40609387, -1.10829...
Name: title, Length: 30502, dtype: object

In [152]:
# Expand the Series of embedding arrays into one column per dimension.
# The index of `vectors` is carried over explicitly: pd.concat(axis=1) aligns
# rows by index label, so a fresh 0..n-1 index would silently misalign (and
# NaN-pad) rows whenever headlines_df does not have a default RangeIndex.
vectors_df = pd.DataFrame(vectors.tolist(), index=vectors.index)

# Replace the raw date/title columns with the embedding columns.
metadata_df = headlines_df.drop(columns=['date', 'title'])
headlines = pd.concat([metadata_df, vectors_df], axis=1)

# Move the target column 'diff' to the last position so that features can be
# selected positionally with iloc[:, :-1] in the split cell.
feature_columns = [column for column in headlines.columns if column != 'diff']
headlines = headlines[feature_columns + ['diff']]

headlines.tail()

Unnamed: 0,fortnight,fortnight_day,section,0,1,2,3,4,5,6,...,291,292,293,294,295,296,297,298,299,diff
30497,385,14,business,-0.543341,-0.878782,-3.686908,-0.070919,1.843863,0.207011,0.402912,...,-0.75382,-0.46541,0.775106,-1.945194,1.207746,1.174764,0.519885,-1.181702,0.696373,91.28
30498,385,14,business,1.357052,0.350584,1.336386,-1.819063,2.826956,-0.020071,3.358418,...,-1.944344,-0.236419,-3.276941,-0.950788,-0.744464,-1.858176,-2.471674,-4.371157,0.650712,91.28
30499,385,14,business,-0.514158,-1.893814,-9.037562,1.712663,1.272862,1.435257,1.72551,...,2.223394,-1.237203,0.419046,-0.330314,1.150378,-1.78758,-0.028114,-1.895388,2.859775,91.28
30500,385,14,technology,-0.943394,0.911306,-0.828936,-2.138742,2.364239,2.449101,1.555706,...,-1.996924,0.437416,-2.275802,-1.187877,0.304066,-0.135838,-1.374996,-3.22977,2.565259,91.28
30501,385,14,business,0.119838,0.108527,-0.406094,-1.108299,2.592848,-0.341429,2.929493,...,-3.030391,-0.774531,-2.850812,-2.103477,0.053737,-0.457459,-1.444245,-3.114886,2.82994,91.28


In [153]:
# Chronological hold-out: the first two thirds of the fortnights are used for
# training, the remainder for testing. The last fortnight label is read from
# the data rather than hard-coded, so the split follows the dataset if it grows.
last_fortnight = int(headlines['fortnight'].max())
split_date = round(last_fortnight * 2/3)

# Fortnight 0 is left out of the training set (the rows previewed for that
# fortnight have no 'diff' value — presumably there is no target for it).
in_train = (headlines['fortnight'] < split_date) & (headlines['fortnight'] > 0)
in_test = headlines['fortnight'] >= split_date

df_train, df_test = headlines[in_train], headlines[in_test]

# 'diff' is the last column, so everything before it is a feature.
X_train, y_train = df_train.iloc[:, :-1], df_train['diff']
X_test, y_test = df_test.iloc[:, :-1], df_test['diff']

# The embedding columns are labelled with integers; cast all labels to str so
# the feature frames have uniformly typed column names.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

X_train.head()

Unnamed: 0,fortnight,fortnight_day,section,0,1,2,3,4,5,6,...,290,291,292,293,294,295,296,297,298,299
109,1,1,business,-1.315233,0.066983,-1.649023,-2.175933,0.52942,-0.508857,0.584903,...,2.959233,0.075589,0.488222,-0.711328,0.860507,-0.236368,-0.981299,0.474892,-2.994893,0.866523
110,1,1,business,1.061334,2.117094,-3.36828,0.149794,0.63254,0.780528,2.25058,...,3.639478,-0.74362,2.769138,-1.11692,0.30254,0.986802,2.122494,-0.259362,-3.765394,1.432296
111,1,1,business,-1.420114,-0.75415,-1.063446,3.384243,2.086764,-0.27916,1.264407,...,-0.140313,0.269516,1.602908,-0.628514,-2.125975,2.527315,1.014667,0.285393,-1.601423,0.64515
112,1,1,business,-0.28774,-0.485578,-3.30436,1.663494,3.03292,1.36718,-1.812258,...,2.127064,-0.882252,-0.627308,0.731974,-3.786586,1.372602,0.658952,0.2009,-1.492724,1.544788
113,1,1,business,-0.091955,0.73813,-1.753083,-0.719305,1.165198,-0.400015,0.58748,...,-0.364538,1.63756,4.254225,0.741447,-1.410237,2.148532,-0.8393,-0.117195,-1.369232,0.378197


In [154]:
# Every numeric-dtype column is standardised; the categorical 'section' column
# is one-hot encoded.
colunas_numericas = X_train.select_dtypes(include='number').columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), colunas_numericas),
        # handle_unknown='ignore': a section that only appears in the test
        # period is encoded as all zeros instead of raising in transform().
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['section'])
    ]
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training rows only, then apply the fitted transform to the test
# rows so no test-set statistics leak into the scaler/encoder.
# NOTE: X_train / X_test are rebound from DataFrames to arrays here, so this
# cell cannot be re-run on its own — re-run the split cell first.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

X_train

array([[-1.42545029, -1.52237486, -0.73713244, ..., -0.22468799,
         1.        ,  0.        ],
       [-1.42545029, -1.52237486,  1.28905793, ...,  0.18499167,
         1.        ,  0.        ],
       [-1.42545029, -1.52237486, -0.8265509 , ..., -0.38498584,
         1.        ,  0.        ],
       ...,
       [ 1.87280378,  1.61448231,  1.95484679, ..., -0.24341516,
         1.        ,  0.        ],
       [ 1.87280378,  1.61448231, -0.81953702, ...,  0.89571587,
         1.        ,  0.        ],
       [ 1.87280378,  1.61448231,  1.41633638, ...,  1.79858386,
         1.        ,  0.        ]])