# How Does the News Impact Stock Prices?
___

# Loading libraries:

In [None]:
import eikon as ek
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from textblob import TextBlob
import datetime
from datetime import time
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the Eikon application id from the EIKON_APP_ID environment variable so the real
# key never has to be stored in the notebook. The masked placeholder is kept as the
# fallback, so the call is unchanged when the variable is not set.
import os
ek.set_app_id(os.environ.get('EIKON_APP_ID', '**********'))

# I keep headlines and news IDs from last year

I collect the news headlines from the last year. The API limits each request to 100 headlines, but by looping over successive dates I can concatenate more:

In [None]:
# Analysis period. Every day in the period gets one query window running from
# 09:00 (the "open" timestamp) to 16:55 (the "close" timestamp) of that same day.
start_open, end_open = '2017-12-07T09:00:00', '2018-06-12T09:00:00'
start_close, end_close = '2017-12-07T16:55:00', '2018-06-12T16:55:00'

DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"

# Parse each boundary from its own string (end_open used to be parsed from end_close).
start_open = datetime.datetime.strptime(start_open, DATE_FORMAT)
end_open = datetime.datetime.strptime(end_open, DATE_FORMAT)
start_close = datetime.datetime.strptime(start_close, DATE_FORMAT)
end_close = datetime.datetime.strptime(end_close, DATE_FORMAT)

dif = int((end_open-start_open).total_seconds()/(3600*24)) ## time difference in days

# One (from, to) pair per day: window x starts at start_open + x days and ends at
# start_close + x days, so both lists advance together and the last window ends at end_close.
start_date = [(start_open + datetime.timedelta(days=x)).strftime(DATE_FORMAT) for x in range(dif+1)]
end_date = [(start_close + datetime.timedelta(days=x)).strftime(DATE_FORMAT) for x in range(dif+1)]

In [None]:
# Request the headlines window by window and concatenate them once at the end.
# The loop variables have their own names so the start_date / end_date lists are not
# overwritten (the cell stays re-runnable), and the frames are collected in a list
# because concatenating inside the loop copies the accumulated data on every pass.
frames = []
for date_from, date_to in zip(start_date, end_date):
    aux = ek.get_news_headlines('R:IBM.N AND Language:LEN', date_from=date_from, date_to=date_to, count=20)
    frames.append(aux)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no windows
df = pd.concat(frames) if frames else pd.DataFrame()
df.head()

In [None]:
# Number of rows (headlines) gathered so far
elements = df.shape[0]
print(u'I have a total of %s news to analyze' % elements)

# Sentiment analysis of the news

I create 3 columns to store variables that I will generate later:

In [None]:
# Add the three result columns, initially filled with NaN
for sentiment_column in ('Polarity', 'Subjectivity', 'Categorization'):
    df[sentiment_column] = np.nan

We have a dataframe with the news headlines and the IDs that Thomson Reuters has assigned to those news items. With that ID we'll be able to access the news itself and analyze it:

In [None]:
# The category column receives text labels; it is created as float NaN, so switch it to
# an object column first so that assigning strings does not trigger a dtype upcast.
df['Categorization'] = df['Categorization'].astype(object)

# Positional locations of the result columns: each value is then written with a single
# df.iloc[row, col] call instead of chained indexing (df[col].iloc[row] = ...), which
# pandas does not guarantee to write back into df.
polarity_col = df.columns.get_loc('Polarity')
subjectivity_col = df.columns.get_loc('Subjectivity')
categorization_col = df.columns.get_loc('Categorization')

for index, noticiaID in enumerate(df['storyId'].values): # Iterate over all rows of the dataframe
    try:
        text = ek.get_news_story(noticiaID) # Text of the news story
        if not text:
            continue # Nothing to analyse; the row keeps its NaN values
        soup = BeautifulSoup(text, "lxml") # Parse the HTML article
        sents = TextBlob(soup.get_text()) # Analyse the plain text of the article
        polarity = sents.sentiment.polarity
        df.iloc[index, polarity_col] = polarity # Polarity: -1 -> negative, 1 -> positive
        df.iloc[index, subjectivity_col] = sents.sentiment.subjectivity # Subjectivity: 0 -> objective, 1 -> subjective
        if polarity >= 0.05: # Bucket the polarity into three categories
            score = 'Positive'
        elif -.05 < polarity < 0.05:
            score = 'Neutral'
        else:
            score = 'Negative'
        df.iloc[index, categorization_col] = score
        if index % 20 == 0: # Progress message every 20 stories
            print(u"I'm going for the %s" % index)
    except Exception as error:
        # Stay best-effort (one failing story must not stop the loop) but report the
        # failure instead of hiding it; the row keeps its NaN values.
        print(u'Skipping story %s: %s' % (noticiaID, error))
df.head(3)

# I analyze the impact of the news on the price ceteris paribus
___

I get a minute-by-minute time series of prices, starting from the date of the earliest news item. Note that the API returns a maximum of 50 thousand minutes, so I have approximately 6 months of data (counting back from the present towards the start date).

In [None]:
# Dates (formatted YYYY/MM/DD) of the earliest and latest news item.
# Note: `fin` is computed here but is not passed to the request below.
midnight = dict(hour=0, minute=0, second=0, microsecond=0)
created = df['versionCreated']
inicio = created.min().replace(**midnight).strftime('%Y/%m/%d')
fin = created.max().replace(**midnight).strftime('%Y/%m/%d')

# Minute-interval series for IBM.N starting at the first news date
minute = ek.get_timeseries(
    ["IBM.N"],
    start_date=inicio,
    interval="minute",
)
minute.tail()

In [None]:
# First and last timestamp present in the minute series
price_index = minute.index
min(price_index), max(price_index)

I define columns to store the price impact after several time intervals:

In [None]:
# One NaN-initialised column per time horizon
for horizon_column in ('2Minutes', '5minutes', '10minutes', '30minutes', '1hour'):
    df[horizon_column] = np.nan
df.head(3)

I compute the percentage variation over each time interval as:
$$\Delta_{t\rightarrow t+x} = \Bigl ( \frac{Value_{t+x}}{Value_t} -1\Bigr) \times 100$$

In [None]:
# For every news item, compute the percentage price change between the minute the item
# was created and 2, 5, 10, 30 and 60 minutes later.
# The price frame is `minute`; the previous code referenced an undefined `minuto`, and
# the resulting NameError was swallowed, leaving every impact column as NaN.

# (column to fill, minutes after publication)
horizons = [('2Minutes', 2), ('5minutes', 5), ('10minutes', 10), ('30minutes', 30), ('1hour', 60)]

for index in range(len(df.index)):
    # Positional access: the frame is not guaranteed to have a 0..n-1 integer index
    sTime = df['versionCreated'].iloc[index]
    sTime = sTime.replace(second=0, microsecond=0) # Drop seconds and microseconds to work at minute level
    try:
        # Positional columns 2 (reference price) and 3 (later price) of the price frame --
        # presumably OPEN and CLOSE; TODO confirm against the columns returned by get_timeseries.
        t0 = minute.iloc[minute.index.get_loc(sTime), 2] # Price when the news item was created
        for column, minutes_after in horizons:
            later = sTime + datetime.timedelta(minutes=minutes_after)
            price_later = minute.iloc[minute.index.get_loc(later), 3]
            # Single .iloc write instead of chained df[col][row] assignment
            df.iloc[index, df.columns.get_loc(column)] = (price_later / t0 - 1) * 100
    except (KeyError, TypeError):
        # No price bar for that minute (or the timestamps are not comparable with the
        # price index): leave the remaining horizons of this row as NaN and move on.
        continue
df.head(3)

In [None]:
# Count of missing values in each column
df.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column
df = df.dropna(axis=0, how='any')
df.head(3)

In [None]:
# Average of every numeric column per sentiment category.
# numeric_only=True restricts the mean to numeric columns explicitly; without it, recent
# pandas versions raise on the text columns instead of silently dropping them.
grouped = df.groupby(['Categorization']).mean(numeric_only=True)
grouped

On average, negative news is followed by a 0.14 percent drop in IBM's share price over the next 10 minutes, while good news is followed by a drop of 0.011 percent. This sample only covers IBM shares, so the result is biased towards them, but what we can deduce is that in the very short term the news has an impact on the share price.

# Visualizations

In [None]:
# Keep only the first row for each index value that appears more than once
repeated_index = df.index.duplicated(keep='first')
df = df[~repeated_index]

In [None]:
def _daily_count(category):
    """Count the news of one sentiment category per calendar day.

    Returns a frame with the day (formatted YYYY-MM-DD) in 'versionCreated' and the
    number of news items in 'Categorization', sorted by day.
    """
    day = df['versionCreated'].dt.strftime('%Y-%m-%d')
    selected = df.loc[df['Categorization'] == category]
    per_day = selected.groupby(day)['Categorization'].count()
    return per_day.sort_index().reset_index()


positives = _daily_count('Positive')
negatives = _daily_count('Negative')
neutrals = _daily_count('Neutral')

In [None]:
# Daily average of the OPEN price, used as the share-price line in the chart.
# Only OPEN is aggregated: including the datetime 'Date' column in the mean made the
# result depend on how the installed pandas version treats non-numeric columns, and a
# surviving 'Date' column collides with the group key on reset_index().
minute_representation = minute.reset_index()
trading_day = minute_representation['Date'].dt.strftime('%Y-%m-%d')
minute_representation = minute_representation.groupby(trading_day)['OPEN'].mean().reset_index()

In [None]:
def _news_trace(counts, color, name):
    """Build a lines+markers trace of daily news counts on the secondary y-axis.

    counts: frame with the day in 'versionCreated' and the count in 'Categorization'.
    color:  line colour of the trace.
    name:   legend label of the trace.
    """
    return go.Scatter(
        x = counts['versionCreated'],
        y = counts['Categorization'],
        line = dict(color = color),
        mode = 'lines+markers',
        name = name,
        yaxis='y2'
    )


# Create traces: one per sentiment category (right-hand axis) plus the share price
trace0 = _news_trace(positives, 'green', 'Positive news')
trace1 = _news_trace(negatives, 'red', 'Negative news')
trace2 = _news_trace(neutrals, 'orange', 'Neutral news')
trace3 = go.Scatter(
    x = minute_representation['Date'],
    y = minute_representation['OPEN'],
    # Plain dict, like the other traces, instead of the deprecated go.Line helper
    line = dict(shape='hv', color='black'),
    mode = 'lines',
    fill='tonexty',
    name = 'Share price'
)
datos = [trace0, trace1, trace2, trace3]

# Share price on the left axis, number of news on an overlaid right axis
layout = dict(title = u'IBM News Sentiment Analysis',
                xaxis = dict(title = u'Date'),
                yaxis = dict(title = u'Share price'),
                yaxis2=dict(
                    title=u'Number of news',
                    overlaying='y',
                    side='right'
                )
              )

fig = dict(data=datos, layout=layout)
iplot(fig, filename='analysis-news')

In [None]:
# Write the analysed news table to a semicolon-separated UTF-8 file, without the index.
# NOTE(review): the target is the relative '.data' directory -- confirm it is not meant to be './data'.
output_path = '.data/analysis_news_file.csv'
df.to_csv(
    output_path,
    sep=';',
    encoding='utf-8',
    index=False,
)