# Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import yfinance

# Reading the Dataset
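- The daily price history of the four market indices is downloaded with **yfinance** and stored in **raw_data**.
- The closing price of each index is then copied into a dataframe with the name **data**.
- The rows are indexed by **date**, so no separate conversion of a date column from **str** to **datetime** is needed.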

In [2]:
# Download the daily price history of the four indices in a single request.
# group_by='ticker' puts the ticker on the outer column level, which is what
# the later `raw_data['^GSPC'].Close` style of lookup relies on.
raw_data = yfinance.download(
    tickers="^GSPC ^FTSE ^N225 ^GDAXI",
    start="1994-01-07",
    end="2019-09-01",
    interval="1d",
    group_by="ticker",
    auto_adjust=True,
    threads=True,  # was misspelled as `treads`, so the option was never actually passed
)

[*********************100%***********************]  4 of 4 completed


In [3]:
# Work on a copy so the downloaded frame itself is never modified.
df_comp = raw_data.copy()

# Short column name -> ticker whose closing price it should hold.
close_columns = {
    'spx': '^GSPC',
    'dax': '^GDAXI',
    'ftse': '^FTSE',
    'nikkei': '^N225',
}
for column, ticker in close_columns.items():
    df_comp[column] = df_comp[ticker]['Close']

# Keep only the four closing-price columns for the rest of the analysis.
data = df_comp[list(close_columns)]

The summary statistics for the numeric columns of the dataset are given below.

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of each closing-price column.
data.describe()

Unnamed: 0,spx,dax,ftse,nikkei
,,,,
count,6459.0,6495.0,6481.0,6299.0
mean,1380.614234,6447.864569,5531.558153,15051.368582
std,595.236124,3024.998142,1204.719455,4287.417278
min,438.920013,1911.699951,2876.600098,7054.97998
25%,1023.675018,4181.094971,4569.5,10889.379883
50%,1263.819946,5938.879883,5757.399902,15474.5
75%,1562.484985,7986.52002,6433.299805,18506.739258
max,3025.860107,13559.599609,7877.5,24270.619141


# Converting the data to time-series format.
Since the interval between periods in time-series data must be constant, we set the frequency to **b** (Business Days). This adds rows for the business days that were absent from the download; these new rows have missing values for each of the indices.

In [5]:
# Put the rows on a regular business-day calendar; business days that were
# absent from the download show up as rows of missing values.
# 'B' is the canonical (upper-case) pandas alias for business-day frequency.
data = data.asfreq('B')

# Number of missing values per column after re-indexing.
data.isna().sum()

spx         233
dax         197
ftse        211
nikkei      393
dtype: int64

# Filling the missing values
The missing values in each column are filled using the **ffill** (forward-fill) method: each missing value is replaced with the last available value before it. A missing value in the very first row has no earlier value to copy, so it is filled separately afterwards with a backward fill.

In [6]:
# Forward-fill every price column: a gap takes the last valid value before it.
# Series.ffill() is used because fillna(method='ffill') is deprecated in
# recent pandas releases; the filled values are the same.
for column in ['spx', 'dax', 'ftse', 'nikkei']:
    data[column] = data[column].ffill()

# Gaps that are still left (a gap in the first row cannot be forward-filled).
data.isna().sum()

spx         0
dax         0
ftse        0
nikkei      1
dtype: int64

In [7]:
# Back-fill the value that forward-filling could not reach: the gap takes the
# next valid value after it. Series.bfill() replaces the deprecated
# fillna(method='bfill') spelling; the filled values are the same.
data['nikkei'] = data['nikkei'].bfill()

# Confirm that no missing values remain.
data.isna().sum()

spx         0
dax         0
ftse        0
nikkei      0
dtype: int64

# Calculating Returns for the Prices
The returns for an index can be calculated using the following formula - <br>
$Returns(t) = \frac{Prices(t) - Prices(t-1)}{Prices(t-1)} \times 100$ <br>
Here $t$ is the time period in question. <br><br>

On calculating the returns for each index, the data is concatenated to the dataframe itself.

In [8]:
# Percentage return of each index: change relative to the previous row,
# expressed in percent. The first row has no previous price, so it is NaN.
for index_name in ('spx', 'dax', 'ftse', 'nikkei'):
    data[index_name + '_ret'] = data[index_name].pct_change(1) * 100

# Calculating Normalized Prices
The Normalized prices of an index can be used to accurately compare the growth and fall of each market. <br>
It can be calculated using the following formula - <br>
$Normalized Prices(t) = (Prices(t)/Bench Mark) * 100$ <br>
Here $t$ is the time period in question. <br>
And, $Bench Mark$ is the baseline used for each market index. For this analysis the first value of each index is chosen. <br><br>

On calculating the normalized prices, the data is concatenated to the dataframe itself.

In [9]:
# Normalized price = price / benchmark * 100, where the benchmark is the first
# price of the series. The benchmark is taken with .iloc[0] (explicitly
# positional): a plain [0] on a date-indexed Series depends on a deprecated
# fallback from label lookup to position lookup.
spx_benchmark = data['spx'].iloc[0]
data['spx_norm'] = data['spx'].div(spx_benchmark).mul(100)

dax_benchmark = data['dax'].iloc[0]
data['dax_norm'] = data['dax'].div(dax_benchmark).mul(100)

ftse_benchmark = data['ftse'].iloc[0]
data['ftse_norm'] = data['ftse'].div(ftse_benchmark).mul(100)

nikkei_benchmark = data['nikkei'].iloc[0]
data['nikkei_norm'] = data['nikkei'].div(nikkei_benchmark).mul(100)

# Calculating Normalized Returns
$Normalized Returns(t) = (Returns(t)/Bench Mark) * 100$ <br>
Here $t$ is the time period in question. <br>
And, $Bench Mark$ is the baseline used for each market index. For this analysis the first available return of each index is chosen, i.e. the return in the second row, because the first row has no previous price to compare with.<br><br>

On calculating the normalized returns, the data is concatenated to the dataframe itself.

In [10]:
# Normalized return = return / benchmark * 100. The benchmark is the return in
# the second row (.iloc[1]); the first row's return is NaN because it has no
# previous price. .iloc is used instead of a plain [1] because integer keys on
# a date-indexed Series depend on a deprecated positional fallback.
# NOTE(review): if a benchmark return happens to be 0, the division produces
# inf/NaN for that whole column - worth confirming on the actual data.
data['spx_ret_norm'] = data['spx_ret'].div(data['spx_ret'].iloc[1]) * 100
data['ftse_ret_norm'] = data['ftse_ret'].div(data['ftse_ret'].iloc[1]) * 100
data['dax_ret_norm'] = data['dax_ret'].div(data['dax_ret'].iloc[1]) * 100
data['nikkei_ret_norm'] = data['nikkei_ret'].div(data['nikkei_ret'].iloc[1]) * 100

# Calculating Squared Returns/Volatility
$Squared Returns(t) = (Returns(t))^2$<br>
Here $t$ is the time period in question. <br>

On calculating the squared returns, the data is concatenated to the dataframe itself.

In [11]:
# Squared return of each index (the return multiplied by itself), stored
# in a matching *_vol column.
for index_name in ('spx', 'dax', 'ftse', 'nikkei'):
    index_returns = data[index_name + '_ret']
    data[index_name + '_vol'] = index_returns * index_returns

# Splitting Dataset
The first 80% of the rows are separated for training and the remaining 20% for testing.

In [12]:
# Row position where the training part ends (80% of the rows, rounded down).
size = int(0.8 * len(data))

# Split by position, keeping row order: earlier rows train, later rows test.
df = data.iloc[:size]
df_test = data.iloc[size:]

# Export DataFrame
The preprocessed data is exported as a **.csv** file.

In [13]:
# Turn the index of all three frames back into a regular column.
# Running this cell a second time would add yet another index column.
data, df, df_test = (frame.reset_index() for frame in (data, df, df_test))

In [14]:
# Write the full, training and test frames as CSV files (paths are relative
# to the current working directory).
exports = {
    'web_scrapped_data.csv': data,
    'train.csv': df,
    'test.csv': df_test,
}
for filename, frame in exports.items():
    frame.to_csv(filename)