In [27]:
# 1 - Problem definition

"""
In the supervised regression framewor used for this data analysis case,
the weekly return of TSLA stock is the predicted variable.
We will try to understand what affects TSLA stock price and incorporate as much
information into the model. In this case study, we will focus on correlated assets as features.
For upcoming study cases, we will use technical indicators and fundamental analysis.

For this case study, other than the historical data of TSLA, the independent variables
used are the following correlated assets:
 -> Stocks: NIO and GM
 -> Currency : USD/JPY and GBP/USD
 -> Indices : S&P 500, Dow Jones, and VIX

The dataset used for this case study is extracted from Yahoo Finance and the FRED
website.
"""

'\nIn the supervised regression framewor used for this data analysis case,\nthe weekly return of TSLA stock is the predicted variable.\nWe will try to understand what affects TSLA stock price and incorporate as much\ninformation into the model. In this case study, we will focus on correlated assets as features.\nFor upcoming study cases, we will use technical indicators and fundamental analysis.\n\nFor this case study, other than the historical data of TSLA, the independent variables\nused are the following correlated assets:\n -> Stocks: NIO and GM\n -> Currency : USD/JPY and GBP/USD\n -> Indices : S&P 500, Dow Jones, and VIX\n\nThe dataset used for this case study is extracted from Yahoo Finance and the FRED\nwebsite.\n'

In [28]:
# 2 - Loading the data and Python packages

# 2.1. - Loading the Python packages
"""
Below we indicate the list of the libraries used for data loading, data analysis,
data preparation, model evaluation, and model tuning.
"""

'\nBelow we indicate the list of the libraries used for data loading, data analysis,\ndata preparation, model evaluation, and model tuning.\n'

In [29]:
# Function and modules for the supervised regression models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

# Function and modules for data analysis and model evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression

# Function and modules for deep learning models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import LSTM
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Function and modules for time series models
#from statsmodels import tsa
#from tsa import arima as ARIMA
#from statsmodels.tsa.arima import ARIMA
import statsmodels.api as sm

# Function and modules for data preparation and visualization
    # pandas, pandas_datareader, numpy and matplotlib
import numpy as np
import pandas as pd
import scipy
#import pandas_datareader as web
import pandas_datareader.data as web
import pandas_datareader as pdr
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from pandas.plotting import scatter_matrix
from statsmodels.graphics.tsaplots import plot_acf
import matplotlib.pyplot as plt

# Yahoo for dataReader
import yfinance as yf
yf.pdr_override()

# Datetime
from datetime import *

#Diable the warnings
import warnings
warnings.filterwarnings('ignore')

In [30]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [1]:
# 2.2. - Loading the data
stock_tickers = ['TSLA','NIO','GM']
currency_tickers = ['DEXJPUS', 'DEXUSUK']
idx_tickers = ['SP500', 'DJIA','VIXCLS']

start_date = datetime(2018, 1, 1)
end_date = datetime(2022, 5, 1)

# Stock
#stock_data = yf.download(stock_tickers,start=start_date,end=end_date)
input_path_stock_data = '/Users/yousraaoudi/Desktop/TSLA_Data_analysis.py/stock_data.csv'
df_temp_stock_data = pd.read_csv(input_path_stock_data,parse_dates=True,na_values=['nan']).dropna()
df_stock_data = pd.DataFrame(data=df_temp_stock_data)
print('Adj Close data describe \n',df_stock_data['Adj Close'].dtype)
     
"""
# Outliers - cleaning
no_outlier_stock_data = df_stock_data[(np.abs(scipy.stats.zscore(df_stock_data)) < 3).all(axis=1)]
print('Outliers Stock data describe \n',no_outlier_stock_data.describe())
df_stock_data = no_outlier_stock_data
print('Stock data after removing outliers \n',df_stock_data.head())
"""

# Currency
#currency_data = web.get_data_fred(currency_tickers,start=start_date,end=end_date)
input_path_currency_data = '/Users/yousraaoudi/Desktop/TSLA_Data_analysis.py/currency_data.csv'
df_temp_currency_data = pd.read_csv(input_path_currency_data,parse_dates=True,na_values=['nan']).dropna()
df_currency_data = pd.DataFrame(data=df_temp_currency_data)
print('FX data describe \n',df_currency_data.describe())
"""
# Outliers - cleaning
no_outlier_currency_data = df_currency_data[(np.abs(scipy.stats.zscore(df_currency_data)) < 3).all(axis=1)]
print('Outliers FX data describe \n',no_outlier_currency_data.describe())
df_currency_data = no_outlier_currency_data
print('Currency data after removing outliers \n',df_currency_data.head())
"""

# Indices
#idx_data = web.get_data_fred(idx_tickers,start=start_date,end=end_date)
input_path_idx_data = '/Users/yousraaoudi/Desktop/TSLA_Data_analysis.py/idx_data.csv'
df_temp_idx_data = pd.read_csv(input_path_idx_data,parse_dates=True,na_values=['nan']).dropna()
df_idx_data = pd.DataFrame(data=df_temp_idx_data)
print('Indices data describe \n',df_idx_data.describe())

"""
# Outliers - cleaning
no_outlier_idx_data = df_idx_data[(np.abs(scipy.stats.zscore(df_idx_data)) < 3).all(axis=1)]
print('Outliers - Indices data describe \n',no_outlier_idx_data.describe())
df_idx_data = no_outlier_idx_data
print('Idx data after removing outliers \n',df_idx_data.head())
"""

return_period = 5
#Y = np.log(df_stock_data.loc[:,df_stock_data.loc('Adj Close', 'TSLA')]).diff(return_period).shift(-return_period)
df_stock_data = df_stock_data.reset_index(drop=True)
print('DF stock data \n',df_stock_data.columns)
Y = np.log(df_stock_data.loc[:,('Close', 'TSLA')]).diff(return_period).shift(-return_period)
Y.name = Y.name[-1]+'_pred'

X1 = np.log(df_stock_data.loc[:,('Close', ('F','GM'))]).diff(return_period)
X1.columns = X1.columns.droplevel()
X2 = np.log(df_currency_data).diff(return_period)
X3 = np.log(df_idx_data).diff(return_period)

X4 = pd.concat([np.log(df_stock_data.loc[:,('Adj Close','TSLA')]).diff(i)
                for i in [return_period, return_period*3, return_period*6,
                          return_period*12]], axis=1).dropna()
X4.columns = ['TSLA_DT', 'TSLA_3DT', 'TSLA_6DT', 'TSLA_12DT']
X = pd.concat([X1, X2, X3, X4], axis=1)

dataset = pd.concat([Y,X], axis=1).dropna().iloc[::return_period,:]
Y = dataset.loc[:,Y.name]
X = dataset.loc[:, X.columns]

SyntaxError: invalid syntax (<ipython-input-1-b919d0fe899c>, line 57)

In [None]:
# 3. Exploratory data analysis
"""
In this section, we will look at descriptive statistics, data visualization,
and time series analysis.
"""

# 3.1. - Descriptive statistics
print('dataset \n',dataset.head())
print('Statistics \n',dataset.describe())
dataset.head()

In [None]:
# 3.2. Data visualization
"""
In order to learn more about our data, we will visualize it. 
Visualization involves independently understanding each attribute of
the dataset. Therefore, we will look at the scatterplot and the correlation
matrix. These plots will give us a sense of the interdependence of
the data. Correlation can be can be calculated and displayed for each pair of the variables
by creating a correlation matrix.
Besides the relationship between dependent and independent variables, it also
shows the correlation among the independent variables. 
"""
dataset.hist(bins=50, sharex=False, sharey=False, xlabelsize=1, ylabelsize=1, figsize=(12,12))
pyplot.show()

dataset.plot(kind='density', subplots=True, layout=(4,4), sharex=True, legend=True, fontsize=1, figsize=(15,15))
pyplot.show()

In [None]:
# Correlation - To get a sense of data interdependence
correlation = dataset.corr()
pyplot.figure(figsize=(15,15))
pyplot.title('Correlation Matrix')
sns.heatmap(correlation,vmax=1,square=True,annot=True,cmap='cubehelix')
pyplot.show()