# PSEi Stock Market

### Import Python Packages
This will import the dependencies to be used to download and analyze the data that will be downloaded through the Yahoo Finance API.

!pip install yfinance
!pip install numpy
!pip install pandas_datareader 
!pip install scikit-learn
!pip install plotly
!conda install -c conda-forge ta-lib

In [1]:
# Yahoo Finance API
import yfinance as yf  

# DataFrame
import pandas as pd

# Numerical Python
import numpy as np

# Pandas Data Reader
from pandas_datareader import data as pdr

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Technical Analysis Library
import talib as ta     

### Library Options
This will set the options for the libraries that will be used in this notebook

In [2]:
yf.pdr_override()   # Override Yahoo Finance API output to use pandas data reader
pd.options.plotting.backend = "plotly" # Use plotly as the plotting backend

### Download PSEi Data
This will download the data from Yahoo Finance API and then import it to pandas dataframe.
The data will be downloaded is from `2000-01-01` to `2023-05-18` with an interval of `1 day`

In [3]:
df = pdr.get_data_yahoo('PSEI.PS', '2000-01-01', '2023-05-18')
df = df.dropna()
df.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03,2143.669922,2148.709961,2122.98999,2141.77002,2141.219482,0
2000-01-04,2151.550049,2158.209961,2130.679932,2153.179932,2152.626465,0
2000-01-05,2113.379883,2113.379883,2070.139893,2074.75,2074.216553,0
2000-01-06,2079.050049,2082.810059,2066.879883,2079.110107,2078.575684,0
2000-01-07,2079.320068,2094.290039,2077.649902,2094.290039,2093.751709,0


### Output the raw data to CSV

In [4]:
df.to_csv('raw.csv')

# Initialize Variables

### Time Period
The window size (moving average in days) of the rolling mean and rolling correlation

### Train Size
The percentage of the data that will be used for training the model

In [5]:
# Time Period (in days)
timeperiod = 10

# Train Size (in percentage 0.1 = 10%)
trainsize = 0.8

# Prepare the data for the model

### Populate Tables

In [6]:
df['S_' + str(timeperiod)] = df['Close'].rolling(window=timeperiod).mean() # Rolling mean

df['Corr'] = df['Close'].rolling(window=timeperiod).corr(df['S_' + str(timeperiod)]) # Correlation between the close price and the rolling mean

df['RSI'] = ta.RSI(np.array(df['Close']), timeperiod=timeperiod) # Relative Strength Index

df['Open-Close'] = df['Open'] - df['Close'].shift(1) # The difference between the current day's open and the previous day's close

df['Open-Open'] = df['Open'] - df['Open'].shift(1) # The difference between the current day's open and the previous day's open

df = df.dropna() # Drop the NaN values
df.head() #    Show the dataframe

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,S_10,Corr,RSI,Open-Close,Open-Open
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-01-27,1993.709961,2011.099976,1970.780029,1975.199951,1974.692139,0,2051.536023,0.94551,23.487207,-4.880005,-53.570068
2000-01-28,1974.310059,1978.650024,1960.439941,1969.390015,1968.883789,0,2038.265015,0.952497,22.802548,-0.889893,-19.399902
2000-01-31,1959.0,1997.089966,1950.01001,1989.430054,1988.918579,0,2027.485022,0.910715,30.56029,-10.390015,-15.310059
2000-02-01,1986.619995,1990.949951,1972.109985,1973.439941,1972.932617,0,2017.345007,0.901408,28.060317,-2.810059,27.619995
2000-02-02,1976.099976,1988.030029,1971.880005,1975.959961,1975.452026,0,2010.088,0.890174,29.076283,2.660034,-10.52002


In [7]:
x = df.iloc[:,:9] # The features that will be used for the model
y = np.where(df['Close'].shift(-1) > df['Close'],1,-1) # 1 if the price goes up, -1 otherwise

In [8]:
split = int(trainsize * len(df)) # Split the data into train and test set
x_train, x_test, y_train, y_test = x[:split], x[split:], y[:split], y[split:] # Split the data into train and test set

## Implement Logistic Regression

In [9]:
model = LogisticRegression()  # Initialize the model
model = model.fit(x_train, y_train)  # Fit the model

### Show the model

In [10]:
# Show the coefficients of the model
pd.DataFrame(zip(x.columns, np.transpose(model.coef_)))
# 1st column is the features, 2nd column is the coefficients

Unnamed: 0,0,1
0,Open,[2.2167891450125878e-06]
1,High,[2.241330953471385e-06]
2,Low,[2.2182630397449996e-06]
3,Close,[2.237755014261298e-06]
4,Adj Close,[2.2371797880817864e-06]
5,Volume,[-3.352507710482293e-09]
6,S_10,[2.253239183341203e-06]
7,Corr,[7.71933511806674e-11]
8,RSI,[1.3118414601767827e-08]


### Predict the price

In [11]:
# Predict X
probability = model.predict_proba(x_test)

probability_df = pd.DataFrame(probability)
probability_df

# 1st column is the probability of the price going down
# 2nd column is the probability of the price going up

Unnamed: 0,0,1
0,0.474604,0.525396
1,0.474267,0.525733
2,0.474196,0.525804
3,0.474031,0.525969
4,0.474062,0.525938
...,...,...
1152,0.477758,0.522242
1153,0.477964,0.522036
1154,0.478091,0.521909
1155,0.478010,0.521990


In [12]:
# Predict Y
y_predicted = model.predict(x_test) # Predict the price

In [13]:
y_confusion_matrix = metrics.confusion_matrix(y_test, y_predicted)

y_confusion_matrix_df = pd.DataFrame(y_confusion_matrix).transpose()
y_confusion_matrix_df # Show the confusion matrix

Unnamed: 0,0,1
0,1,0
1,563,593


### Show the accuracy of the model

In [14]:
print("Accuracy:", model.score(x_test,y_test)) # Show the accuracy of the model


Accuracy: 0.5133967156439067


In [15]:
report = metrics.classification_report(y_test, y_predicted, output_dict=True) # Show the classification report
report_df = pd.DataFrame(report).transpose()
report_df


Unnamed: 0,precision,recall,f1-score,support
-1,1.0,0.001773,0.00354,564.0
1,0.512976,1.0,0.678102,593.0
accuracy,0.513397,0.513397,0.513397,0.513397
macro avg,0.756488,0.500887,0.340821,1157.0
weighted avg,0.750384,0.513397,0.349275,1157.0


### Show the cross validation score

In [16]:
cross_val = cross_val_score(LogisticRegression(), x, y, scoring='accuracy', cv=10)
cross_val


array([0.50604491, 0.50431779, 0.47841105, 0.50604491, 0.50519031,
       0.50692042, 0.50692042, 0.50865052, 0.50865052, 0.51557093])

### Calculate Prediction Signals

In [17]:
df['Predicted_Signal'] = model.predict(x) # Predict the signal
df['PSEi_returns'] = np.log(df['Close'] / df['Close'].shift(1)) # Calculate the PSEi returns
Cumulative_PSEi_returns = np.cumsum(df[split:]['PSEi_returns']) # Calculate the cumulative PSEi returns

df['Strategy_Returns'] = df['PSEi_returns'] * df['Predicted_Signal'].shift(1) # Calculate the strategy returns
Cumulative_Strategy_returns = np.cumsum(df[split:]['Strategy_Returns']) # Calculate the cumulative strategy returns

### Output the data to CSV

In [18]:
df.to_csv('predicted signals.csv') # Output the data to CSV

# Plot the data

### Plot the close price and the rolling mean

In [19]:
df.plot(y=['Close', 'S_' + str(timeperiod)], labels={'value':'Value', 'index':'Date', 'variable':'Variables'}) # Plot the close price and the rolling mean

### Plot Cumulative PSEi returns

In [20]:
Cumulative_PSEi_returns.plot(labels={'value':'Cumulative Returns', 'index':'Date', 'variable':'Variables'}) # Plot the cumulative PSEi returns

### Plot the Cumulative Strategy Returns

In [21]:
Cumulative_Strategy_returns.plot(labels={'value':'Cumulative Returns', 'index':'Date', 'variable':'Variables'}) # Plot the cumulative strategy returns

### Plot the PSEi returns and the Strategy returns

In [22]:
Cumulative_PSEi_returns_df = Cumulative_PSEi_returns.to_frame()
Cumulative_Strategy_returns_df = Cumulative_Strategy_returns.to_frame()

cumulative_returns_df = pd.merge(Cumulative_PSEi_returns_df, Cumulative_Strategy_returns_df, left_index=True, right_index=True)
cumulative_returns_df.head()

Unnamed: 0_level_0,PSEi_returns,Strategy_Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-08-22,0.01741,0.01741
2018-08-23,0.039667,0.039667
2018-08-24,0.034842,0.034842
2018-08-28,0.044853,0.044853
2018-08-29,0.043111,0.043111


In [25]:
cumulative_returns_df.plot(title='PSEi and Strategy Cumulative Returns', labels={'value':'Cumulative Returns', 'index':'Date', 'variable':'Strategy'})

In [24]:
cumulative_returns_df.to_csv('cumulative_returns.csv') # Output the data to CSV

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=104f152a-ece5-47f4-a4d4-d9ea9c0d19d6' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>