# CMSE 202 Final Project

## Modeling stock portfolios to maximize performance.

### &#9989; Yixiao Tang, Ishan Baweja, Zoe Zhang, Febri D
### &#9989; Section_002



In [2]:
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

## Introduction

We found data for almost every stock listed on NASDAQ. Analyzing that much data at once is too complicated and makes precise predictions hard to obtain. We therefore plan to select 5 stocks from the following list and use regression to predict when to buy and sell in order to maximize profit.

### stock dataset



In [None]:
# Load the general stock information table with pandas.
# (Author's note: the file includes 8048 stocks.)
stock_info = pd.read_csv(filepath_or_buffer="symbols_valid_meta.csv")
stock_info

### Stock indexes

In [None]:
# Load the table of specific stock indexes and their quantitative data with pandas.
stock_info2 = pd.read_csv(filepath_or_buffer="data.csv")
stock_info2

## Methodology

### The data used

This is the source of our price data: https://www.investing.com/equities/google-inc-historical-data

The data cover stock prices from 1/1/2018 to 11/25/2022. For each stock, the data include the opening price, highest price, lowest price, closing price, and trading volume.

We also found a dataset containing important indicators that measure the quality of stocks. We use EPS (earnings per share) to select 5 stocks from the market.

Use machine learning to predict stock price
### Choose stock with highest EPS

### Stock: SEB,NVR,MKL,BIO,GOOGL

### Read stock dataframe

Now that we have found the stocks with the largest EPS, we load each of the 5 chosen stocks into its own data set.

# 1. google

In [3]:
# Google is the worked example: read its price history from CSV and
# use the "Date" column as the row index.
googl_df = pd.read_csv("GOOGL.csv").set_index("Date")
googl_df

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01/02/2018,53.66,52.65,53.80,52.65,31.77M,1.88%
01/03/2018,54.58,53.70,54.80,53.67,31.32M,1.71%
01/04/2018,54.79,54.85,55.20,54.71,26.05M,0.38%
01/05/2018,55.51,55.17,55.68,55.09,30.25M,1.31%
01/08/2018,55.71,55.55,55.96,55.50,24.64M,0.36%
...,...,...,...,...,...,...
11/18/2022,97.43,98.77,98.90,96.37,28.34M,-0.95%
11/21/2022,95.60,97.29,98.39,95.36,21.50M,-1.88%
11/22/2022,97.05,95.89,97.22,94.09,23.04M,1.52%
11/23/2022,98.46,97.20,98.75,97.15,18.87M,1.45%


In [None]:
# Convert the text columns of googl_df to real float columns.
#
# The conversion is vectorised instead of looping over a hard-coded row count with
# chained indexing (googl_df[col][i] = ...), so it works for any number of rows,
# yields float64 columns, and gives the same result when the cell is re-run.

# Price columns: strip thousands separators such as "1,234.56" if they are present.
for price_col in ("Price", "Open", "High", "Low"):
    googl_df[price_col] = pd.to_numeric(
        googl_df[price_col].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )

# Volume: text such as "31.77M". The magnitude suffix (K / M / B) is honoured rather
# than assuming every value is in millions; a value without a suffix is used as-is.
vol_text = googl_df["Vol."].astype(str).str.strip().str.upper()
vol_scale = vol_text.str[-1].map({"K": 1e3, "M": 1e6, "B": 1e9}).fillna(1.0)
googl_df["Vol."] = vol_scale * pd.to_numeric(
    vol_text.str.rstrip("KMB").str.replace(",", "", regex=False),
    errors="coerce",
)

# Daily change: text such as "1.88%" -> 1.88
googl_df["Change %"] = pd.to_numeric(
    googl_df["Change %"].astype(str).str.rstrip("%"),
    errors="coerce",
)

# Anything that could not be parsed is now NaN; drop those rows so that later
# cells only see complete numeric rows.
googl_df = googl_df.dropna()



In [None]:
# Plot the Google stock price over the whole period.

# dropna() returns a new frame, so the result must be assigned to take effect.
googl_df = googl_df.dropna()

fig, ax = plt.subplots()
googl_df["Price"].plot(ax=ax)
ax.set_ylabel("Price")
ax.set_title("Google stock price")
ax.tick_params(axis="x", labelrotation=45)  # rotate the date labels so they stay readable
plt.show()

googl_df

In [None]:
# Chronological hold-out split: roughly the first 90% of rows train the model,
# the remaining rows test it.
#
# Features are columns 1..5 of each row; the target is the *next* row's Price.
# The last row has no following price, so it is dropped from both x and y.
x = googl_df.iloc[:-1, 1:6].values
y = googl_df["Price"].shift(-1).iloc[:-1]

split = int(0.9 * googl_df.shape[0])
x_train, y_train = x[:split], y[:split]
x_test, y_test = x[split:], y[split:]

# sklearn's train_test_split (with random_state) is deliberately not used here:
# stock prices form a time series, so the rows must stay in date order.

In [None]:
# Standardise the features, then train a random forest regressor.
# The scaler is fitted on the training rows only and re-used for the test rows.
scale = StandardScaler().fit(x_train)
x_train = scale.transform(x_train)
x_test = scale.transform(x_test)

model = RandomForestRegressor(
    n_estimators=10,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=10,
    bootstrap=True,
    random_state=0,
)
model.fit(x_train, y_train)

# Predicted next-day prices for the held-out rows.
predict = model.predict(x_test)

In [None]:
# Report error metrics for the fitted model on the held-out rows.
mae = metrics.mean_absolute_error(y_test, predict)
mse = metrics.mean_squared_error(y_test, predict)
print("Mean Absolute Error:", round(mae, 4))
print("Mean Squared Error:", round(mse, 4))
print("Root Mean Squared Error:", round(np.sqrt(mse), 4))
print("(R^2) Score:", round(metrics.r2_score(y_test, predict), 4))

# model.score() is the R^2 of the regressor on the given data.
train_score = model.score(x_train, y_train) * 100
test_score = model.score(x_test, y_test) * 100
print(f'Train Score : {train_score:.2f}% and Test Score : {test_score:.2f}% using Random Tree Regressor.')

# "Accuracy" here is 100 minus the mean absolute percentage error.
errors = (predict - y_test).abs()
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')


In [None]:
# Overlay the model's predictions on the actual values for the test rows.
plt.plot(predict, label='preidction')
plt.plot(y_test, label='Actual Value')
# One tick every 10 rows: 0, 10, ..., 120.
plt.xticks(list(range(0, 121, 10)), rotation=45)
plt.legend()

In [None]:
# Label every prediction with the trading date it refers to.
#
# Each prediction is for the row *after* its feature row, so the predictions line up
# with the last len(predict) dates of googl_df. Using those dates directly avoids a
# hard-coded offset, and avoids a calendar-day range, which would include weekends
# and holidays and drift away from the real trading days.
target_dates = googl_df.index[len(googl_df) - len(predict):]
predictions = pd.DataFrame(
    {"Predictions": predict},
    index=pd.to_datetime(target_dates),
)
print(predictions)
# .iloc selects by position; plain [0] on a date-indexed Series is a label lookup.
print(predictions['Predictions'].iloc[0])

In [None]:
# Evaluate a buy-once / sell-once strategy on the predicted prices:
# "buy" at the lowest predicted price and "sell" at the highest predicted price.
buy_price = predictions['Predictions'].min()
sell_price = predictions['Predictions'].max()
oneyear_buy = predictions.loc[predictions["Predictions"] == buy_price]
oneyear_sell = predictions.loc[predictions["Predictions"] == sell_price]
print("Buy price and date")
print(oneyear_buy)
print("Sell price and date")
print(oneyear_sell)

# NOTE: this figure is the sell/buy price ratio in percent (100% means no gain),
# not the net return; the net return is printed on the next line.
print('Earing rate:',round(sell_price/buy_price*100,1),'%')
print('Net return:',round((sell_price/buy_price-1)*100,1),'%')

# The min/max pair ignores the order of the two dates. If the predicted high comes
# before the predicted low, a plain buy-then-sell cannot capture this spread; it
# would need a short position (e.g. via leverage or futures).

# 2. BIO

In [None]:

# ---- Load -------------------------------------------------------------------
# Read the BIO price history and use the "Date" column as the row index.
df = pd.read_csv("BIO.csv").set_index("Date")
#we have MKL,NVR,SEB,BIO

# ---- Clean ------------------------------------------------------------------
# Vectorised conversion to float64: works for any number of rows (no hard-coded
# row count), avoids chained-indexing assignment, and is safe to re-run.

# Price columns: strip thousands separators such as "1,234.56" if they are present.
for price_col in ("Price", "Open", "High", "Low"):
    df[price_col] = pd.to_numeric(
        df[price_col].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )

# Volume: text with a magnitude suffix. The suffix (K / M / B) is honoured rather
# than assuming every value is in millions; a value without a suffix is used as-is.
vol_text = df["Vol."].astype(str).str.strip().str.upper()
vol_scale = vol_text.str[-1].map({"K": 1e3, "M": 1e6, "B": 1e9}).fillna(1.0)
df["Vol."] = vol_scale * pd.to_numeric(
    vol_text.str.rstrip("KMB").str.replace(",", "", regex=False),
    errors="coerce",
)

# Daily change: text such as "1.88%" -> 1.88
df["Change %"] = pd.to_numeric(df["Change %"].astype(str).str.rstrip("%"), errors="coerce")

# dropna() returns a new frame, so the result must be assigned to take effect.
df = df.dropna()

# ---- Plot the price history -------------------------------------------------
fig, ax = plt.subplots()
df["Price"].plot(ax=ax)
ax.set_ylabel("Price")
ax.set_title("BIO stock price")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

# ---- Chronological split: ~90% training rows, ~10% testing rows ---------------
# Features are columns 1..5 of each row (presumably Open, High, Low, Vol., Change %,
# i.e. the same layout as GOOGL.csv - verify); the target is the next row's Price.
x = df.iloc[:, 1:6].values[:-1]
y = df["Price"].shift(-1).iloc[:-1]

split = int(df.shape[0] * 0.9)
x_train, x_test = x[:split], x[split:]
y_train, y_test = y[:split], y[split:]

# ---- Train the random forest regressor ---------------------------------------
# The scaler is fitted on the training rows only and re-used for the test rows.
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)
model = RandomForestRegressor(n_estimators=10, random_state=0, min_samples_split=10,
                              min_samples_leaf=10, max_depth=5, bootstrap=True)
model.fit(x_train, y_train)
predict = model.predict(x_test)

# ---- Compare the predictions with the actual values --------------------------
plt.plot(predict,label='preidction')
plt.plot(y_test,label='Actual Value')
plt.xticks([0,10,20,30,40,50,60,70,80,90,100,110,120],rotation=45)
plt.legend()
plt.show()

# ---- Label each prediction with the trading date it refers to -----------------
# Each prediction is for the row after its feature row, so the predictions line up
# with the last len(predict) dates of this stock's own frame. This replaces a
# calendar-day range anchored on googl_df, which included weekends and holidays.
predictions = pd.DataFrame(
    {"Predictions": predict},
    index=pd.to_datetime(df.index[len(df) - len(predict):]),
)
print(predictions)

# ---- Evaluate a buy-once / sell-once strategy on the predicted prices ----------
buy_price = predictions['Predictions'].min()
sell_price = predictions['Predictions'].max()
oneyear_buy = predictions.loc[predictions["Predictions"] == buy_price]
oneyear_sell = predictions.loc[predictions["Predictions"] == sell_price]
print("Buy price and date")
print(oneyear_buy)
print("Sell price and date")
print(oneyear_sell)

# NOTE: this figure is the sell/buy price ratio in percent (100% means no gain),
# not the net return; the net return is printed on the next line.
print('Earing rate:',round(sell_price/buy_price*100,1),'%')
print('Net return:',round((sell_price/buy_price-1)*100,1),'%')

# The min/max pair ignores the order of the two dates. If the predicted high comes
# before the predicted low, a plain buy-then-sell cannot capture this spread; it
# would need a short position (e.g. via leverage or futures).

# 3. SEB

In [None]:
# ---- Load -------------------------------------------------------------------
# Read the SEB price history and use the "Date" column as the row index.
df = pd.read_csv("SEB.csv")
#we have MKL,NVR,SEB,BIO
print(df.shape)
df = df.set_index("Date")

# ---- Clean ------------------------------------------------------------------
# Vectorised conversion to float64: works for any number of rows (no hard-coded
# row count), avoids chained-indexing assignment, and is safe to re-run.

# Price columns: strip thousands separators such as "1,234.56".
for price_col in ("Price", "Open", "High", "Low"):
    df[price_col] = pd.to_numeric(
        df[price_col].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )

# Volume: text with a magnitude suffix. The suffix (K / M / B) is honoured rather
# than assuming every value is in millions; a value without a suffix is used as-is.
vol_text = df["Vol."].astype(str).str.strip().str.upper()
vol_scale = vol_text.str[-1].map({"K": 1e3, "M": 1e6, "B": 1e9}).fillna(1.0)
df["Vol."] = vol_scale * pd.to_numeric(
    vol_text.str.rstrip("KMB").str.replace(",", "", regex=False),
    errors="coerce",
)

# Daily change: text such as "1.88%" -> 1.88
df["Change %"] = pd.to_numeric(df["Change %"].astype(str).str.rstrip("%"), errors="coerce")

# Drop rows that were missing in the file or could not be parsed as numbers.
df = df.dropna()

# ---- Plot the price history -------------------------------------------------
fig, ax = plt.subplots()
df["Price"].plot(ax=ax)
ax.set_ylabel("Price")
ax.set_title("SEB stock price")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

# ---- Chronological split: ~90% training rows, ~10% testing rows ---------------
# Features are columns 1..5 of each row (presumably Open, High, Low, Vol., Change %,
# i.e. the same layout as GOOGL.csv - verify); the target is the next row's Price.
x = df.iloc[:, 1:6].values[:-1]
y = df["Price"].shift(-1).iloc[:-1]

split = int(df.shape[0] * 0.9)
x_train, x_test = x[:split], x[split:]
y_train, y_test = y[:split], y[split:]

# ---- Train the random forest regressor ---------------------------------------
# The scaler is fitted on the training rows only and re-used for the test rows.
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)
model = RandomForestRegressor(n_estimators=10, random_state=0, min_samples_split=10,
                              min_samples_leaf=10, max_depth=5, bootstrap=True)
model.fit(x_train, y_train)
predict = model.predict(x_test)

# ---- Compare the predictions with the actual values --------------------------
plt.plot(predict,label='preidction')
plt.plot(y_test,label='Actual Value')
plt.xticks([0,10,20,30,40,50,60,70,80,90,100,110,120],rotation=45)
plt.legend()
plt.show()

# ---- Label each prediction with the trading date it refers to -----------------
# Each prediction is for the row after its feature row, so the predictions line up
# with the last len(predict) dates of this stock's own frame. This replaces a
# calendar-day range anchored on googl_df, which included weekends and holidays.
predictions = pd.DataFrame(
    {"Predictions": predict},
    index=pd.to_datetime(df.index[len(df) - len(predict):]),
)
print(predictions)

# ---- Evaluate a buy-once / sell-once strategy on the predicted prices ----------
buy_price = predictions['Predictions'].min()
sell_price = predictions['Predictions'].max()
oneyear_buy = predictions.loc[predictions["Predictions"] == buy_price]
oneyear_sell = predictions.loc[predictions["Predictions"] == sell_price]
print("Buy price and date")
print(oneyear_buy)
print("Sell price and date")
print(oneyear_sell)

# NOTE: this figure is the sell/buy price ratio in percent (100% means no gain),
# not the net return; the net return is printed on the next line.
print('Earing rate:',round(sell_price/buy_price*100,1),'%')
print('Net return:',round((sell_price/buy_price-1)*100,1),'%')

# The min/max pair ignores the order of the two dates. If the predicted high comes
# before the predicted low, a plain buy-then-sell cannot capture this spread; it
# would need a short position (e.g. via leverage or futures).

# 4. MKL

In [None]:
# ---- Load -------------------------------------------------------------------
# Read the MKL price history and use the "Date" column as the row index.
df = pd.read_csv("MKL.csv")
#we have MKL,NVR,SEB,BIO
print(df.shape)
df = df.set_index("Date")

# ---- Clean ------------------------------------------------------------------
# Vectorised conversion to float64: works for any number of rows (no hard-coded
# row count), avoids chained-indexing assignment, and is safe to re-run.

# Price columns: strip thousands separators such as "1,234.56".
for price_col in ("Price", "Open", "High", "Low"):
    df[price_col] = pd.to_numeric(
        df[price_col].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )

# Volume: text with a magnitude suffix. The suffix (K / M / B) is honoured rather
# than assuming every value is in millions; a value without a suffix is used as-is.
vol_text = df["Vol."].astype(str).str.strip().str.upper()
vol_scale = vol_text.str[-1].map({"K": 1e3, "M": 1e6, "B": 1e9}).fillna(1.0)
df["Vol."] = vol_scale * pd.to_numeric(
    vol_text.str.rstrip("KMB").str.replace(",", "", regex=False),
    errors="coerce",
)

# Daily change: text such as "1.88%" -> 1.88
df["Change %"] = pd.to_numeric(df["Change %"].astype(str).str.rstrip("%"), errors="coerce")

# Drop rows that were missing in the file or could not be parsed as numbers.
df = df.dropna()

# ---- Plot the price history -------------------------------------------------
fig, ax = plt.subplots()
df["Price"].plot(ax=ax)
ax.set_ylabel("Price")
ax.set_title("MKL stock price")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

# ---- Chronological split: ~90% training rows, ~10% testing rows ---------------
# Features are columns 1..5 of each row (presumably Open, High, Low, Vol., Change %,
# i.e. the same layout as GOOGL.csv - verify); the target is the next row's Price.
x = df.iloc[:, 1:6].values[:-1]
y = df["Price"].shift(-1).iloc[:-1]

split = int(df.shape[0] * 0.9)
x_train, x_test = x[:split], x[split:]
y_train, y_test = y[:split], y[split:]

# ---- Train the random forest regressor ---------------------------------------
# The scaler is fitted on the training rows only and re-used for the test rows.
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)
model = RandomForestRegressor(n_estimators=10, random_state=0, min_samples_split=10,
                              min_samples_leaf=10, max_depth=5, bootstrap=True)
model.fit(x_train, y_train)
predict = model.predict(x_test)

# ---- Compare the predictions with the actual values --------------------------
plt.plot(predict,label='preidction')
plt.plot(y_test,label='Actual Value')
plt.xticks([0,10,20,30,40,50,60,70,80,90,100,110,120],rotation=45)
plt.legend()
plt.show()

# ---- Label each prediction with the trading date it refers to -----------------
# Each prediction is for the row after its feature row, so the predictions line up
# with the last len(predict) dates of this stock's own frame. This replaces a
# calendar-day range anchored on googl_df, which included weekends and holidays.
predictions = pd.DataFrame(
    {"Predictions": predict},
    index=pd.to_datetime(df.index[len(df) - len(predict):]),
)
print(predictions)

# ---- Evaluate a buy-once / sell-once strategy on the predicted prices ----------
buy_price = predictions['Predictions'].min()
sell_price = predictions['Predictions'].max()
oneyear_buy = predictions.loc[predictions["Predictions"] == buy_price]
oneyear_sell = predictions.loc[predictions["Predictions"] == sell_price]
print("Buy price and date")
print(oneyear_buy)
print("Sell price and date")
print(oneyear_sell)

# NOTE: this figure is the sell/buy price ratio in percent (100% means no gain),
# not the net return; the net return is printed on the next line.
print('Earing rate:',round(sell_price/buy_price*100,1),'%')
print('Net return:',round((sell_price/buy_price-1)*100,1),'%')

# The min/max pair ignores the order of the two dates. If the predicted high comes
# before the predicted low, a plain buy-then-sell cannot capture this spread; it
# would need a short position (e.g. via leverage or futures).

# 5. NVR

In [None]:
# ---- Load -------------------------------------------------------------------
# Read the NVR price history and use the "Date" column as the row index.
df = pd.read_csv("NVR.csv")
#we have MKL,NVR,SEB,BIO
print(df.shape)
df = df.set_index("Date")

# ---- Clean ------------------------------------------------------------------
# Vectorised conversion to float64: works for any number of rows (no hard-coded
# row count), avoids chained-indexing assignment, and is safe to re-run.

# Price columns: strip thousands separators such as "1,234.56".
for price_col in ("Price", "Open", "High", "Low"):
    df[price_col] = pd.to_numeric(
        df[price_col].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )

# Volume: text with a magnitude suffix. The suffix (K / M / B) is honoured rather
# than assuming every value is in millions; a value without a suffix is used as-is.
vol_text = df["Vol."].astype(str).str.strip().str.upper()
vol_scale = vol_text.str[-1].map({"K": 1e3, "M": 1e6, "B": 1e9}).fillna(1.0)
df["Vol."] = vol_scale * pd.to_numeric(
    vol_text.str.rstrip("KMB").str.replace(",", "", regex=False),
    errors="coerce",
)

# Daily change: text such as "1.88%" -> 1.88
df["Change %"] = pd.to_numeric(df["Change %"].astype(str).str.rstrip("%"), errors="coerce")

# Drop rows that were missing in the file or could not be parsed as numbers.
df = df.dropna()

# ---- Plot the price history -------------------------------------------------
fig, ax = plt.subplots()
df["Price"].plot(ax=ax)
ax.set_ylabel("Price")
ax.set_title("NVR stock price")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

# ---- Chronological split: ~90% training rows, ~10% testing rows ---------------
# Features are columns 1..5 of each row (presumably Open, High, Low, Vol., Change %,
# i.e. the same layout as GOOGL.csv - verify); the target is the next row's Price.
x = df.iloc[:, 1:6].values[:-1]
y = df["Price"].shift(-1).iloc[:-1]

split = int(df.shape[0] * 0.9)
x_train, x_test = x[:split], x[split:]
y_train, y_test = y[:split], y[split:]

# ---- Train the random forest regressor ---------------------------------------
# The scaler is fitted on the training rows only and re-used for the test rows.
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)
model = RandomForestRegressor(n_estimators=10, random_state=0, min_samples_split=10,
                              min_samples_leaf=10, max_depth=5, bootstrap=True)
model.fit(x_train, y_train)
predict = model.predict(x_test)

# ---- Compare the predictions with the actual values --------------------------
plt.plot(predict,label='preidction')
plt.plot(y_test,label='Actual Value')
plt.xticks([0,10,20,30,40,50,60,70,80,90,100,110,120],rotation=45)
plt.legend()
plt.show()

# ---- Label each prediction with the trading date it refers to -----------------
# Each prediction is for the row after its feature row, so the predictions line up
# with the last len(predict) dates of this stock's own frame. This replaces a
# calendar-day range anchored on googl_df, which included weekends and holidays.
predictions = pd.DataFrame(
    {"Predictions": predict},
    index=pd.to_datetime(df.index[len(df) - len(predict):]),
)
print(predictions)

# ---- Evaluate a buy-once / sell-once strategy on the predicted prices ----------
buy_price = predictions['Predictions'].min()
sell_price = predictions['Predictions'].max()
oneyear_buy = predictions.loc[predictions["Predictions"] == buy_price]
oneyear_sell = predictions.loc[predictions["Predictions"] == sell_price]
print("Buy price and date")
print(oneyear_buy)
print("Sell price and date")
print(oneyear_sell)

# NOTE: this figure is the sell/buy price ratio in percent (100% means no gain),
# not the net return; the net return is printed on the next line.
print('Earing rate:',round(sell_price/buy_price*100,1),'%')
print('Net return:',round((sell_price/buy_price-1)*100,1),'%')

# The min/max pair ignores the order of the two dates. If the predicted high comes
# before the predicted low, a plain buy-then-sell cannot capture this spread; it
# would need a short position (e.g. via leverage or futures).

# conclusion

After applying the random forest model, we predict that BIO stock has the largest volatility: its highest predicted price is 160.8% of its lowest predicted price, i.e. a gain of roughly 60.8% between the two. If you are only considering buying once and selling once within the 123-day prediction window, BIO is the best choice.