<a href="https://colab.research.google.com/github/vickydaiya/Widhya-Internship/blob/main/Apple_stock_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Getting Apple stock data**

In [1]:
import pandas as pd
import quandl
df = quandl.get("EOD/AAPL", authtoken="********************") #Make an account on quandl and get your API key
df.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividend,Split,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-09-03,493.1,500.6,487.35,488.58,11854600.0,0.0,1.0,15.61965,15.857223,15.43751,15.476472,331928800.0
2013-09-04,499.56,502.24,496.28,498.691,12322600.0,0.0,1.0,15.82428,15.909173,15.720381,15.796753,345032800.0
2013-09-05,500.25,500.68,493.64,495.27,8441700.0,0.0,1.0,15.846136,15.859757,15.636755,15.688388,236367600.0
2013-09-06,498.44,499.38,489.95,498.22,12840200.0,0.0,1.0,15.788802,15.818578,15.519869,15.781833,359525600.0
2013-09-09,505.0,507.92,503.48,506.17,12167400.0,0.0,1.0,15.9966,16.089095,15.948451,16.033661,340687200.0


### **Removing unnecessary columns**

In [2]:
#Adjusted columns are the most ideal ones. Regular columns here are on day prices. 
#Stocks have a concept "stock splits" where 1 share becomes like 2 shares. Thus the value of share is halved but the value of company has not halved.

clean_df = df[['Adj_Open','Adj_High','Adj_Low','Adj_Close','Adj_Volume']]
clean_df.head(5)

Unnamed: 0_level_0,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-09-03,15.61965,15.857223,15.43751,15.476472,331928800.0
2013-09-04,15.82428,15.909173,15.720381,15.796753,345032800.0
2013-09-05,15.846136,15.859757,15.636755,15.688388,236367600.0
2013-09-06,15.788802,15.818578,15.519869,15.781833,359525600.0
2013-09-09,15.9966,16.089095,15.948451,16.033661,340687200.0


### **Adding 2 columns viz. high low percentage and percent change**

In [3]:
#These 2 are meaningful features which will help in more accurate predictions

clean_df.insert(loc=len(clean_df.columns),column="HL_PCT",value=(clean_df.Adj_High - clean_df.Adj_Low)/clean_df.Adj_Close * 100)
clean_df.insert(loc=len(clean_df.columns),column="PCT_change",value=(clean_df.Adj_Close - clean_df.Adj_Open)/clean_df.Adj_Open * 100)
clean_df.head(5)

Unnamed: 0_level_0,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume,HL_PCT,PCT_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-09-03,15.61965,15.857223,15.43751,15.476472,331928800.0,2.711941,-0.91665
2013-09-04,15.82428,15.909173,15.720381,15.796753,345032800.0,1.195129,-0.173953
2013-09-05,15.846136,15.859757,15.636755,15.688388,236367600.0,1.421447,-0.995502
2013-09-06,15.788802,15.818578,15.519869,15.781833,359525600.0,1.892738,-0.044138
2013-09-09,15.9966,16.089095,15.948451,16.033661,340687200.0,0.877176,0.231683


### **Removing unnecessary columns**

In [4]:
#we are interested in only the closing price of the stock on a day and hence the other prices can be dropped

clean_df = clean_df.drop(labels=['Adj_Open','Adj_High','Adj_Low'],axis='columns')
clean_df.head(5)

Unnamed: 0_level_0,Adj_Close,Adj_Volume,HL_PCT,PCT_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-09-03,15.476472,331928800.0,2.711941,-0.91665
2013-09-04,15.796753,345032800.0,1.195129,-0.173953
2013-09-05,15.688388,236367600.0,1.421447,-0.995502
2013-09-06,15.781833,359525600.0,1.892738,-0.044138
2013-09-09,16.033661,340687200.0,0.877176,0.231683


### **Checking for NA values**

In [5]:
for colname in clean_df.columns:
  print(colname," : ",sum(clean_df[colname].isna()))

Adj_Close  :  0
Adj_Volume  :  0
HL_PCT  :  0
PCT_change  :  0


### **Inserting labels (prediction for next day) column**

In [6]:
#we have the next day prices for a particular day (except the last day in the dataset). This will serve as label for training the model.

clean_df.insert(loc=len(clean_df.columns),column="labels",value=df.Adj_Close.shift(-1, axis = 0))
clean_df.head(5)

Unnamed: 0_level_0,Adj_Close,Adj_Volume,HL_PCT,PCT_change,labels
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-09-03,15.476472,331928800.0,2.711941,-0.91665,15.796753
2013-09-04,15.796753,345032800.0,1.195129,-0.173953,15.688388
2013-09-05,15.688388,236367600.0,1.421447,-0.995502,15.781833
2013-09-06,15.781833,359525600.0,1.892738,-0.044138,16.033661
2013-09-09,16.033661,340687200.0,0.877176,0.231683,15.668432


### **Removing records with NA**

In [7]:
#last entry of labels column will have NA since no next day prediction value is available. Hence, removing that record

clean_df = clean_df.dropna(axis = 0, how ='any')

### **Dividing features and output variables**

In [8]:
import numpy as np

X = np.array(clean_df.iloc[:, clean_df.columns != 'labels'])
y = np.array(clean_df.iloc[:, clean_df.columns == 'labels'])

### **Mean and standard deviation of feature variables**

In [9]:
print("Mean: ",X.mean(axis=0))
print("Standard deviation: ", X.std(axis=0))

Mean:  [2.63596212e+01 1.93250266e+08 1.65025149e+00 2.67776937e-02]
Standard deviation:  [6.60125888e+00 1.11222288e+08 9.72184204e-01 1.14730248e+00]


### **Scaling (normalization) feature variables**

In [10]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print(X_scaled)
print("Mean: ",X_scaled.mean(axis=0))
print("Standard deviation: ",X_scaled.std(axis=0))

[[-1.64864747  1.24685921  1.09206592 -0.82230055]
 [-1.60012938  1.36467732 -0.46814445 -0.17495889]
 [-1.6165452   0.38766811 -0.23535108 -0.89102914]
 ...
 [ 2.3847004  -1.14952221 -1.15439199  0.14132209]
 [ 2.22289522 -0.54402875 -0.61741663 -0.14071103]
 [ 2.2239885  -0.96435181 -1.05232461  0.2328654 ]]
Mean:  [ 6.78571575e-16  5.21978135e-17 -5.21978135e-17 -6.52472668e-18]
Standard deviation:  [1. 1. 1. 1.]


### **Creating train and test sets**

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,test_size = 0.2,random_state=1)
print("X_train shape: ",X_train.shape)
print("X_test shape: ",X_test.shape)
print("y_train shape: ",y_train.shape)
print("y_test shape: ",y_test.shape)

X_train shape:  (871, 4)
X_test shape:  (218, 4)
y_train shape:  (871, 1)
y_test shape:  (218, 1)


### **Fitting a linear regression model on train data**

In [18]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### **Model evaluation**

In [19]:
from sklearn.metrics import mean_squared_error,r2_score

print("Mean squared error (predicting for test set with the trained model): ",mean_squared_error(y_test, model.predict(X_test)))
print("R squared score: ",r2_score(y_test, model.predict(X_test)))

Mean squared error (predicting for test set with the trained model):  0.12021867258467335
R squared score:  0.9973092296865895
