# Data Science Internship at Widhya

## Mission: Stock Price Prediction Using Linear Regression


### Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

### Reading the data

In [2]:
df = pd.read_csv('EOD-HD.csv')
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividend,Split,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume
0,2017-12-28,190.91,190.98,189.64,189.78,3175631.0,0.00,1.0,177.544280,177.609379,176.363193,176.493392,3175631.0
1,2017-12-27,190.60,191.49,190.01,190.19,5912613.0,0.00,1.0,177.255983,178.083673,176.707289,176.874687,5912613.0
2,2017-12-26,188.53,190.42,188.34,190.36,2969182.0,0.00,1.0,175.330905,177.088585,175.154207,177.032785,2969182.0
3,2017-12-22,188.20,188.46,187.27,188.13,3256519.0,0.00,1.0,175.024008,175.265805,174.159118,174.958909,3256519.0
4,2017-12-21,187.70,188.84,187.44,188.08,5859058.0,0.00,1.0,174.559014,175.619201,174.317216,174.912409,5859058.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085,2013-09-09,72.98,73.79,72.90,73.58,5781800.0,0.00,1.0,62.005719,62.693916,61.937749,62.515495,5781800.0
1086,2013-09-06,73.55,73.56,72.21,72.70,9968400.0,0.00,1.0,62.490006,62.498502,61.351507,61.767824,9968400.0
1087,2013-09-05,74.07,74.21,72.84,72.99,11039000.0,0.00,1.0,62.931812,63.050759,61.886772,62.014215,11039000.0
1088,2013-09-04,73.98,74.51,73.74,74.14,8271600.0,0.00,1.0,62.855345,63.305647,62.651435,62.991286,8271600.0


### Data Collection and Analysis

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090 entries, 0 to 1089
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        1090 non-null   object 
 1   Open        1090 non-null   float64
 2   High        1090 non-null   float64
 3   Low         1090 non-null   float64
 4   Close       1090 non-null   float64
 5   Volume      1090 non-null   float64
 6   Dividend    1090 non-null   float64
 7   Split       1090 non-null   float64
 8   Adj_Open    1090 non-null   float64
 9   Adj_High    1090 non-null   float64
 10  Adj_Low     1090 non-null   float64
 11  Adj_Close   1090 non-null   float64
 12  Adj_Volume  1090 non-null   float64
dtypes: float64(12), object(1)
memory usage: 110.8+ KB


In [4]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividend,Split,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume
count,1090.0,1090.0,1090.0,1090.0,1090.0,1090.0,1090.0,1090.0,1090.0,1090.0,1090.0,1090.0
mean,118.182119,119.02803,117.35627,118.225661,5415875.0,0.010404,1.0,105.494714,106.248348,104.75932,105.533892,5415875.0
std,28.182296,28.315725,28.065522,28.209296,2274563.0,0.083182,0.0,27.742404,27.876593,27.624102,27.767769,2274563.0
min,72.98,73.56,72.21,72.7,1517075.0,0.0,1.0,62.005719,62.498502,61.351507,61.767824,1517075.0
25%,92.31,93.0,91.362575,92.1275,3895442.0,0.0,1.0,80.112181,80.745389,79.323727,79.919768,3895442.0
50%,120.91,121.885,119.69,120.69,4927485.0,0.0,1.0,107.569835,108.348245,106.36975,107.64632,4927485.0
75%,135.725,136.495,134.81,135.7375,6349263.0,0.0,1.0,122.60963,123.3751,121.97217,122.731563,6349263.0
max,190.91,191.49,190.01,190.36,20753200.0,0.89,1.0,177.54428,178.083673,176.707289,177.032785,20753200.0


**Checking for Null values**

In [5]:
df.isnull().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
Dividend      0
Split         0
Adj_Open      0
Adj_High      0
Adj_Low       0
Adj_Close     0
Adj_Volume    0
dtype: int64

### Data Manipulation and Feature Engineering

**Calculating new field High_Low_Percentage [HL_PCT]**

In [6]:
df['HL_PCT']= (df['Adj_High']-df['Adj_Low'])/df['Adj_Close']*100.0

In [7]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividend,Split,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume,HL_PCT
0,2017-12-28,190.91,190.98,189.64,189.78,3175631.0,0.00,1.0,177.544280,177.609379,176.363193,176.493392,3175631.0,0.706081
1,2017-12-27,190.60,191.49,190.01,190.19,5912613.0,0.00,1.0,177.255983,178.083673,176.707289,176.874687,5912613.0,0.778169
2,2017-12-26,188.53,190.42,188.34,190.36,2969182.0,0.00,1.0,175.330905,177.088585,175.154207,177.032785,2969182.0,1.092667
3,2017-12-22,188.20,188.46,187.27,188.13,3256519.0,0.00,1.0,175.024008,175.265805,174.159118,174.958909,3256519.0,0.632541
4,2017-12-21,187.70,188.84,187.44,188.08,5859058.0,0.00,1.0,174.559014,175.619201,174.317216,174.912409,5859058.0,0.744364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085,2013-09-09,72.98,73.79,72.90,73.58,5781800.0,0.00,1.0,62.005719,62.693916,61.937749,62.515495,5781800.0,1.209568
1086,2013-09-06,73.55,73.56,72.21,72.70,9968400.0,0.00,1.0,62.490006,62.498502,61.351507,61.767824,9968400.0,1.856946
1087,2013-09-05,74.07,74.21,72.84,72.99,11039000.0,0.00,1.0,62.931812,63.050759,61.886772,62.014215,11039000.0,1.876969
1088,2013-09-04,73.98,74.51,73.74,74.14,8271600.0,0.00,1.0,62.855345,63.305647,62.651435,62.991286,8271600.0,1.038576


**Calculating new field Percentage_Change [PCT_Change]**

In [8]:
df['PCT_Change']= (df['Adj_Close']-df['Adj_Open'])/df['Adj_Open']*100.0

In [9]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividend,Split,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume,HL_PCT,PCT_Change
0,2017-12-28,190.91,190.98,189.64,189.78,3175631.0,0.00,1.0,177.544280,177.609379,176.363193,176.493392,3175631.0,0.706081,-0.591902
1,2017-12-27,190.60,191.49,190.01,190.19,5912613.0,0.00,1.0,177.255983,178.083673,176.707289,176.874687,5912613.0,0.778169,-0.215110
2,2017-12-26,188.53,190.42,188.34,190.36,2969182.0,0.00,1.0,175.330905,177.088585,175.154207,177.032785,2969182.0,1.092667,0.970668
3,2017-12-22,188.20,188.46,187.27,188.13,3256519.0,0.00,1.0,175.024008,175.265805,174.159118,174.958909,3256519.0,0.632541,-0.037194
4,2017-12-21,187.70,188.84,187.44,188.08,5859058.0,0.00,1.0,174.559014,175.619201,174.317216,174.912409,5859058.0,0.744364,0.202451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085,2013-09-09,72.98,73.79,72.90,73.58,5781800.0,0.00,1.0,62.005719,62.693916,61.937749,62.515495,5781800.0,1.209568,0.822143
1086,2013-09-06,73.55,73.56,72.21,72.70,9968400.0,0.00,1.0,62.490006,62.498502,61.351507,61.767824,9968400.0,1.856946,-1.155676
1087,2013-09-05,74.07,74.21,72.84,72.99,11039000.0,0.00,1.0,62.931812,63.050759,61.886772,62.014215,11039000.0,1.876969,-1.458080
1088,2013-09-04,73.98,74.51,73.74,74.14,8271600.0,0.00,1.0,62.855345,63.305647,62.651435,62.991286,8271600.0,1.038576,0.216275


**Setting Index as Date**

In [10]:
df.set_index('Date')

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividend,Split,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume,HL_PCT,PCT_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-12-28,190.91,190.98,189.64,189.78,3175631.0,0.00,1.0,177.544280,177.609379,176.363193,176.493392,3175631.0,0.706081,-0.591902
2017-12-27,190.60,191.49,190.01,190.19,5912613.0,0.00,1.0,177.255983,178.083673,176.707289,176.874687,5912613.0,0.778169,-0.215110
2017-12-26,188.53,190.42,188.34,190.36,2969182.0,0.00,1.0,175.330905,177.088585,175.154207,177.032785,2969182.0,1.092667,0.970668
2017-12-22,188.20,188.46,187.27,188.13,3256519.0,0.00,1.0,175.024008,175.265805,174.159118,174.958909,3256519.0,0.632541,-0.037194
2017-12-21,187.70,188.84,187.44,188.08,5859058.0,0.00,1.0,174.559014,175.619201,174.317216,174.912409,5859058.0,0.744364,0.202451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-09-09,72.98,73.79,72.90,73.58,5781800.0,0.00,1.0,62.005719,62.693916,61.937749,62.515495,5781800.0,1.209568,0.822143
2013-09-06,73.55,73.56,72.21,72.70,9968400.0,0.00,1.0,62.490006,62.498502,61.351507,61.767824,9968400.0,1.856946,-1.155676
2013-09-05,74.07,74.21,72.84,72.99,11039000.0,0.00,1.0,62.931812,63.050759,61.886772,62.014215,11039000.0,1.876969,-1.458080
2013-09-04,73.98,74.51,73.74,74.14,8271600.0,0.00,1.0,62.855345,63.305647,62.651435,62.991286,8271600.0,1.038576,0.216275


**Subsetting required features**

In [11]:
df = df[['Adj_Close','PCT_Change','HL_PCT']] 
# Take a look at the new data 
print(df.head())

    Adj_Close  PCT_Change    HL_PCT
0  176.493392   -0.591902  0.706081
1  176.874687   -0.215110  0.778169
2  177.032785    0.970668  1.092667
3  174.958909   -0.037194  0.632541
4  174.912409    0.202451  0.744364


**Identifying Null values and Shape of the data**

In [12]:
df.isnull().sum()

Adj_Close     0
PCT_Change    0
HL_PCT        0
dtype: int64

In [13]:
df.shape

(1090, 3)

In [14]:
df

Unnamed: 0,Adj_Close,PCT_Change,HL_PCT
0,176.493392,-0.591902,0.706081
1,176.874687,-0.215110,0.778169
2,177.032785,0.970668,1.092667
3,174.958909,-0.037194,0.632541
4,174.912409,0.202451,0.744364
...,...,...,...
1085,62.515495,0.822143,1.209568
1086,61.767824,-1.155676,1.856946
1087,62.014215,-1.458080,1.876969
1088,62.991286,0.216275,1.038576


**A variable for predicting number of days out in the future**

In [15]:
forecast_out = 10 #'n=10' days

**Creating another column (i.e. target variable) shifted 'n' units up**

In [16]:
df['Prediction'] = df[['Adj_Close']].shift(-forecast_out)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Prediction'] = df[['Adj_Close']].shift(-forecast_out)


**Preview of dataset**

In [17]:
print(df.tail())

      Adj_Close  PCT_Change    HL_PCT  Prediction
1085  62.515495    0.822143  1.209568         NaN
1086  61.767824   -1.155676  1.856946         NaN
1087  62.014215   -1.458080  1.876969         NaN
1088  62.991286    0.216275  1.038576         NaN
1089  62.787375   -1.242817  2.124493         NaN


**Creating independent data set (X)**

**Converting the dataframe to a numpy array**

In [18]:
X = np.array(df.drop(['Prediction'],1))

**Removing Last 10 rows**

In [19]:
X = X[:-forecast_out]
print(X)

[[176.49339151  -0.59190194   0.70608073]
 [176.87468717  -0.21511018   0.7781692 ]
 [177.03278537   0.9706678    1.09266653]
 ...
 [ 66.70415205   0.86073998   1.37562094]
 [ 65.7355782    2.11165369   3.39925036]
 [ 64.40166508   0.21152829   0.7651715 ]]


**Creating the dependent data set (y)**

**Convert the dataframe to a numpy array**

In [20]:
y = np.array(df['Prediction'])

**Geting all of the y values except the last '10' rows**

In [21]:
y = y[:-forecast_out]
print(y)

[170.21596294 169.07207596 169.4905712  ...  62.01421549  62.99128561
  62.78737533]


### Building and training the model

**Split the dataset into 80% training and 20% testing**

In [22]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

**Model Building: Linear Regression**

In [23]:
lr = LinearRegression()

**Model Training**

In [24]:
lr.fit(x_train, y_train)

LinearRegression()

**Model Testing: Score returns the coefficient of determination R^2 of the prediction.**

In [25]:
print('Training Score: ', round(lr.score(x_train,y_train)*100,2),'%')

# The best possible score is 1.0
print('lr confidence (Testing Score): ', round(lr.score(x_test,y_test)*100,2))


Training Score:  98.54 %
lr confidence (Testing Score):  98.36


**Model Predictions**

In [26]:
y_pred = lr.predict(x_test)
print(y_pred)

[132.22748857 119.64184165 119.18529619 116.85184738 104.59350145
 131.67215263 111.06199055 149.87526074  68.47335479 141.46633767
  96.90982082 133.12627281 164.3785343  111.04857105 132.68172254
 100.72007051  68.03783445 145.3724629  111.94719524  63.9173805
 100.54228906  66.89668868 123.45762509  69.53453724 113.13172854
 113.00301881 115.3980344  115.33090931  69.66427583  69.99480896
 139.19082729  67.30043077  81.46293204 149.31652741 156.96163137
  78.84598118  69.23286905  98.82911702  96.77686137  94.40277637
 100.66915005 108.00273367  66.28415557  95.7906962  140.22902779
  99.84828085 100.90890661 120.36073116 113.09279707 102.0720766
 118.4968783   84.63841854  97.2374429  117.37792837 123.67437303
 133.13169868  77.6758581  110.19149369 100.08587835 113.19284999
  79.50968969 131.90421663 105.01461327 139.13056033  77.07951974
 127.66737432 121.85488577 138.00654837 119.4562837   67.76805648
  85.02363098 171.93322153 132.53276161 138.81823606 118.48382331
 136.0785852

**Model Performance**

In [27]:
from sklearn import metrics
acc=metrics.r2_score(y_test,y_pred)
print("Accuracy Score of Model: ",round(acc*100,2),'%')

Accuracy Score of Model:  98.36 %


**Actual Price v/s Predicted Price**

In [28]:
prediction_df = pd.DataFrame({ 'Actual Price': y_test, 'Predicted Price': y_pred})  
prediction_df.head()

Unnamed: 0,Actual Price,Predicted Price
0,131.548467,132.227489
1,119.035008,119.641842
2,119.80465,119.185296
3,118.131126,116.851847
4,103.085088,104.593501


**Model evaluation**

In [29]:
from sklearn import metrics
print('Mean Absolute Error:',round(metrics.mean_absolute_error(y_test,y_pred),2))
print('Mean Squared Error:',round(metrics.mean_squared_error(y_test,y_pred),2))
print('Root Mean Squared Error:',round(np.sqrt(metrics.mean_squared_error(y_test,y_pred)),2))
print('Explained Variance Score:',metrics.explained_variance_score(y_test,y_pred))

Mean Absolute Error: 2.5
Mean Squared Error: 11.75
Root Mean Squared Error: 3.43
Explained Variance Score: 0.983605334890186


**Forecasting**

In [30]:
# Set x_forecast equal to the last 10 rows of the original data set from Adj. Close column
x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_out:]
print(x_forecast)

[[64.18076227 -0.67061144  1.44294414]
 [63.81542301 -0.29204832  0.75888697]
 [64.06181461  0.29263102  1.02122016]
 [63.75594918  0.50897402  1.26599147]
 [63.38211366  0.86533261  1.21809651]
 [62.51549495  0.82214305  1.20956782]
 [61.7678239  -1.15567641  1.85694635]
 [62.01421549 -1.45808019  1.87696945]
 [62.99128561  0.21627467  1.03857567]
 [62.78737533 -1.24281705  2.12449256]]


In [31]:
# Printing lR model predictions for the next '10' days
lr_prediction = lr.predict(x_forecast)
print(lr_prediction)

[64.59025946 63.6275686  63.57580791 63.24021801 62.59494194 61.77417689
 62.78050927 63.24875206 62.59204695 63.97071605]


**Creating final function pricepredictor for priceprediction**

In [32]:
 def pricepredictor(forecast_days): 
        
    # A variable for predicting 'n' days out into the future
    forecast_out = forecast_days
    df['Prediction'] = df[['Adj_Close']].shift(-forecast_out)
    
    # Convert the dataframe to a numpy array
    X = np.array(df.drop(['Prediction'],1))
    
    #Remove the last '10' rows
    X = X[:-forecast_out]
    
    # Convert the dataframe to a numpy array 
    y = np.array(df['Prediction'])
    
    # Get all of the y values except the last '10' rows
    y = y[:-forecast_out]

    # Split the data into 80% training and 20% testing
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Create and train the Linear Regression  Model
    lr = LinearRegression()
    
    # Train the model
    lr.fit(x_train, y_train)

    # The best possible score is 1.0
    lr_confidence = lr.score(x_test, y_test)
    print('The Confidence {}\n'.format(lr_confidence))
    
    # Set x_forecast equal to the last 10 rows of the original data set from Adj. Close column
    x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_out:]

    
    # Print linear regression model predictions for the next '10' days
    lr_prediction = lr.predict(x_forecast)
    print('Prediction for next {} day price:\n {}'.format(forecast_days,lr_prediction))

**Predicting next 1 day price**

In [33]:
pricepredictor(1)

The Confidence 0.9995171252147548

Prediction for next 1 day price:
 [64.07384426]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Prediction'] = df[['Adj_Close']].shift(-forecast_out)


**Predicting next 15 days price**

In [34]:
pricepredictor(15)

The Confidence 0.9765797179236794

Prediction for next 15 day price:
 [65.03528368 67.96127627 65.84623278 65.16674867 63.78891355 64.77770448
 63.65317702 63.56017786 63.24065655 62.54382336 61.74176185 63.14446196
 63.65414085 62.60741605 64.37519564]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Prediction'] = df[['Adj_Close']].shift(-forecast_out)
