In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import datetime

In [2]:
df = pd.read_csv('GOOG.csv', delimiter=',')
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
749,2020-07-29,1506.319946,1531.251953,1501.329956,1522.02002,1522.02002,1106500
750,2020-07-30,1497.0,1537.869995,1492.219971,1531.449951,1531.449951,1671400
751,2020-07-31,1505.01001,1508.949951,1454.030029,1482.959961,1482.959961,3439900
752,2020-08-03,1486.640015,1490.469971,1465.640015,1474.449951,1474.449951,2330200
753,2020-08-04,1476.569946,1485.560059,1458.650024,1464.969971,1464.969971,1902200


In [3]:
# Close and Adj Close hold the same values
for item1, item2 in zip(df['Close'], df['Adj Close']):
    result = (item1 == item2)
print(result)

True


In [4]:
df['Pct_Change'] = (df['Close'] - df['Open']) / df['Open']*100

In [5]:
df.isnull().sum()

Date          0
Open          0
High          0
Low           0
Close         0
Adj Close     0
Volume        0
Pct_Change    0
dtype: int64

In [6]:
df_copy = df.copy()[['Close', 'Pct_Change']]

In [7]:
# Forecast 7 days into the future
forecast_col = 'Close'
forecast_out = 7
df_copy['Prediction'] = df[forecast_col].shift(-forecast_out)
print(df_copy)

           Close  Pct_Change  Prediction
0     929.359985    0.032289  926.960022
1     926.789978   -0.032365  910.979980
2     922.900024    0.248752  910.669983
3     907.239990   -1.123644  906.659973
4     914.390015    0.707077  924.690002
..           ...         ...         ...
749  1522.020020    1.042280         NaN
750  1531.449951    2.301266         NaN
751  1482.959961   -1.465110         NaN
752  1474.449951   -0.819974         NaN
753  1464.969971   -0.785603         NaN

[754 rows x 3 columns]


In [8]:
# x is features to predict y, y is the price prediction
x = np.array(df_copy.drop(['Prediction'], 1))
x = preprocessing.scale(x)
x_recent = x[-forecast_out:]
x = x[:-forecast_out]

y = np.array(df_copy['Prediction'])
y = y[:-forecast_out]

x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2)
linear = LinearRegression()
linear.fit(x,y)
print(f"Accuracy of the model is {linear.score(x_test, y_test)}")

Accuracy of the model is 0.8896907729251312


In [9]:
df_copy.tail()

Unnamed: 0,Close,Pct_Change,Prediction
749,1522.02002,1.04228,
750,1531.449951,2.301266,
751,1482.959961,-1.46511,
752,1474.449951,-0.819974,
753,1464.969971,-0.785603,


In [10]:
forecast = linear.predict(x_recent)
print(forecast)

[1517.40294728 1491.86292623 1509.52699906 1517.10176667 1475.13350757
 1466.31224851 1457.24825841]


In [15]:
# Now I got predicted values for the next 7 days, let's prepare to plot the data 
df = df[['Date', 'Close']]

forecast_date = pd.date_range('2020-08-05', periods=forecast_out, freq='D')
predict_df = pd.DataFrame({'Date':forecast_date, "Close":forecast})
print(f'Original Table \n{df.tail()}')
print()
print(f'Prediction Table \n{predict_df}')

Original Table 
           Date        Close
756  2020-08-07  1509.526999
757  2020-08-08  1517.101767
758  2020-08-09  1475.133508
759  2020-08-10  1466.312249
760  2020-08-11  1457.248258

Prediction Table 
        Date        Close
0 2020-08-05  1517.402947
1 2020-08-06  1491.862926
2 2020-08-07  1509.526999
3 2020-08-08  1517.101767
4 2020-08-09  1475.133508
5 2020-08-10  1466.312249
6 2020-08-11  1457.248258
