In [17]:
import math
import os

import pandas as pd
import quandl
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Implementation of the Sentdex YouTube channel's Machine Learning playlist
https://www.youtube.com/watch?v=r4mwkS2T9aI&list=PLQVvvaa0QuDfKTOs3Keq_kaG2P55YRn5v&index=4

#### Videos 1-6 of this playlist cover applying linear regression to Google stock data


In [44]:
def getting_quandl_data(csv_path="./data/google_stocks_data.csv"):
    """
    Fetch the "WIKI/GOOGL" dataset from Quandl and save a CSV copy of it.

    Parameters
    ----------
    csv_path : str, optional
        Destination of the CSV copy. Defaults to the path this notebook has
        always used. Missing parent directories are created.

    Returns
    -------
    The object returned by ``quandl.get("WIKI/GOOGL")`` (used as a pandas
    DataFrame by the rest of this notebook).

    Notes
    -----
    The frame's index is written to the CSV. It is presumably the trading
    date (TODO confirm against the Quandl response); saving with
    ``index=False`` would discard it and leave the rows unidentifiable.
    """
    df = quandl.get("WIKI/GOOGL")

    # to_csv does not create directories, so make sure the target exists.
    target_dir = os.path.dirname(csv_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    df.to_csv(csv_path)
    return df

def filtering_columns(df):
    """
    Keep only the adjusted price/volume columns used by this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close'
        and 'Adj. Volume'; a missing column raises KeyError.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding just those five columns, in that
        order. The input frame is left untouched.
    """
    required_columns = ['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']
    # Explicit copy: later steps add columns to the result, and assigning into
    # a bare selection of another frame raises SettingWithCopyWarning.
    return df[required_columns].copy()

def adding_pct_chg(df):
    """
    Add the 'PCT_change' feature: the open-to-close move as a percentage
    of the opening price, i.e. ((Adj. Close / Adj. Open) - 1) * 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Adj. Open' and 'Adj. Close'.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the 'PCT_change' column added in place.
    """
    # The change is measured relative to the open (the starting value),
    # so the denominator is 'Adj. Open', not 'Adj. Close'.
    df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100
    return df

def high_close_pct(df):
    """
    Add the 'HL_PCT' feature: how far the adjusted high sits above the
    adjusted close, as a percentage of the close:
    (Adj. High - Adj. Close) / Adj. Close * 100.

    Note that despite the column name, the spread is high-vs-close,
    not high-vs-low.

    The column is written onto the frame passed in, and that same frame
    is returned.
    """
    adj_close = df['Adj. Close']
    spread_above_close = df['Adj. High'] - adj_close
    df['HL_PCT'] = spread_above_close / adj_close * 100
    return df

def filling_data(df, fill_value=-99999):
    """
    Replace every NaN in the frame with a sentinel value, because the
    model fitted later cannot be trained on NaNs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; it is modified in place.
    fill_value : scalar, optional
        Value substituted for each NaN. Defaults to -99999, the sentinel
        this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    df.fillna(fill_value, inplace=True)
    return df

def creating_forecast_out(df,pct_no=0.01):
    """
    Work out how many rows ahead the label should look.

    The horizon is the fraction ``pct_no`` of the number of rows in
    ``df``, rounded up to a whole number. The result is the shift later
    used to pull a future 'Adj. Close' onto each row as its target.
    """
    row_count = len(df)
    return math.ceil(row_count * pct_no)

def adding_label_target(df,forecast_col,forecast_out):
    """
    Attach the prediction target as a 'label' column.

    Each row's label is the value of ``forecast_col`` found
    ``forecast_out`` rows further down the frame. Rows that contain a
    NaN in any column afterwards (including the trailing rows that have
    no future value to borrow) are removed.

    The frame is modified in place and the same object is returned.
    """
    future_values = df[forecast_col].shift(periods=-forecast_out)
    df['label'] = future_values
    # Drop every row with at least one missing value.
    df.dropna(axis=0, how='any', inplace=True)
    return df

def creating_X_y_data(df):
    """
    Split the frame into a feature matrix and a target vector.

    The 'label' column becomes ``y``; it is then removed from ``df``
    in place, and the values of the remaining columns become ``X``.

    Returns the pair ``(X, y)``.
    """
    target = df['label'].values
    # Remove the target from the caller's frame so only features remain.
    df.drop(columns=['label'], inplace=True)
    features = df.values
    return features, target

def spliting_data_into_train_test(X,y,test_size=0.2,random_state=None):
    """
    Split features and labels into train and test subsets.

    Parameters
    ----------
    X, y : array-like
        Features and labels, passed straight to ``train_test_split``.
    test_size : float, optional
        Forwarded as ``test_size``; defaults to 0.2.
    random_state : int or None, optional
        Forwarded as ``random_state``. Leave as None for the previous
        unseeded behaviour, or pass an int so the split (and therefore the
        reported score) is reproducible between runs.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train,X_test,y_train,y_test = train_test_split(
        X,y,test_size=test_size,random_state=random_state
    )
    return X_train,X_test,y_train,y_test

def linear_regression_model(X_train,y_train,X_test,y_test):
    """
    Fit a LinearRegression on the training split and score it on the
    test split.

    Returns the fitted model together with the value of
    ``model.score(X_test, y_test)``.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    test_score = regressor.score(X_test, y_test)
    return regressor, test_score

In [45]:
# Column whose future value becomes the prediction target.
forecast_col = 'Adj. Close'

# Download, trim to the needed columns, add the two percentage features,
# then fill NaNs. Filling happens before the label is created, so the
# NaN labels produced by the shift are dropped rather than filled.
df = (
    getting_quandl_data()
    .pipe(filtering_columns)
    .pipe(adding_pct_chg)
    .pipe(high_close_pct)
    .pipe(filling_data)
)

# Label each row with the target column's value `forecast_out` rows ahead.
forecast_out = creating_forecast_out(df)
df = adding_label_target(df, forecast_col, forecast_out)

# Separate features from labels, split, then fit and score the model.
X, y = creating_X_y_data(df)
X_train, X_test, y_train, y_test = spliting_data_into_train_test(X, y)
model, accuracy = linear_regression_model(X_train, y_train, X_test, y_test)

In [46]:
# Report the test-set score returned by linear_regression_model.
print("accuracy is {}".format(accuracy))

accuract is 0.9751356508749678
