In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from datetime import datetime
import plotly.figure_factory as ff

%load_ext nb_black

In [None]:
# Load the raw historic price CSV. Numbers in the file use "," as the
# thousands separator. Dates are written as day/month/year, so they are
# converted explicitly with a fixed format in one vectorized call instead of
# the deprecated per-row `date_parser` callback of `read_csv`.
df = pd.read_csv(
    "../data_pipeline/data/VEVE_HistoricPrices_20141001-20200505.csv",
    thousands=",",
).assign(Date=lambda frame: pd.to_datetime(frame["Date"], format="%d/%m/%Y"))

In [None]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Show the rows whose Close value is exactly 0.
df.loc[df["Close"].eq(0)]

In [None]:
def impute_missing_open_with_previous_close(df):
    """Replace zero ``Open`` values with the previous row's ``Close``.

    "Previous" is determined by ``Date`` order; the earliest row has no
    predecessor and gets 0 as its previous close. An ``Open`` equal to 0 is
    treated as missing. The number of such rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Date``, ``Open`` and ``Close`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the imputed ``Open`` column and an additional
        ``Previous_Close`` helper column.
    """
    # Shift Close by one position in Date order; `assign` realigns the
    # shifted values to the original row labels.
    previous_close = df.sort_values(by=["Date"])["Close"].shift(
        periods=1, fill_value=0
    )
    new_df = df.assign(Previous_Close=previous_close)

    missing_open = new_df["Open"] == 0
    print(f"Number of rows with missing open: {int(missing_open.sum())}")

    new_df["Open"] = new_df["Open"].mask(missing_open, new_df["Previous_Close"])
    return new_df

In [None]:
def remove_zero_volume_rows(df):
    """Return a copy of ``df`` without the rows whose ``Volume`` equals 0.

    The number of rows being removed is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Volume`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Independent copy holding only the rows with non-zero volume; the
        original index labels are kept.
    """
    mask = df["Volume"] == 0
    print(f"Number of rows with zero volume: {int(mask.sum())}")
    # Return an explicit copy: callers add columns to the result, and doing
    # that on a bare boolean-index selection raises SettingWithCopyWarning.
    return df[~mask].copy()

In [None]:
# Cleaning pipeline: first fill zero Open values from the previous Close,
# then drop the rows that have zero Volume.
processed_df = (
    df
    .pipe(impute_missing_open_with_previous_close)
    .pipe(remove_zero_volume_rows)
)

In [None]:
# Line chart of the Close price over time for the cleaned data.
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=processed_df["Date"], y=processed_df["Close"], mode="lines")
)
fig.update_layout(title_text="Time Series Plot for Stock Price")
fig.show()

In [None]:
# Per-row movement: Close minus Open of the same row.
processed_df["Movement"] = processed_df["Close"] - processed_df["Open"]

In [None]:
def classify_movement(x):
    """Map a numeric movement to its direction.

    Returns 1 when ``x`` is greater than zero, -1 when it is less than zero,
    and 0 otherwise (which includes exact zero and values such as NaN that
    compare neither greater nor less than zero).
    """
    if x > 0:
        return 1
    if x < 0:
        return -1
    return 0

In [None]:
# Direction of each row's movement: 1 (up), -1 (down) or 0 (neither).
processed_df["Sign"] = processed_df["Movement"].apply(classify_movement)

In [None]:
# Scratch copy, so that the experimental `test` column built in the next
# cells is not added to processed_df itself.
processed_df_copy = processed_df.copy()

In [None]:
# Label-based alternative to classify_movement: mark positive movements with 1.
processed_df_copy.loc[processed_df_copy["Movement"].gt(0), "test"] = 1

In [None]:
# Mark negative movements with -1 in the same scratch column.
processed_df_copy.loc[processed_df_copy["Movement"].lt(0), "test"] = -1

In [None]:
# Display the scratch column. Rows whose Movement is neither positive nor
# negative were not assigned by the two cells above.
processed_df_copy["test"]

In [None]:
# Count how many rows fall into each Sign value.
print(f"Distribution:\n{processed_df.Sign.value_counts()}")

In [None]:
# Inspect the rows classified with Sign 0.
processed_df.loc[processed_df["Sign"].eq(0)]

In [None]:
def calculate_return(df):
    """Compute the open-to-close return over the rows of ``df``.

    The rows are ordered by ``Date``; the return is the last row's ``Close``
    divided by the first row's ``Open``, minus 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Date``, ``Open`` and ``Close`` columns.

    Returns
    -------
    float
        The return as a fraction (0.2 means +20%). NaN when ``df`` has no
        rows, for example a calendar bucket that contains no data, instead
        of raising ``IndexError``.
    """
    if df.empty:
        # No first/last row to read; report the return as missing.
        return float("nan")

    sorted_df = df.sort_values(by=["Date"])
    first_open = sorted_df["Open"].iloc[0]
    last_close = sorted_df["Close"].iloc[-1]
    return last_close / first_open - 1

In [None]:
# Return per weekly bucket: group the rows by Date with weekly frequency and
# apply calculate_return to each group, then summarise the results.
weekly_return = (
    processed_df
    .groupby(pd.Grouper(key="Date", freq="W"))
    .apply(calculate_return)
)
weekly_return.describe()

In [None]:
# Histogram and density curve of the weekly returns. This cell must read
# `weekly_return`: the monthly series is only defined further down, so using
# it here fails on a fresh kernel and would not match the weekly title.
fig = ff.create_distplot([weekly_return.values], group_labels=["Weekly Return"])
fig.update_layout(title_text="Hist and Curve Plot for Weekly Return")
fig.show()

In [None]:
# Weekly returns over time, one marker per weekly bucket.
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=weekly_return.index, y=weekly_return.values, mode="lines+markers")
)
fig.update_layout(title_text="Time Series Plot for Weekly Return")
fig.show()

In [None]:
# Return per monthly bucket: group the rows by Date with monthly frequency
# and apply calculate_return to each group, then summarise the results.
montly_return = (
    processed_df
    .groupby(pd.Grouper(key="Date", freq="M"))
    .apply(calculate_return)
)
montly_return.describe()

In [None]:
# Histogram and density curve of the monthly returns.
fig = ff.create_distplot(
    hist_data=[montly_return.values],
    group_labels=["Monthly Return"],
)
fig.update_layout(title_text="Hist and Curve Plot for Monthly Return")
fig.show()

In [None]:
# Monthly returns over time, one marker per monthly bucket.
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=montly_return.index, y=montly_return.values, mode="lines+markers")
)
fig.update_layout(title_text="Time Series Plot for Monthly Return")
fig.show()

In [None]:
# Load the processed price data from the pipeline's parquet output.
# NOTE(review): this rebinds `df`, which earlier held the raw CSV frame, so
# re-running earlier cells after this one will see a different dataset.
df = pd.read_parquet("../data_pipeline/data/processed_price.parquet",)

In [None]:
# Display the frame loaded from the parquet file.
df

In [None]:
# Column dtypes and non-null counts, with memory usage computed in "deep" mode.
df.info(memory_usage="deep")