# **Import Modules and Data**

In [1]:
!pip install pandas_ta

Collecting pandas_ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.1/115.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: pandas_ta
  Building wheel for pandas_ta (setup.py) ... [?25l- \ | done
[?25h  Created wheel for pandas_ta: filename=pandas_ta-0.3.14b0-py3-none-any.whl size=218923 sha256=de2aae36ede6be0e63791922fd40055e5b1affd1902faa6b4730a840610facf7
  Stored in directory: /root/.cache/pip/wheels/0b/81/f0/cca85757840e4616a2c6b9fe12569d97d324c27cac60724c58
Successfully built pandas_ta
Installing collected packages: pandas_ta
Successfully installed pandas_ta-0.3.14b0
[0m

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pandas_ta as ta
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from statsmodels.tsa.stattools import pacf, acf
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import pandas_ta as ta
import plotly.graph_objects as go

  shapely_geos_version, geos_capi_version_string


In [3]:
def _load_by_fund_symbol(csv_path: str) -> pd.DataFrame:
    """Read the CSV at ``csv_path`` and index the resulting frame by its ``fund_symbol`` column."""
    return pd.read_csv(csv_path).set_index("fund_symbol")


# Both tables are indexed by ``fund_symbol`` so the functions below can select a fund with ``.loc[fund_code]``.
ETF_stock = _load_by_fund_symbol("/kaggle/input/mutual-funds-and-etfs/ETF prices.csv")
ETF_info = _load_by_fund_symbol("/kaggle/input/mutual-funds-and-etfs/ETFs.csv")

# **Create utility functions**

**Global Information of Data**

In [4]:
def main_information(filename: str):
    """Print a short structural summary of the CSV file at ``filename``.

    The file is read with ``pd.read_csv`` and four lines are printed:
    the row/column counts, the number of non-numeric ("categorical") and
    numeric columns, the number of columns containing at least one missing
    value, and the largest per-column missing count together with the share
    of rows it represents (in percent, rounded to 3 decimals).

    Nothing is returned.
    """
    df = pd.read_csv(filename)
    n_rows, n_cols = df.shape

    # Column counts by dtype family.
    n_categorical = len(df.select_dtypes(exclude='number').columns)
    n_numerical = len(df.select_dtypes(include='number').columns)

    # Missing-value bookkeeping, computed once and reused below.
    null_mask = df.isnull()
    n_incomplete_columns = int(null_mask.any().sum())
    worst_missing = null_mask.sum().max()
    worst_missing_pct = round((worst_missing / n_rows) * 100, 3)

    print(f'The Dataset {filename} contains {n_rows} rows and {n_cols} columns')
    print(f"There are {n_categorical} categorical columns and  {n_numerical} numerical columns")
    print(f'There are {n_incomplete_columns} columns with missing values')
    print(f'The maximum number of missing values is {worst_missing} which is representing {worst_missing_pct} percent of the column')


In [5]:
def worldwide_description(data: pd.DataFrame):
    """Display a bar chart of how many rows fall in each (fund family, fund category) pair.

    Rows missing either ``fund_family`` or ``fund_category`` are ignored.
    Bars are placed per ``fund_family`` and coloured by ``fund_category``.
    The figure is shown directly; nothing is returned.
    """
    group_keys = ["fund_family", "fund_category"]
    complete_rows = data[group_keys].dropna()

    # ``size()`` yields an unnamed Series, which ``reset_index`` exposes as column ``0``.
    pair_sizes = complete_rows.groupby(group_keys).size()
    counts = pair_sizes.reset_index().rename(columns={0: 'count'})

    chart = px.bar(
        counts,
        x="fund_family",
        y="count",
        color="fund_category",
        text_auto=True,
        title="Fund category per fund family",
    )
    chart.show()

In [6]:
def filter_by_size(data: pd.DataFrame):
    """Display bar charts counting rows per exchange, investment type and size type.

    Rows missing any of ``size_type``, ``investment_type`` or ``exchange_name``
    are ignored. One facet column is drawn per ``size_type``; within a facet,
    bars are placed per ``exchange_name`` and coloured by ``investment_type``.
    The figure is shown directly; nothing is returned.
    """
    group_keys = ["size_type", "investment_type", "exchange_name"]
    complete_rows = data[group_keys].dropna()

    # ``size()`` yields an unnamed Series, which ``reset_index`` exposes as column ``0``.
    combo_sizes = complete_rows.groupby(group_keys).size()
    counts = combo_sizes.reset_index().rename(columns={0: 'count'})

    chart = px.bar(
        counts,
        x="exchange_name",
        y="count",
        color="investment_type",
        facet_col="size_type",
        text_auto=True,
        title="Type of investment depending on the size of the fund owner of the ETF",
    )
    chart.show()

# Fund global statistics and ratios


In [7]:
def fund_composition_sector(data: pd.DataFrame, fund_code:str):
    """Build a pie trace from the ``fund_sector_*`` values of one fund.

    Parameters
    ----------
    data : pd.DataFrame
        Fund description table indexed by fund symbol. The columns at
        positions 33..43 are read and the last of them is discarded.
        NOTE(review): the positions are hard-coded; presumably they are the
        sector-weight columns of the ETF info table - confirm against the schema.
    fund_code : str
        Index label of the fund to plot (expected to match a single row).

    Returns
    -------
    plotly.graph_objects.Pie
        One slice per remaining column, labelled with the text that follows
        the ``fund_sector_`` prefix in the column name.

    Raises
    ------
    IndexError
        If one of the selected column names does not contain ``fund_sector_``.
    """
    # The previous implementation sliced a frame (`reset_index()[:-1]`) and then assigned a column
    # into that slice, which is the pattern pandas flags with SettingWithCopyWarning. Labels and
    # values are now derived directly from the selected row instead.
    sector_values = data.iloc[:, 33:44].loc[fund_code].iloc[:-1]
    sector_labels = [column.split("fund_sector_")[1] for column in sector_values.index]
    return go.Pie(values=sector_values.tolist(), labels=sector_labels)

In [8]:
def fund_composition_bond(data: pd.DataFrame, fund_code:str):
    """Build a pie trace from the bond-related values of one fund.

    Parameters
    ----------
    data : pd.DataFrame
        Fund description table indexed by fund symbol. The columns at
        positions 48..58 are read, the last of them is discarded, and
        entries that are NaN for this fund are dropped.
        NOTE(review): the positions are hard-coded - confirm against the schema.
    fund_code : str
        Index label of the fund to plot (expected to match a single row).

    Returns
    -------
    plotly.graph_objects.Pie
        One slice per remaining entry after skipping the first two, labelled
        with the text that follows ``fund_`` in the column name. When nothing
        is left to plot, a message is printed and an empty pie is returned so
        that callers passing the result to ``fig.add_trace`` keep working.

    Raises
    ------
    IndexError
        If one of the remaining column names does not contain ``fund_``.
    """
    bond_values = data.iloc[:, 48:59].loc[fund_code].iloc[:-1].dropna()
    bond_labels = [column.split("fund_")[1] for column in bond_values.index]

    # The first two remaining entries are skipped, as before.
    # NOTE(review): the skip is positional and happens after `dropna`, so which entries are
    # skipped depends on which values are missing for this fund - confirm this is intended.
    pie_values = bond_values.iloc[2:].tolist()
    pie_labels = bond_labels[2:]

    # The old guard (`len(df.columns) == 2`) was always true right after the columns had been
    # renamed to two names, so this message could never be printed.
    if not pie_values:
        print("No value or no bond in the ETF")

    return go.Pie(values=pie_values, labels=pie_labels)

In [9]:
def fund_fundamental_ratio(data: pd.DataFrame, fund_code:str):
    """Build a one-row table trace of the ``fund_price_*`` ratios of one fund.

    Parameters
    ----------
    data : pd.DataFrame
        Fund description table indexed by fund symbol. The columns at
        positions 44..47 are read.
        NOTE(review): the positions are hard-coded - confirm against the schema.
    fund_code : str
        Index label of the fund to display.

    Returns
    -------
    plotly.graph_objects.Table
        Header cells hold the column names with the ``fund_price_`` prefix
        removed, in alphabetical order; the cell below each header holds the
        value of that same column.

    Raises
    ------
    IndexError
        If one of the selected column names does not contain ``fund_price_``.
    """
    # Locate the row through `data` itself; the previous code used the global `ETF_info` index,
    # which is only correct when `data` happens to be that very frame.
    df_ratio = data.iloc[data.index.get_loc(fund_code), 44:48]

    # Rename and sort labels and values together so that each header stays above its own value
    # (previously only the header labels were sorted).
    ratios = df_ratio.rename(lambda column: column.split("fund_price_")[1]).sort_index()

    fig = go.Table(
        header=dict(values=list(ratios.index),
                    fill_color='paleturquoise',
                    align='center', font=dict(color="black", size=12)),
        cells=dict(values=ratios.values,
                   fill_color=['lavender'] * len(ratios),
                   align='center', font=dict(color="black", size=12)))
    return fig

In [10]:
def fund_performance(data: pd.DataFrame, fund_code: str):
    """Build a table trace of per-horizon indicators for one fund.

    The row of ``fund_code`` is taken from position 120 onward. Every label
    except the last is expected to look like ``fund_<indicator>_<N>years``:
    the part after ``fund_`` is split on ``_``, the final piece is the horizon
    and the preceding pieces form the indicator name. The result is a table
    with one row per horizon (ordered by the integer in front of ``years``)
    and one column per indicator (alphabetical), preceded by a ``year`` column.

    NOTE(review): position 120 is hard-coded - confirm against the schema.
    """
    fund_perf = data.loc[fund_code].iloc[120:]

    # Text after "fund_" for every label but the last one.
    suffixes = [label.split("fund_")[1] for label in fund_perf.index[:-1]]
    perf_indicators = sorted({"_".join(suffix.split("_")[:-1]) for suffix in suffixes})
    perf_years = sorted({suffix.split("_")[-1] for suffix in suffixes},
                        key=lambda horizon: int(horizon.split("years")[0]))

    # One row per horizon, one cell per indicator.
    rows = []
    for horizon in perf_years:
        rows.append([fund_perf.loc[f"fund_{indicator}_{horizon}"] for indicator in perf_indicators])

    to_plot = pd.DataFrame(data=rows, index=perf_years, columns=perf_indicators)
    to_plot = to_plot.reset_index().rename(columns={"index": "year"})

    column_names = list(to_plot.columns)
    header_style = dict(values=column_names,
                        fill_color='paleturquoise',
                        align='center', font=dict(color="black", size=12))
    # First column (the horizon) is tinted, the indicator columns are white.
    cell_style = dict(values=[to_plot[name] for name in to_plot.columns],
                      fill_color=['lavender'] + ["white"] * (len(to_plot.columns) - 1),
                      align='center', font=dict(color="black", size=12))
    return go.Table(header=header_style, cells=cell_style)

**Fund Price Evolution over Time**

In [11]:
def fund_evolution(data: pd.DataFrame, fund_code: str, kind: str):
    """Build the price, moving-average and MACD traces for one fund.

    Parameters
    ----------
    data : pd.DataFrame
        Price table indexed by fund symbol; the columns ``price_date``,
        ``open``, ``high``, ``low``, ``close`` and ``adj_close`` are read.
        The frame passed in is not modified.
    fund_code : str
        Index label selecting the rows of the fund.
    kind : str
        ``"candle"`` draws the price as a candlestick (open/high/low/close);
        any other value draws a line of ``adj_close``.

    Returns
    -------
    list
        Plotly traces in this order: the price trace (named
        ``Closing Price``), one line each for ``SMA30``, ``SMA60``, ``SMA90``,
        ``CMA30`` and ``EWMA30`` (all computed from ``adj_close``), and finally
        the ``MACD_12_26_9`` line.
    """
    fund_rows = data.loc[fund_code]
    adj_close = fund_rows['adj_close']

    # Rolling means over 30/60/90 rows, an expanding (cumulative) mean and an
    # exponentially weighted mean with span 30.
    overlays = {
        "SMA30": adj_close.rolling(30).mean(),
        "SMA60": adj_close.rolling(60).mean(),
        "SMA90": adj_close.rolling(90).mean(),
        "CMA30": adj_close.expanding().mean(),
        "EWMA30": adj_close.ewm(span=30).mean(),
    }
    # `assign` returns a new frame. The previous code wrote the columns straight into the
    # `data.loc[fund_code]` selection, which pandas flags with SettingWithCopyWarning.
    fund_perf = fund_rows.assign(**overlays)
    # `append=True` adds the MACD columns to `fund_perf`; "MACD_12_26_9" is read back below.
    fund_perf.ta.macd(close='adj_close', fast=12, slow=26, signal=9, append=True)

    dates = fund_perf['price_date']
    if kind == "candle":
        price_trace = go.Candlestick(name='Closing Price',
                                     x=dates,
                                     open=fund_perf['open'],
                                     high=fund_perf['high'],
                                     low=fund_perf['low'],
                                     close=fund_perf['close'])
    else:
        price_trace = go.Scatter(name='Closing Price', x=dates, y=fund_perf['adj_close'])

    fig_list = [price_trace]
    fig_list.extend(go.Scatter(name=column, x=dates, y=fund_perf[column]) for column in overlays)
    fig_list.append(go.Scatter(name="MACD_12_26_9", x=dates, y=fund_perf["MACD_12_26_9"]))
    return fig_list

**Return Analysis**

In [12]:
def add_anotation(fig, text: str):
    """Attach a bordered, arrow-less text box to ``fig`` via ``fig.add_annotation``.

    The box is positioned in paper coordinates at (1.1, 0.8) and then shifted
    by 30 px horizontally and -500 px vertically. Nothing is returned.
    """
    annotation_options = {
        "text": text,
        "align": 'left',
        "showarrow": False,
        # Paper-relative anchor, then a pixel offset.
        "xref": 'paper',
        "yref": 'paper',
        "x": 1.1,
        "y": 0.8,
        "bordercolor": 'black',
        "borderwidth": 1,
        "xshift": 30,
        "yshift": -500,
    }
    fig.add_annotation(**annotation_options)

In [13]:
def get_full_statistics(serie: pd.Series):
    """Return summary statistics of ``serie`` as a dict.

    Keys, in order: ``mean``, ``median``, ``std``, ``skewness`` and
    ``kurtosis``; each value is the result of the corresponding pandas
    ``Series`` method (``mean``, ``median``, ``std``, ``skew``, ``kurt``).
    """
    reducers = (
        ("mean", serie.mean),
        ("median", serie.median),
        ("std", serie.std),
        ("skewness", serie.skew),
        ("kurtosis", serie.kurt),
    )
    return {label: compute() for label, compute in reducers}

In [14]:
def plot_corr_serie(serie: pd.Series):
    """Build the traces of a stem-style autocorrelation plot for ``serie``.

    Missing values are dropped before calling ``acf`` with ``alpha=0.05``.
    Only the first element of the ``acf`` result (the coefficients) is
    plotted; the second element is not used.

    Returns a list containing one vertical line trace per lag (from 0 up to
    the coefficient at that lag) followed by a single marker trace over all
    lags. None of the traces appears in the legend.
    """
    corr_array = acf(serie.dropna(), alpha=0.05)
    coefficients = corr_array[0]
    n_lags = len(coefficients)

    # Stems: one two-point line per lag.
    fig_list = []
    for lag in range(n_lags):
        stem = go.Scatter(x=(lag, lag), y=(0, coefficients[lag]), mode='lines', showlegend=False)
        fig_list.append(stem)

    # Markers on top of the stems.
    fig_list.append(go.Scatter(x=np.arange(n_lags), y=coefficients, mode='markers', showlegend=False))

    return fig_list

In [15]:
def show_distribution(serie: pd.Series):
    """Return two line traces: the KDE curve of ``serie`` and a normal curve.

    Both curves are taken from figures produced by ``ff.create_distplot``:
    the KDE curve is the first trace of a histogram-less ``kde`` distplot, the
    normal curve is the second trace of a ``normal`` distplot. They are
    re-wrapped as thin scatter lines (green for the KDE, red for the normal
    curve) and returned as a ``(kde_trace, normal_trace)`` tuple.
    """
    observations = serie.values.tolist()

    kde_source = ff.create_distplot([observations], group_labels=["Return Distribution"],
                                    curve_type='kde', show_hist=False)
    normal_source = ff.create_distplot([observations], group_labels=["Serie Return"],
                                       curve_type='normal')

    def _thin_line(curve, rgba: str, label: str):
        """Copy the x/y data of ``curve`` into a 1px-wide line trace."""
        return go.Scatter(x=curve['x'], y=curve['y'], mode='lines',
                          line=dict(color=rgba, width=1),
                          name=label)

    kde_trace = _thin_line(kde_source.data[0], 'rgba(0,255,0, 0.6)', 'Return Distribution')
    normal_trace = _thin_line(normal_source.data[1], 'rgba(255,0,0, 0.6)', 'Normal Distribution')

    return kde_trace, normal_trace

In [16]:
def plot_return_over_time(data: pd.DataFrame):
    """Return a bar trace of ``data["return"]`` against ``data["price_date"]``.

    Bars are coloured by their own value using the ``viridis`` colour scale.
    """
    returns = data["return"]
    bar_colouring = dict(color=returns, colorscale="viridis")
    return go.Bar(
        x=data["price_date"],
        y=returns,
        name="Return over Time",
        marker=bar_colouring,
    )

# **Fund Prediction**

In [17]:
def filter_by_date(data: pd.DataFrame, start_date, end_date):
    """Return the rows of ``data`` whose index lies in ``[start_date, end_date]``.

    Both bounds are inclusive and are compared directly against ``data.index``.
    """
    index = data.index
    on_or_after_start = index >= start_date
    on_or_before_end = index <= end_date
    return data.loc[on_or_after_start & on_or_before_end]


In [18]:
def add_feature(data: pd.DataFrame):
    """Return a copy of ``data`` enriched with moving averages and MACD columns.

    Added columns, all derived from ``close``: ``SMA30`` and ``SMA60``
    (rolling means over 30 and 60 rows), ``CMA30`` (expanding mean),
    ``EWMA30`` (exponentially weighted mean, span 30) and the columns
    appended by ``data.ta.macd(fast=12, slow=26, signal=9)``.

    Rows containing any NaN are removed from the result; this includes the
    leading rows for which the rolling windows are not yet full.

    The input frame is left untouched. The previous implementation wrote the
    new columns into ``data`` itself, which silently changed the caller's
    frame (and raised SettingWithCopyWarning when ``data`` was a selection of
    another frame).
    """
    close = data['close']
    # `assign` returns a new frame; keyword order fixes the order of the added columns.
    enriched = data.assign(
        SMA30=close.rolling(30).mean(),
        SMA60=close.rolling(60).mean(),
        CMA30=close.expanding().mean(),
        EWMA30=close.ewm(span=30).mean(),
    )
    # `append=True` adds the MACD columns to `enriched` in place.
    enriched.ta.macd(close='close', fast=12, slow=26, signal=9, append=True)
    return enriched.dropna()

In [19]:
def get_train_test_data(data: pd.DataFrame, target:str, train_size: float=0.8):
    """Split ``data`` by row order into train/test features and targets.

    The first ``int(len(data) * train_size) + 1`` rows form the training set
    (note the extra row added by the ``+ 1``); the remaining rows form the
    test set. No shuffling is performed.

    Returns ``(X_train, y_train, X_test, y_test)`` where the ``X`` frames are
    the respective rows without the ``target`` column and the ``y`` series are
    the ``target`` column of those rows.
    """
    split_at = int(data.shape[0] * train_size) + 1
    trainset = data.iloc[:split_at, :]
    testset = data.iloc[split_at:, :]

    X_train, y_train = trainset.drop(columns=target), trainset[target]
    X_test, y_test = testset.drop(columns=target), testset[target]
    return X_train, y_train, X_test, y_test


In [20]:
def create_model_ML(model_type: str):
    """Return a two-step pipeline: min-max scaling followed by the chosen estimator.

    ``model_type`` must be a key of the internal registry, which currently
    holds only ``"Linear_Regression"``; any other value raises ``KeyError``.
    The pipeline steps are named ``scaler`` and ``rf1``.
    """
    available_models = {"Linear_Regression": LinearRegression()}
    estimator = available_models[model_type]
    pipeline_steps = [('scaler', MinMaxScaler()), ('rf1', estimator)]
    return Pipeline(steps=pipeline_steps)

In [21]:
def get_model_info(data: pd.DataFrame, model_type:str, train_size: float, target:str):
    """Fit a model on engineered features and plot predictions against the test values.

    Steps: build the pipeline with ``create_model_ML(model_type)``, enrich
    ``data`` with ``add_feature``, split it chronologically with
    ``get_train_test_data``, fit on the training part, predict the test part
    and show a line chart of the true ``target`` values with the predictions
    overlaid. The chart title reports ``model.score`` on the test split (for
    the linear-regression pipeline this is scikit-learn's R^2).

    Parameters
    ----------
    data : pd.DataFrame
        Price frame; its index is used as the x axis of the chart.
    model_type : str
        Key understood by ``create_model_ML``.
    train_size : float
        Fraction forwarded to ``get_train_test_data``.
    target : str
        Name of the column to predict.

    Nothing is returned; the figure is shown directly.
    """
    model = create_model_ML(model_type)
    X_train, y_train, X_test, y_test = get_train_test_data(add_feature(data), target, train_size)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    score = model.score(X_test, y_test)

    # Plot the column that was actually predicted: `y_test` is named after `target`, so the
    # previously hard-coded "close" only worked for that one target. The title no longer names
    # a specific company, since the function is applied to arbitrary funds.
    fig = px.line(y_test, x=y_test.index, y=target,
                  title=f"Evolution prediction for '{target}'. Score: {score}")
    fig.add_trace(go.Scatter(
                name="Prediction",
                x=y_test.index,
                y=y_predict
            ))
    fig.show()

# **Merging Functions**

In [22]:
def full_return_plot(data: pd.DataFrame, fund_code: str, plot_pacf=False):
    """Show a four-panel figure describing the returns of one fund.

    Panels: (1) percentage return over time, (2) autocorrelation of the
    returns, (3) autocorrelation of the squared returns, (4) KDE of the
    returns against a normal curve. A text box with the summary statistics
    from ``get_full_statistics`` is added as an annotation.

    Parameters
    ----------
    data : pd.DataFrame
        Price table indexed by fund symbol; ``price_date`` and ``close`` are
        read. The frame passed in is not modified.
    fund_code : str
        Index label selecting the rows of the fund.
    plot_pacf : bool, optional
        Currently unused; kept so existing calls remain valid.

    Nothing is returned; the figure is shown directly.
    """
    fund_rows = data.loc[fund_code]
    # Percentage change of `close` between consecutive rows. `assign` builds a new frame
    # instead of writing into the selection (which raised SettingWithCopyWarning); rows with
    # any NaN - including the first, which has no previous close - are then dropped.
    df = fund_rows.assign(**{"return": fund_rows["close"].pct_change(1) * 100}).dropna()
    returns = df["return"]

    fig = make_subplots(rows=3, cols=4,
                        specs=[[{"colspan": 4}, None, None, None], [{"colspan": 2}, None, {"colspan": 2}, None],
                               [{"colspan": 4}, None, None, None]],
                        vertical_spacing=0.22,
    subplot_titles=("Return of stock over Time", "Autocorrrelation of return",
                    "Autocorrelation of squared returns", "Return Distribtution vs Normal Distribution"))
    fig.add_trace(plot_return_over_time(df.loc[:, ["price_date", "return"]]), row=1, col=1)
    for trace in plot_corr_serie(returns):
        fig.add_trace(trace, row=2, col=1)
    # This panel is titled "squared returns"; it previously re-plotted the plain returns.
    for trace in plot_corr_serie(returns ** 2):
        fig.add_trace(trace, row=2, col=3)

    # Build the two distribution traces once instead of calling the helper twice.
    kde_trace, normal_trace = show_distribution(returns)
    fig.add_trace(kde_trace, row=3, col=1)
    fig.add_trace(normal_trace, row=3, col=1)

    add_anotation(fig, "Statistics for Return Distribution <br>" + " <br>".join([name + ": " + str(round(result, 2))
                  for name, result in get_full_statistics(returns).items()]))
    fig.update_layout(template="plotly_dark", xaxis_rangeslider_visible=True, xaxis4=dict(range=[-1, 1]),
                      title=f"{fund_code} Return over Time")
    fig.update_xaxes(rangeslider={'visible': False}, row=1, col=2)
    fig.update_xaxes(rangeslider={'visible': False}, row=2, col=1)
    fig['layout']['yaxis'].update(autorange=True)
    fig.show()

In [23]:
def plot_fund_performance(data_stock: pd.DataFrame, data_info: pd.DataFrame, fund_code: str, kind: str):
    """Show a dashboard for one fund: price evolution, two pies and two tables.

    Layout: row 1 spans both columns and holds the traces from
    ``fund_evolution`` (the MACD trace is drawn on the secondary y axis);
    row 2 holds the pies from ``fund_composition_sector`` and
    ``fund_composition_bond``; row 3 holds the tables from
    ``fund_fundamental_ratio`` and ``fund_performance``.

    Parameters
    ----------
    data_stock : pd.DataFrame
        Price table, forwarded to ``fund_evolution``.
    data_info : pd.DataFrame
        Fund description table, forwarded to the pie/table helpers.
    fund_code : str
        Index label of the fund in both tables.
    kind : str
        Forwarded to ``fund_evolution`` (``"candle"`` or anything else).

    Nothing is returned; the figure is shown directly.
    """
    fig = make_subplots(rows=3, cols=2,
                        vertical_spacing=0.22,
                        # The declared types now follow the traces actually placed in each cell
                        # (pies in row 2, tables in row 3); the two rows were declared the other
                        # way round before.
                        specs=[[{"colspan": 2, "secondary_y": True}, None],
                               [{"type": "pie"}, {"type": "pie"}],
                               [{"type": "table"}, {"type": "table"}],],
                        subplot_titles=(f"Evolution of {fund_code} stock price",
                            f"Activity Domain of Company composing fund {fund_code}", f"Bond repartition for fund {fund_code} - Bond Duration:  - "
                                                                                      f"Bond Maturity: Percent of bond from US Government: ",
                        "Financial Ratio", "Technical Ratio"))
    fig.add_trace(fund_composition_sector(data_info, fund_code), row=2, col=1)

    # Skip the bond pie if the helper reports that there is nothing to draw by returning None;
    # passing None to `add_trace` would raise.
    bond_trace = fund_composition_bond(data_info, fund_code)
    if bond_trace is not None:
        fig.add_trace(bond_trace, row=2, col=2)

    fig.add_trace(fund_fundamental_ratio(data_info, fund_code), row=3, col=1)
    fig.add_trace(fund_performance(data_info, fund_code), row=3, col=2)

    for trace in fund_evolution(data_stock, fund_code, kind):
        # MACD traces go on the secondary y axis of the price panel.
        on_secondary_axis = trace.name.startswith("MACD")
        fig.add_trace(trace, row=1, col=1, secondary_y=on_secondary_axis)

    fig.update_layout(template="plotly_dark", title=f"{fund_code} Performance")
    fig.show()

# **Results**

In [24]:
# Return diagnostics for the fund with symbol "AGZ".
full_return_plot(data=ETF_stock, fund_code="AGZ")


In [25]:
# Performance dashboard for the fund with symbol "AADR", with the price drawn as candlesticks.
plot_fund_performance(
    data_stock=ETF_stock,
    data_info=ETF_info,
    fund_code="AADR",
    kind="candle",
)

In [26]:
# Fit the linear-regression pipeline on the "AADR" rows (indexed by date) and predict `close`.
get_model_info(
    data=ETF_stock.loc["AADR", :].set_index("price_date"),
    model_type="Linear_Regression",
    train_size=0.8,
    target="close",
)