In [1]:
# add absolute path to the project root folder to the system path
import sys
import os
from pathlib import Path

notebook_path = Path().resolve()
sys.path.append(str(notebook_path.parent))
import warnings
from scipy.stats import trim_mean
from scipy.optimize import minimize
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from utils.timescale_connector import TimescaleConnector

In [8]:
# Preparing financial ratios data
def financial_ratio():
    _df = TimescaleConnector.query_financial_ratios()

    _columns = [
        "quarter",
        "net_profit",
        "profit_growth_(%)",
        "revenue",
        "revenue_growth_(%)",
        "market_capital",
        "eps_(vnd)",
        "p/e",
        "outstanding_share",
        "roe_(%)",
        "symbol",
    ]

    _df = _df[_columns]

    _trimmed_mean_roe = trim_mean(_df["roe_(%)"].dropna(), 0.1)
    _trimmed_mean_market_capital = trim_mean(_df["market_capital"].dropna(), 0.1)
    _trimmed_mean_eps = trim_mean(_df["eps_(vnd)"].dropna(), 0.1)
    _trimmed_mean_pe = trim_mean(_df["p/e"].dropna(), 0.1)
    _trimmed_mean_outstanding_share = trim_mean(_df["outstanding_share"].dropna(), 0.1)

    _df["roe_(%)"].fillna(_trimmed_mean_roe, inplace=True)
    _df["market_capital"].fillna(_trimmed_mean_market_capital, inplace=True)
    _df["eps_(vnd)"].fillna(_trimmed_mean_eps, inplace=True)
    _df["p/e"].fillna(_trimmed_mean_pe, inplace=True)
    _df["outstanding_share"].fillna(_trimmed_mean_outstanding_share, inplace=True)

    _revenue_null_value = _df[_df["revenue"].isnull()]
    _revenue_null_value.head(5)

    _financial_ratios_cleaned = _df.dropna()
    return _financial_ratios_cleaned


df = financial_ratio()


In [9]:
df


Unnamed: 0,quarter,net_profit,profit_growth_(%),revenue,revenue_growth_(%),market_capital,eps_(vnd),p/e,outstanding_share,roe_(%),symbol
0,Q4 2016,41294860102,0.949968,6.676867e+11,0.451578,1.455455e+12,817.936153,10.250735,56964988.0,0.160569,AAA
1,Q1 2017,66894667573,1.358928,7.335012e+11,0.728754,1.949325e+12,1303.709320,9.762348,59249988.0,0.195754,AAA
2,Q2 2017,39129314217,0.162533,8.538257e+11,0.650835,1.970062e+12,733.182009,9.805468,59249988.0,0.193482,AAA
3,Q3 2017,52029043164,0.345329,1.062929e+12,0.981942,2.399320e+12,948.578466,8.722134,83599988.0,0.200697,AAA
4,Q4 2017,65289636792,0.581060,1.426508e+12,1.136493,3.344000e+12,1117.613576,7.486253,167199976.0,0.202759,AAA
...,...,...,...,...,...,...,...,...,...,...,...
2067,Q3 2022,220666896949,6.246926,6.521616e+11,1.262953,8.362940e+12,931.039774,7.810071,236241246.0,0.245687,VSH
2068,Q4 2022,382126600956,0.982613,9.628110e+11,0.518069,9.520522e+12,1612.272023,7.551561,236241246.0,0.273152,VSH
2069,Q1 2023,476648446226,0.180250,8.921769e+11,0.103405,1.058361e+13,2011.079450,7.944936,236241246.0,0.272947,VSH
2070,Q2 2023,261808202789,0.018630,6.572797e+11,-0.005774,1.060723e+13,1104.623545,7.908467,236241246.0,0.265446,VSH


In [37]:
def analyze_industry_sector():
    _df_industry_sector = pd.read_csv(
        "/home/tb24/projects/rule-based-stock-recommendation-system/data/industry_sector_analysis.csv",
        encoding="ISO-8859-1",
    )
    _df_industry_sector = _df_industry_sector.dropna()
    # rename industry sector to industry_sector
    _df_industry_sector.rename(
        columns={"industry sector": "industry_sector"}, inplace=True
    )

    _corrections = {
        "Hóa Ch?t": "Hóa Chất",
        "Hóa ch?t": "Hóa Chất",
        "B?t ??ng s?n": "Bất Động Sản",
        "S?n xu?t th?c ph?m": "Sản xuất thực phẩm",
        "D?ch v? tài chính": "Dịch vụ tài chính",
        "Xây d?ng và v?t li?u": "Xây dựng và vật liệu",
        "B?o hi?m nhân th?": "Bảo hiểm nhân thọ",
        "N??c và khí ??t": "Nước và khí đốt",
        "Ph?n m?m d?ch v? máy tính": "Phần mềm dịch vụ máy tính",
        "Bán l?": "Bán lẻ",
        "Lâm nghi?p và gi?y": "Lâm nghiệp và giấy",
        "Ph?n m?m và d?ch v? máy tính": "Phần mềm và dịch vụ máy tính",
        "S?n xu?t và phân ph?i ?i?n": "Sản xuất và phân phối điện",
        "?i?n t? và thi?t b? ?i?n": "Điện tử và thiết bị điện",
        "V?n t?i": "Vận tải",
        "Kim lo?i": "Kim loại",
        "D??c ph?m": "Dược phẩm",
        "S?n xu?t d??c ph?m": "Sản xuất dược phẩm",
        "S?n xu?t d?u khí": "Sản xuất dầu khí",
        "Hàng cá nhân": "Hàng cá nhân",
        "Thi?t b?, d?ch v? và phân ph?i d?u khí": "Thiết bị, dịch vụ và phân phối dầu khí",
        "Công nghi?p n?ng": "Công nghiệp nặng",
        "Bia và ?? u?ng": "Bia và đồ uống",
        "Thi?t b? và ph?n c?ng": "Thiết bị và phần cứng",
        "Du l?ch và gi?i trí": "Du lịch và giải trí",
        "Ch? s? th? tr??ng chung": "Chỉ số thị trường chung",
    }

    _df_industry_sector.industry_sector = _df_industry_sector.industry_sector.replace(
        _corrections
    )
    _df_industry_sector = _df_industry_sector.reset_index()
    _df_industry_sector = _df_industry_sector.drop(["index"], axis=1)

    _df_industry_sector.date = pd.to_datetime(_df_industry_sector.date)
    _df_industry_sector.close = pd.to_numeric(
        _df_industry_sector.close, errors="coerce"
    )
    _df_industry_sector.dropna(subset=["close"], inplace=True)

    _df_vn_index = _df_industry_sector[_df_industry_sector.symbol == "VN-Index"]
    _df_vn_index["vn_index_change"] = _df_vn_index["close"].pct_change()
    _df_industry_sector["stock_price_change"] = _df_industry_sector.groupby("symbol").close.pct_change()

    _df_merged = _df_industry_sector.merge(
        _df_vn_index[["date", "vn_index_change"]], on="date", how="left"
    )

    _grouped = _df_merged.groupby("industry_sector").agg(
        {"stock_price_change": "mean", "vn_index_change": "mean"}
    )

    _grouped["RS"] = _grouped["stock_price_change"] / _grouped["vn_index_change"]
    _ranked_sectors = _grouped.sort_values(by="RS", ascending=False)
    _ranked_sectors.reset_index(inplace=True)
    _ranked_sectors["ranking"] = _ranked_sectors.reset_index(drop=False).index + 1
    return _ranked_sectors

In [38]:
df2 = analyze_industry_sector()
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df_vn_index["vn_index_change"] = _df_vn_index["close"].pct_change()


Unnamed: 0,industry_sector,stock_price_change,vn_index_change,RS,ranking
0,Thiết bị và phần cứng,-0.001685,-5.9e-05,28.795372,1
1,"Thiết bị, dịch vụ và phân phối dầu khí",0.001446,0.000476,3.039476,2
2,Phần mềm dịch vụ máy tính,0.001247,0.000537,2.322306,3
3,Phần mềm và dịch vụ máy tính,0.001304,0.000623,2.092782,4
4,Ngân hàng,0.000694,0.000369,1.882444,5
5,Vận tải,0.000518,0.00028,1.849879,6
6,Nước và khí đốt,-8.3e-05,-5.7e-05,1.469318,7
7,Bán lẻ,-0.000346,-0.000253,1.367405,8
8,Chỉ số thị trường chung,0.000119,0.000119,1.0,9
9,Xây dựng và vật liệu,0.000636,0.000748,0.849964,10


In [42]:
def process_dataframe(_df):
    _df["eps_growth(%)"] = (
        (_df["eps_(vnd)"] - _df.groupby("symbol")["eps_(vnd)"].shift(4))
        / _df.groupby("symbol")["eps_(vnd)"].shift(4)
    ) * 100
    _df = _df.dropna()

    _df["profit_growth_(%)"] *= 100
    _df["revenue_growth_(%)"] *= 100

    _df_fundamental = _df[
        [
            "quarter",
            "net_profit",
            "profit_growth_(%)",
            "revenue",
            "revenue_growth_(%)",
            "eps_(vnd)",
            "eps_growth(%)",
            "roe_(%)",
            "symbol",
        ]
    ]
    return _df_fundamental


_df_fundamental = process_dataframe(df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["eps_growth(%)"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["profit_growth_(%)"] *= 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["revenue_growth_(%)"] *= 100


Unnamed: 0,quarter,net_profit,profit_growth_(%),revenue,revenue_growth_(%),eps_(vnd),eps_growth(%),roe_(%),symbol
16,Q4 2020,57706672925,-2.034545e+06,2.121136e+12,1.545248e+07,303.392915,-11.822850,0.080255,AAA
17,Q1 2021,75141819491,2.848167e+07,2.284374e+12,4.517164e+07,374.909340,9.746183,0.080908,AAA
18,Q2 2021,86326563080,2.754296e+07,3.269796e+12,8.028239e+07,365.387561,-7.579244,0.075830,AAA
19,Q3 2021,62514754147,-2.042946e+07,3.409265e+12,7.666065e+07,238.526477,-45.459378,0.067289,AAA
20,Q4 2021,60517570104,4.871009e+06,4.201845e+12,9.809409e+07,200.626583,-33.872357,0.063366,AAA
...,...,...,...,...,...,...,...,...,...
2067,Q3 2022,220666896949,6.246926e+08,6.521616e+11,1.262953e+08,931.039774,-590.378907,0.245687,VSH
2068,Q4 2022,382126600956,9.826128e+07,9.628110e+11,5.180688e+07,1612.272023,98.121861,0.273152,VSH
2069,Q1 2023,476648446226,1.802500e+07,8.921769e+11,1.034047e+07,2011.079450,18.025005,0.272947,VSH
2070,Q2 2023,261808202789,1.863037e+06,6.572797e+11,-5.773878e+05,1104.623545,1.863037,0.265446,VSH
