In [51]:
# Read data sets on ETF and mutual funds 
# Identify a historical financial dataset including a measure of financial risk
import pandas as pd
etf = pd.read_csv("/Users/ishaanbabbar/6400-Project-Group2/data/ETFs.csv")
etf.head()

Unnamed: 0,fund_symbol,quote_type,region,fund_short_name,fund_long_name,currency,fund_category,fund_family,exchange_code,exchange_name,...,fund_stdev_5years,fund_sharpe_ratio_5years,fund_treynor_ratio_5years,fund_alpha_10years,fund_beta_10years,fund_mean_annual_return_10years,fund_r_squared_10years,fund_stdev_10years,fund_sharpe_ratio_10years,fund_treynor_ratio_10years
0,AAAU,ETF,US,DWS RREEF Real Assets Fund - Cl,DWS RREEF Real Assets Fund - Class A,USD,,DWS,PCX,NYSEArca,...,,,,,,,,,,
1,AADR,ETF,US,AllianzGI Health Sciences Fund,Virtus AllianzGI Health Sciences Fund Class P,USD,Foreign Large Growth,Virtus,NGM,NasdaqGM,...,19.3,0.62,9.66,3.32,0.96,0.79,73.64,16.78,0.53,8.15
2,AAXJ,ETF,US,,American Century One Choice Blend+ 2015 Portfo...,USD,Pacific/Asia ex-Japan Stk,American Century Investments,NGM,NasdaqGM,...,15.91,0.66,10.37,0.3,0.99,0.55,78.24,16.83,0.36,4.81
3,ABEQ,ETF,US,Thrivent Large Cap Growth Fund,Thrivent Large Cap Growth Fund Class A,USD,Large Value,Thrivent Funds,PCX,NYSEArca,...,,,,,,,,,,
4,ACES,ETF,US,,American Century One Choice Blend+ 2015 Portfo...,USD,Miscellaneous Sector,American Century Investments,PCX,NYSEArca,...,,,,,,,,,,


In [52]:
#Train and test several network models with the historical data
# Perform Data pre-processing and remove features in the dataset that aren't needed 
# Many columns have a higher percentage or missing columns, we will remove unnecessary columns 
# Check the percentage of missing values in each column 
missing_perc = etf.isnull().mean() * 100

# Show the columns with a higher than 50 percent missing values and drop them
missing_columns = missing_perc[missing_perc > 50]
etf_cleaned = etf.drop(columns = missing_columns.index)

# Fill the remaining missing values with median to retain integrity in the data and make it numeric
numeric_cols = etf_cleaned.select_dtypes(include = ['float64', 'int64']).columns
etf_cleaned[numeric_cols] = etf_cleaned[numeric_cols].fillna(etf_cleaned[numeric_cols].median())
etf_cleaned.head()

Unnamed: 0,fund_symbol,quote_type,region,fund_short_name,fund_long_name,currency,fund_category,fund_family,exchange_code,exchange_name,...,category_return_2009,category_return_2008,category_return_2007,fund_alpha_3years,fund_beta_3years,fund_mean_annual_return_3years,fund_r_squared_3years,fund_stdev_3years,fund_sharpe_ratio_3years,fund_treynor_ratio_3years
0,AAAU,ETF,US,DWS RREEF Real Assets Fund - Cl,DWS RREEF Real Assets Fund - Class A,USD,,DWS,PCX,NYSEArca,...,0.32673,-0.39004,0.08189,13.18,0.07,1.23,0.54,14.93,0.91,187.1
1,AADR,ETF,US,AllianzGI Health Sciences Fund,Virtus AllianzGI Health Sciences Fund Class P,USD,Foreign Large Growth,Virtus,NGM,NasdaqGM,...,0.2908,-0.42427,0.31193,-1.3,1.11,0.85,75.96,22.42,0.4,6.11
2,AAXJ,ETF,US,,American Century One Choice Blend+ 2015 Portfo...,USD,Pacific/Asia ex-Japan Stk,American Century Investments,NGM,NasdaqGM,...,0.73761,-0.51342,0.39069,1.2,0.9,0.8,74.34,18.48,0.46,7.8
3,ABEQ,ETF,US,Thrivent Large Cap Growth Fund,Thrivent Large Cap Growth Fund Class A,USD,Large Value,Thrivent Funds,PCX,NYSEArca,...,0.25157,-0.35846,0.01809,-0.715,1.02,0.88,78.01,19.66,0.56,8.4
4,ACES,ETF,US,,American Century One Choice Blend+ 2015 Portfo...,USD,Miscellaneous Sector,American Century Investments,PCX,NYSEArca,...,0.31262,-0.33986,0.01815,24.18,1.31,3.65,52.02,32.6,1.3,34.5


In [3]:
# Convert etf clened data into a csv
etf_cleaned = etf_cleaned.drop(columns = ['investment_strategy'])
etf_cleaned.to_csv("/Users/ishaanbabbar/Desktop/GT Summer 2024/DSAN 6400/project/ETF_cleaned.csv")

In [71]:
# Identify columns that are focused on fund returns for the correlation matrix and construct the network with the established edges between nodes

additional_cols = [
    'fund_annual_report_net_expense_ratio', 'fund_price_book_ratio', 'fund_price_cashflow_ratio',
    'fund_price_earning_ratio', 'fund_price_sales_ratio', 'fund_sharpe_ratio_3years',
    'fund_treynor_ratio_3years', 'fund_beta_3years', 'fund_r_squared_3years',
    'fund_stdev_3years', 'fund_sharpe_ratio_3years', 'fund_treynor_ratio_3years', 'fund_return_ytd', 
    'fund_return_1month', 'fund_return_3months', 'fund_return_1year', 'fund_return_3years', 'fund_return_2020',
    'fund_return_2019', 'fund_return_2018', 'fund_return_2017'
]

cen_data = etf_cleaned[additional_cols]


# Ensure the columns are numeric
return_columns = [col for col in cen_data.columns if 'fund' in col and 'category' not in col]

#Fully ensure all areas are numeric 
return_columns = [col for col in return_columns if pd.api.types.is_numeric_dtype(etf_cleaned[col])]

# Extract the return values 
return_data = etf_cleaned[return_columns].copy()

# Calculate correlation matrix for fund returns 
return_matrix = return_data.corr()

return_matrix

Unnamed: 0,fund_annual_report_net_expense_ratio,fund_price_book_ratio,fund_price_cashflow_ratio,fund_price_earning_ratio,fund_price_sales_ratio,fund_sharpe_ratio_3years,fund_treynor_ratio_3years,fund_beta_3years,fund_r_squared_3years,fund_stdev_3years,...,fund_treynor_ratio_3years.1,fund_return_ytd,fund_return_1month,fund_return_3months,fund_return_1year,fund_return_3years,fund_return_2020,fund_return_2019,fund_return_2018,fund_return_2017
fund_annual_report_net_expense_ratio,1.0,0.01788,0.04665,-0.0446,0.01617,-0.275047,0.005918,-0.052566,-0.072356,0.308247,...,0.005918,0.061808,0.034467,0.03036,0.060892,-0.194871,-0.129757,-0.073059,-0.092,-0.028512
fund_price_book_ratio,0.01788,1.0,0.80104,0.709158,0.555176,0.228188,0.008641,-0.000941,0.082559,-0.035833,...,0.008641,-0.046723,0.28045,0.104084,0.044516,0.207095,0.316999,0.140507,0.134646,0.082522
fund_price_cashflow_ratio,0.04665,0.80104,1.0,0.743135,0.670423,0.210015,0.010518,-0.0216,0.053885,-0.055364,...,0.010518,-0.088159,0.240499,0.060835,0.002031,0.172903,0.303401,0.115052,0.144076,0.061024
fund_price_earning_ratio,-0.0446,0.709158,0.743135,1.0,0.540094,0.200882,0.009725,-0.020438,0.008453,-0.061991,...,0.009725,-0.075425,0.222982,0.064277,-0.013157,0.151837,0.236842,0.083954,0.156524,0.003572
fund_price_sales_ratio,0.01617,0.555176,0.670423,0.540094,1.0,0.159302,0.003401,0.009973,0.026014,-0.013233,...,0.003401,-0.014352,0.195291,0.107663,0.027115,0.146758,0.164675,0.143512,0.099399,0.043885
fund_sharpe_ratio_3years,-0.275047,0.228188,0.210015,0.200882,0.159302,1.0,-0.055499,0.275494,0.167347,-0.347987,...,-0.055499,0.169206,0.15355,0.253706,0.266276,0.771484,0.548626,0.549828,-0.037133,0.39438
fund_treynor_ratio_3years,0.005918,0.008641,0.010518,0.009725,0.003401,-0.055499,1.0,-0.022772,0.022922,0.087689,...,1.0,-0.013882,0.102811,-0.046744,-0.035154,-0.106462,-0.04512,-0.081206,0.019081,-0.072578
fund_beta_3years,-0.052566,-0.000941,-0.0216,-0.020438,0.009973,0.275494,-0.022772,1.0,0.144693,0.069884,...,-0.022772,0.273697,0.150228,0.283582,0.327558,0.30554,0.202276,0.348538,-0.302837,0.208049
fund_r_squared_3years,-0.072356,0.082559,0.053885,0.008453,0.026014,0.167347,0.022922,0.144693,1.0,0.016558,...,0.022922,0.150947,0.048882,0.09631,0.185051,0.229139,0.079711,0.24962,-0.101389,0.171835
fund_stdev_3years,0.308247,-0.035833,-0.055364,-0.061991,-0.013233,-0.347987,0.087689,0.069884,0.016558,1.0,...,0.087689,0.227014,0.05642,0.131631,0.31287,-0.365637,-0.194464,0.040925,-0.348601,0.011709


In [77]:
# Calcualte the degree, betweenness and closeness centralities 
#Evaluate measures of centrality (betweenness, degree, closeness)
import numpy as np
import networkx as nx 
# Create a new network graph
G = nx.Graph()

# Add nodes(return funds in this context)
for funds in return_columns:
    G.add_node(funds)

# Add edges based on given threshold 
threshold = 0.2 
for i in range(len(return_matrix)):
    for j in range(i+1, len(return_matrix)):
        if abs(return_matrix.iloc[i,j]) > threshold:
            G.add_edge(return_matrix.index[i], return_matrix.index[j], weight = return_matrix.iloc[i,j])

# Compute centrality measures 
degree_centrality = nx.degree_centrality(G)
betweeness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)


#Compare model performance and determine best model
centrality_df = pd.DataFrame({
    "Degree Centrality": degree_centrality,
    "Betweenness Centrality": betweeness_centrality,
    "Closeness Centrality": closeness_centrality
})
centrality_df

Unnamed: 0,Degree Centrality,Betweenness Centrality,Closeness Centrality
fund_annual_report_net_expense_ratio,0.111111,0.0,0.45873
fund_price_book_ratio,0.388889,0.058397,0.573413
fund_price_cashflow_ratio,0.333333,0.027135,0.535185
fund_price_earning_ratio,0.333333,0.027135,0.535185
fund_price_sales_ratio,0.166667,0.0,0.382275
fund_sharpe_ratio_3years,0.777778,0.17991,0.729798
fund_treynor_ratio_3years,0.111111,0.0,0.0
fund_beta_3years,0.5,0.01407,0.617521
fund_r_squared_3years,0.111111,0.0,0.433934
fund_stdev_3years,0.333333,0.028994,0.55364


Going over the centrality results in the network analysis on financial funds and the levels of influence within the network. 

Degree Centrality: Higher degree centrality shows funds are more connected with multiple fund types. This can be influential towards shared movements in returns. fund_return_1year as one example has a high degree of centrality which shows high interconnection and can react similarly to a range of different markets. 

Betweenness Centrality: Funds with high centrality here can control the flow of information and how the spread of influence impacts the networks. It can be critical to know what funds might act as bridges in financial systems during a market crisis. fund_return_1year is an example of this as it has high centrality 

Closeness Centrality: High closeness centrality shows that funds can interact with each other or affect one another in the network. fund_return_1year shows this again as an example, meaning a rapid increase of changes in the market might occur potentially.

Overall for our research topic, funds like fund_return_1year which is a 1-year fund return for ETFs, can play critical roles in the network which can cause systemic risk and market stability. Funds with high betweenness centrality can create predictions on what type of funds would be best used for a crisis that may come up, and keeping an eye on them would be beneficial. 

In [81]:
# To incorporate train and testing model for the centralities, will create a risk level to predict risk level

# Threshold numbers to determine high risk
degree_threshold = 0.5
bet_threshold = 0.1
closeness_threshold = 0.7

# Calculate risk level based on thresholds 
centrality_df['Risk level'] = ((centrality_df['Degree Centrality']> degree_threshold) |
                               (centrality_df['Betweenness Centrality']> bet_threshold) |
                               (centrality_df['Closeness Centrality']> closeness_threshold)).astype(int)
centrality_df

Unnamed: 0,Degree Centrality,Betweenness Centrality,Closeness Centrality,Risk level
fund_annual_report_net_expense_ratio,0.111111,0.0,0.45873,0
fund_price_book_ratio,0.388889,0.058397,0.573413,0
fund_price_cashflow_ratio,0.333333,0.027135,0.535185,0
fund_price_earning_ratio,0.333333,0.027135,0.535185,0
fund_price_sales_ratio,0.166667,0.0,0.382275,0
fund_sharpe_ratio_3years,0.777778,0.17991,0.729798,1
fund_treynor_ratio_3years,0.111111,0.0,0.0,0
fund_beta_3years,0.5,0.01407,0.617521,0
fund_r_squared_3years,0.111111,0.0,0.433934,0
fund_stdev_3years,0.333333,0.028994,0.55364,0


To go over the additional columns and how they influence transaction price, volume, value, loss, and Measure of risk 

Measure of Risk: fund_sharpe_ratio and fund_treynor_ratio are related to risk measurement. Sharpe ratio can help rank and evaluate performance that exceeds the risk-free rate after adjusting the risk. The Treynor ratio is also a risk-adjusted measurement return based on systematic risk. Overall they focus on risk-adjusted returns in funds.

Fund_beta is also a volatility measure that can indicate systematic risk. Fund r squared and Stdev also show a measure of risk. R_squared focuses on fund movements and is explained through movements in a benchmark index, while Stdev focuses on overall volatility to determine risk.

Loss: Hogh centrality levels, specifically betweenness centrality, can determine values that indicate loss areas with a risk level of 1 in our dataset. For example for fund_sharpe_ratio, it has a high betweenness centrality result of 0.17, which may indicate high market risk and can lead to potential loss in the network path. 

Transaction price: Raios such as fund_price_earning_ratio and price_book_ratio are directly involved with transaction price since they focus on specific transaction occurrences to determine profit. Higher centrality specifically in closeness centrality, affects how transactions are valued. 

Transaction Volume: Similar to said above about transaction price, higher centrality in trade volume can show how often it is used in market activity and influence trade behavior in funds. So price_earnings and price_cashflow have high centrality are some values that show these influences. Also, lower values such as price_sales_ratio could lead to a potential increase in buying activity, which could increase volumes.

Measure of risk: fund_sharpe_ratio_3years, fund_treynor_ratio_3years, fund_beta_3years, fund_r_squared_3years, fund_stdev_3years

Loss: Any value with a risk level for 1 

Transaction Price/Volume: fund_price_book_ratio, fund_price_cashflow_ratio, fund_price_earning_ratio, fund_price_sales_ratio

Returns: fund_return_ytd, fund_return_1month, fund_return_3months, fund_return_1year, fund_return_3years, fund_return_2020, fund_return_2019, fund_return_2018, fund_return_2017	