In [1]:
import os
from datetime import datetime as dt
from datetime import timezone as tz

# Treat the parent of the current working directory as the repository root.
repo_dir = os.path.dirname(os.getcwd())
assert os.path.exists(repo_dir)

# Create the log and data folders up front (a no-op when they already exist).
logging_dir = os.path.join(repo_dir, 'logs')
data_dir = os.path.join(repo_dir, 'data')
os.makedirs(logging_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

# Fundamentals are read from a per-day sub-folder of the data folder, named
# after the current UTC date.
fundamentals_dir = os.path.join(repo_dir, 'data')
os.makedirs(fundamentals_dir, exist_ok=True)

daily_date_str = f'{dt.now(tz.utc):%Y%m%d}'
# daily_date_str = 'YYYYMMDD'

fundamentals_dir = os.path.join(fundamentals_dir, daily_date_str)

# Collect every regular file directly inside the day's folder (sub-folders
# are skipped); the list stays empty when the folder is absent.
file_paths = []

if not os.path.exists(fundamentals_dir):
	print(f'Directory {fundamentals_dir} does not exist')
else:
	file_paths = list(filter(
		os.path.isfile,
		(os.path.join(fundamentals_dir, entry) for entry in os.listdir(fundamentals_dir)),
	))

print(f'Found {len(file_paths)} files in {fundamentals_dir}')

Directory /Users/tyler.austin/Github/eodhd-ez/data/20250128 does not exist
Found 0 files in /Users/tyler.austin/Github/eodhd-ez/data/20250128


In [None]:
import json

# Maps ticker code (the file's General.Code value) to the full parsed payload.
ticker_fundamentals = {}
# Paths of the files that load_fundamentals rejected.
non_standard_data = []


def check_fundamental_data(data):
	"""Report whether a parsed fundamentals payload has a usable ticker code.

	A payload is accepted only when it is a dict whose ``'General'`` entry is
	itself a dict containing a ``'Code'`` key. Any other shape (``None``, a
	list, a null or non-dict ``'General'`` section) is rejected rather than
	raising, so callers can treat the result as a plain yes/no.

	Args:
		data: Object produced by parsing a fundamentals JSON file.

	Returns:
		bool: True when ``data['General']['Code']`` can be read safely.
	"""
	if not isinstance(data, dict):
		return False
	general = data.get('General')
	# A null / list / string 'General' would make the later ['Code'] lookup fail.
	return isinstance(general, dict) and 'Code' in general


def load_fundamentals(path):
	"""Load one fundamentals JSON file and return its ticker code and payload.

	Args:
		path: Path of the JSON file to read.

	Returns:
		tuple: ``(symbol_code, data)`` where ``symbol_code`` is
		``data['General']['Code']``. ``(None, None)`` is returned when the
		file cannot be decoded/parsed or fails ``check_fundamental_data``;
		in both cases ``path`` is appended to the module-level
		``non_standard_data`` list.
	"""
	try:
		# Read as UTF-8 explicitly instead of relying on the platform's
		# locale encoding, which differs between machines.
		with open(path, 'r', encoding='utf-8') as f:
			data = json.load(f)
	except (json.JSONDecodeError, UnicodeDecodeError):
		# A single corrupt file should not abort the whole batch load.
		non_standard_data.append(path)
		return None, None
	if not check_fundamental_data(data):
		non_standard_data.append(path)
		return None, None
	return data['General']['Code'], data


# Load every discovered file; rejected files come back as (None, None) and
# are simply skipped here.
for file_path in file_paths:
	ticker, fundamentals = load_fundamentals(file_path)
	if ticker is None:
		continue
	ticker_fundamentals[ticker] = fundamentals

print(f'Loaded {len(ticker_fundamentals)} tickers')
print(f'Found {len(non_standard_data)} non-standard data files')

In [None]:
# Distinct GICS classification names seen across all loaded tickers. Sets are
# used while collecting so duplicates are dropped on insert instead of
# rebuilding every list on each loop iteration.
sectors = set()
groups = set()
industries = set()
sub_industries = set()

# 'General' sections of tickers that have no GicSector value, keyed by code.
missing_gics_dict = {}

for ticker, fundamentals in ticker_fundamentals.items():
	general_data = fundamentals['General']
	code = general_data['Code']
	# .get() yields None both when the key is absent and when its value is null.
	sector = general_data.get('GicSector')
	group = general_data.get('GicGroup')
	industry = general_data.get('GicIndustry')
	sub_industry = general_data.get('GicSubIndustry')

	if sector is not None:
		sectors.add(sector)
	elif code not in missing_gics_dict:
		# Only the sector decides whether a ticker counts as missing GICS data.
		missing_gics_dict[code] = general_data

	if group is not None:
		groups.add(group)

	if industry is not None:
		industries.add(industry)

	if sub_industry is not None:
		sub_industries.add(sub_industry)

# Convert to lists once, after the loop. Sorting gives a stable order across
# kernel restarts, which matters wherever later code picks the first of
# several equally good candidates.
sectors = sorted(sectors, key=str)
groups = sorted(groups, key=str)
industries = sorted(industries, key=str)
sub_industries = sorted(sub_industries, key=str)

print(f'{len(sectors)} sectors')
print(f'{len(groups)} groups')
print(f'{len(industries)} industries')
print(f'{len(sub_industries)} sub-industries')
print('\n')
print(f'{len(missing_gics_dict)} tickers with missing GICS data')

In [None]:
# Turn the values of missing_gics_dict (one 'General' section per ticker
# without a GicSector) into a DataFrame, one row per ticker.
import pandas as pd

missing_gics_vals = missing_gics_dict.values()
missing_gics = pd.DataFrame(missing_gics_vals)
# Bare expression so the notebook renders the frame as the cell output.
missing_gics

In [None]:
from difflib import SequenceMatcher


def calculate_similarity(search_term, category):
	"""Score how well a free-text industry label matches a category name.

	The score combines two signals:

	* token match: ``search_term`` is split on ``' - '``; each lower-cased
	  piece longer than one character that equals a whitespace-separated,
	  lower-cased word of ``category`` counts 1;
	* sequence similarity: ``difflib.SequenceMatcher`` ratio of the two
	  lower-cased strings (0.0 to 1.0).

	Args:
		search_term: Label to look up. ``None``, NaN or any other non-string
			is treated as empty text instead of raising.
		category: Candidate category name; non-strings are treated as empty.

	Returns:
		float: ``token_matches * 0.7 + sequence_ratio * 0.3``.
	"""
	# Missing values arrive as None or float NaN; neither has .lower()/.split().
	search_text = search_term if isinstance(search_term, str) else ''
	category_text = category if isinstance(category, str) else ''

	# Token-based matching ('' splits to [''], which the length filter drops).
	search_tokens = [token.lower() for token in search_text.split(' - ') if len(token) > 1]
	category_tokens = set(category_text.lower().split())
	token_match_score = sum(1 for token in search_tokens if token in category_tokens)

	# Order and sequence similarity
	sequence_similarity = SequenceMatcher(None, search_text.lower(), category_text.lower()).ratio()

	# Weighted score: token match + sequence similarity
	return token_match_score * 0.7 + sequence_similarity * 0.3


def search_category(search_term, categories):
	"""Pick the category that scores highest against ``search_term``.

	``None`` entries in ``categories`` are ignored. When several categories
	share the top score, the one appearing first in ``categories`` wins.

	Args:
		search_term: Label passed through to ``calculate_similarity``.
		categories: Iterable of candidate category names.

	Returns:
		tuple: ``(best_category, best_score)``, or ``(None, 0)`` when there is
		no non-None candidate.
	"""
	scored = [
		(candidate, calculate_similarity(search_term, candidate))
		for candidate in categories
		if candidate is not None
	]
	if not scored:
		return None, 0
	# max() keeps the first of equally scored pairs, like a stable descending sort.
	top_name, top_score = max(scored, key=lambda pair: pair[1])
	return top_name, top_score


# Distinct 'Industry' labels of the tickers that lack GICS data. The column is
# absent when the frame is empty (no tickers loaded / none missing), and
# missing cells surface as NaN, which is truthy; keep only non-empty strings.
if 'Industry' in missing_gics.columns:
	unique_industries = [
		industry for industry in missing_gics['Industry'].unique()
		if isinstance(industry, str) and industry
	]
else:
	unique_industries = []
best_matches = []

# For every label, record the closest name at each GICS level with its score.
for search in unique_industries:
	best_sector, sector_score = search_category(search, sectors)
	best_group, group_score = search_category(search, groups)
	best_industry, industry_score = search_category(search, industries)
	best_sub_industry, sub_industry_score = search_category(search, sub_industries)
	best_matches.append({
		'Search Industry': search,
		'GICS Sub-Industry': best_sub_industry,
		'Sub-Industry Score': sub_industry_score if sub_industry_score else 0,
		'GICS Industry': best_industry,
		'Industry Score': industry_score if industry_score else 0,
		'GICS Group': best_group,
		'Group Score': group_score if group_score else 0,
		'GICS Sector': best_sector,
		'Sector Score': sector_score,
	})

best_matches_df = pd.DataFrame(best_matches)
# Bare expression so the notebook renders the frame as the cell output.
best_matches_df

In [None]:
# Count how many rows of the missing_gics DataFrame carry each 'Industry' value.
missing_gics['Industry'].value_counts()

In [None]:
# gics = {
# 	'sector_name': {
# 		'code': 'XX',
# 		'groups': {
# 			'group_name': {
# 				'code': 'XXXX',
# 				'industries': {
# 					'industry_name': {
# 						'code': 'XXXXXX',
# 						'sub_industries': {
# 							'sub_industry_name': {
# 								'code': 'XXXXXXXX',
# 							},
# 						},
# 					},
# 				},
# 			},
# 		},
# 	},
# }

from gics import GICS


def build_gics_tree():
	"""Assemble the GICS hierarchy as nested dictionaries keyed by name.

	Walks four levels (sector, group, industry, sub-industry) by
	constructing ``GICS`` from each code and reading its ``children``.

	Returns:
		dict: ``{sector_name: {'code': ..., 'groups': {group_name: {'code':
		..., 'industries': {industry_name: {'code': ..., 'sub_industries':
		{sub_industry_name: {'code': ...}}}}}}}}``.
	"""
	tree = {}
	# NOTE(review): assumes GICS('') exposes the sectors as its children and
	# that every child supports ['code'] / ['name'] lookups; confirm against
	# the installed gics package.
	for sector in GICS('').children:
		sector_code = sector['code']
		sector_name = sector['name']
		group_map = {}
		tree[sector_name] = {'code': sector_code, 'groups': group_map}

		for group in GICS(sector_code).children:
			group_code = group['code']
			group_name = group['name']
			industry_map = {}
			group_map[group_name] = {'code': group_code, 'industries': industry_map}

			for industry in GICS(group_code).children:
				industry_code = industry['code']
				industry_name = industry['name']
				sub_industry_map = {}
				industry_map[industry_name] = {
					'code': industry_code,
					'sub_industries': sub_industry_map,
				}

				for sub_industry in GICS(industry_code).children:
					sub_industry_code = sub_industry['code']
					sub_industry_name = sub_industry['name']
					sub_industry_map[sub_industry_name] = {'code': sub_industry_code}

	return tree


# Generate the GICS tree: a nested dict keyed by sector name at the top level.
gics_tree = build_gics_tree()

In [None]:
# Show the top-level keys of the tree, i.e. the sector names.
gics_tree.keys()

In [None]:
import pandas as pd

# Export the 'General' section of every ticker without a GICS sector to CSV.
# repo_dir comes from the setup cell; it is deliberately not overridden with a
# machine-specific absolute path so the notebook runs on any checkout.
analysis_dir = os.path.join(repo_dir, 'analysis')
os.makedirs(analysis_dir, exist_ok=True)
missing_categories = os.path.join(analysis_dir, 'missing_categories.csv')
# Ticker codes live in missing_gics_dict. missing_gics is a DataFrame, and
# DataFrame.keys() returns its column labels rather than ticker codes.
missing_gics_keys = list(missing_gics_dict.keys())

missing_gics_df = pd.DataFrame([ticker_fundamentals[missing]['General'] for missing in missing_gics_keys])
missing_gics_df.to_csv(missing_categories, index=False)
print(f'Missing GICS data saved to {missing_categories}')

In [None]:
"""
TODO:

* Write all GICS catalogues to file
* Lookup GICS numbers for each category
* Create a database table for Sector, Group, Industry, Sub-Industry
* Add tickers to the database tables
"""