# 1. 

In [None]:
# crawl_and_store_news.py

import os
import time
import logging
import requests
import pandas as pd
import nltk
import spacy
import ast
from datetime import datetime, timedelta
from sqlalchemy import create_engine
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
from dotenv import load_dotenv

# ────── Configuration ──────
# Emit timestamped, level-aligned log records at INFO and above.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# 1) Load environment variables; the NewsAPI key is mandatory.
load_dotenv()
API_KEY = os.environ.get('NEWSAPI_KEY')
if not API_KEY:
    logger.error("NEWSAPI_KEY not found in environment variables")
    raise ValueError("NEWSAPI_KEY not found")

# 2) Initialise the sentiment analyser (VADER) and the spaCy pipeline used for NER.
#    The lexicon download must happen before the analyser is constructed.
nltk.download('vader_lexicon', quiet=True)
vader = SentimentIntensityAnalyzer()
nlp = spacy.load("en_core_web_lg")

# 3) Query window: the 30 days ending today, formatted as YYYY-MM-DD.
today = datetime.today()
to_date = f"{today:%Y-%m-%d}"
from_date = f"{today - timedelta(days=30):%Y-%m-%d}"

# 4) NASDAQ ticker → company-name mapping
# Builds `ticker_name_map` {lower-case symbol: company name} from the screener
# CSV. The mapping is optional: when the CSV is absent the script continues
# with an empty map (no organisation can then be resolved to a ticker).
BASE_DIR    = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
nasdaq_csv  = os.path.join(BASE_DIR, 'data', 'nasdaq_screener_1744184912302.csv')
ticker_name_map = {}
if os.path.exists(nasdaq_csv):
    df_nasdaq = pd.read_csv(nasdaq_csv)
    required_cols = {"Symbol", "Name", "Market Cap", "Sector"}
    missing = required_cols - set(df_nasdaq.columns)
    if missing:
        logger.error(f"Missing columns in NASDAQ CSV: {missing}")
        raise RuntimeError(f"NASDAQ CSV missing: {missing}")
    mask = (
        (df_nasdaq["Market Cap"] > 0) &
        df_nasdaq["Sector"].notnull() &
        # Null symbols/names would otherwise end up as NaN keys/values in the
        # map and break the string handling further down the pipeline.
        df_nasdaq["Symbol"].notnull() &
        df_nasdaq["Name"].notnull() &
        # na=False keeps the result strictly boolean so that `~` is valid even
        # when a name is missing.
        ~df_nasdaq["Name"].str.contains(
            "Units|Rights|Warrant|Preferred|Depositary|Series",
            case=False,
            na=False,
        )
    )
    # .copy() detaches the filtered frame so the column assignments below do
    # not operate on a view of df_nasdaq (avoids SettingWithCopyWarning).
    df_clean = df_nasdaq[mask].drop_duplicates(subset="Symbol").copy()
    df_clean["Symbol"] = df_clean["Symbol"].str.lower().str.strip()
    df_clean["Name"]   = df_clean["Name"].str.strip()
    ticker_name_map    = dict(zip(df_clean["Symbol"], df_clean["Name"]))
    logger.info(f"Loaded {len(ticker_name_map)} ticker-name mappings")
else:
    logger.warning("NASDAQ mapping CSV not found; continuing without ticker mapping")

# 5) Load the industry keyword list
# The file is parsed as Python source; the literal assigned to the name
# `industry_keywords` is evaluated with ast.literal_eval, so nothing in the
# file is executed.
kw_path = os.path.join(BASE_DIR, 'data', 'industry_keywords.txt')
with open(kw_path, 'r', encoding='utf-8') as f:
    contents = f.read()
mod = ast.parse(contents)
industry_keywords = []
for node in mod.body:
    if not isinstance(node, ast.Assign):
        continue
    # If several statements assign the name, the last one wins.
    if any(getattr(target, 'id', None) == 'industry_keywords' for target in node.targets):
        industry_keywords = ast.literal_eval(node.value)
if not industry_keywords:
    logger.error("No industry_keywords loaded")
    raise RuntimeError("industry_keywords.txt parsing failed")
logger.info(f"Loaded {len(industry_keywords)} industry keywords")

# 6) Crawl parameters
API_URL   = "https://newsapi.org/v2/everything"
params    = {
    'q':        'nasdaq OR stock OR technology OR innovation',
    'from':     from_date,
    'to':       to_date,
    'language': 'en',
    'pageSize': 100,
    'domains':  'cnn.com',
    'apiKey':   API_KEY,
    'sortBy':   'publishedAt'
}
MAX_PAGES = 1  # free plan: only the first 100 articles can be collected
REQUEST_TIMEOUT = 15  # seconds; requests waits indefinitely when no timeout is given

# 7) Collect news
# Every failure mode stops paging (break) instead of raising, so whatever was
# collected before the failure is still processed below.
articles = []
for page in range(1, MAX_PAGES + 1):
    params['page'] = page
    try:
        res = requests.get(
            API_URL,
            params=params,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        # Log only the exception type: the message may embed the request URL,
        # and the URL carries the apiKey query parameter.
        logger.error(f"Request to NewsAPI failed: {type(exc).__name__}")
        break

    try:
        data = res.json()
    except ValueError:
        logger.error("Invalid JSON response from NewsAPI")
        break
    if not isinstance(data, dict):
        logger.error("Unexpected API response structure")
        break

    if res.status_code != 200:
        code = data.get('code', '')
        if code == 'maximumResultsReached':
            logger.warning("Reached free-plan limit (100 articles). Stopping.")
        else:
            logger.error(f"API error {res.status_code}: {data.get('message','')}")
        break

    articles_batch = data.get('articles')
    if not isinstance(articles_batch, list):
        logger.error("Unexpected API response structure")
        break
    if not articles_batch:
        break

    for art in articles_batch:
        # `dict.get(key, '')` still returns None when the key is present with
        # a JSON null, so normalise explicitly with `or ''`.
        title       = art.get('title') or ''
        description = art.get('description') or ''
        content     = art.get('content') or ''
        articles.append({
            'title':        title,
            'description':  description,
            'content':      content,
            'url':          art.get('url') or '',
            'published_at': art.get('publishedAt') or '',
            # Non-empty parts joined by a single space.
            'full_text':    ' '.join(part for part in (title, description, content) if part)
        })

    # Pause between consecutive page requests only.
    if page < MAX_PAGES:
        time.sleep(1)

logger.info(f"Crawled total articles: {len(articles)}")

# 8) Build the DataFrame and make sure there is something to process
df = pd.DataFrame(articles)
if df.empty:
    logger.error("No articles collected; exiting")
    raise RuntimeError("No data to process")

# 9) 감성·키워드 추출 함수
def extract_positive_orgs(text, max_orgs=3):
    """Return ticker symbols of organisations mentioned in a positive text.

    Non-string input and texts whose VADER compound score is not strictly
    positive yield an empty list. Otherwise each ORG entity found by spaCy is
    matched against ``ticker_name_map`` by case-insensitive substring
    containment in either direction; only the first matching map entry is
    considered per entity. Symbols are returned in order of first appearance
    without duplicates, and scanning stops once ``max_orgs`` symbols have been
    collected (the limit is checked after each entity).
    """
    if not isinstance(text, str):
        return []
    compound = vader.polarity_scores(text)['compound']
    if compound <= 0:
        return []

    symbols = []
    for ent in nlp(text).ents:
        if ent.label_ != 'ORG':
            continue
        org = ent.text.lower()
        # First map entry whose company name contains, or is contained in,
        # the entity text; None when nothing matches.
        hit = next(
            (
                sym
                for sym, nm in ticker_name_map.items()
                if org in nm.lower() or nm.lower() in org
            ),
            None,
        )
        if hit and hit not in symbols:
            symbols.append(hit)
        if len(symbols) >= max_orgs:
            break
    return symbols

def extract_industry_keywords(text, keywords=None):
    """Return the keywords that occur in *text* as whole words.

    Matching is case-insensitive. A keyword matches only when it is not
    embedded in a longer word (so ``"ai"`` does not match ``"said"``); a
    trailing plural ``s``/``es`` is tolerated (``"semiconductor"`` matches
    ``"semiconductors"``).

    Args:
        text: Text to scan. Anything that is not a non-empty string yields [].
        keywords: Iterable of keyword strings. Defaults to the module-level
            ``industry_keywords`` list.

    Returns:
        The matching keywords, unchanged and in their original order.
    """
    import re  # local import keeps this function self-contained

    if not isinstance(text, str) or not text:
        return []
    if keywords is None:
        keywords = industry_keywords

    txt = text.lower()
    hits = []
    for kw in keywords:
        needle = kw.lower() if isinstance(kw, str) else ''
        # Skip empty keywords; the substring test is a cheap pre-filter that
        # avoids running the regex for keywords that cannot match.
        if not needle or needle not in txt:
            continue
        # Look-arounds instead of \b so that keywords starting or ending with
        # a non-word character (e.g. "c++") are still bounded correctly.
        pattern = rf'(?<!\w){re.escape(needle)}(?:s|es)?(?!\w)'
        if re.search(pattern, txt):
            hits.append(kw)
    return hits

# 10) Filter & normalise
# Keep articles that yielded at least one positive organisation OR at least
# one industry keyword.
df['positive_orgs']  = df['full_text'].apply(extract_positive_orgs)
df['industry_keys']  = df['full_text'].apply(extract_industry_keywords)
df_f = df[(df['positive_orgs'].str.len() > 0) | (df['industry_keys'].str.len() > 0)].copy()

# Lists cannot be stored in SQLite: serialise them as comma-separated strings,
# dropping empty entries.
for col in ['positive_orgs', 'industry_keys']:
    df_f[col] = df_f[col].apply(lambda lst: ','.join(s for s in lst if s))

if df_f.empty:
    logger.warning("No article matched any organisation or industry keyword")

# 11) Store in the SQLite DB (the table is replaced on every run)
db_path = os.path.join(BASE_DIR, 'data', 'cnn_news.db')
engine  = create_engine(f"sqlite:///{db_path}")
df_f.to_sql('cnn_positive_news', engine, if_exists='replace', index=False)
logger.info(f"Stored {len(df_f)} positive articles in `cnn_positive_news` table")

# 12) Statistics
# Count tokens by iterating the cells directly. Summing a Series of lists is
# quadratic and returns the integer 0 for an empty Series, which Counter()
# cannot consume.
org_counts = Counter(
    tok for cell in df_f['positive_orgs'] for tok in cell.split(',') if tok
)
industry_counts = Counter(
    tok for cell in df_f['industry_keys'] for tok in cell.split(',') if tok
)

logger.info(f"Top 기업 키워드: {org_counts.most_common(10)}")
logger.info(f"Top 산업 키워드: {industry_counts.most_common(10)}")


2. 

In [None]:
# make_industry_emb.py

import os
import json
import ast
import numpy as np
from sklearn.decomposition import PCA
import spacy
import logging

# ────── Configuration ──────
# Emit timestamped, level-aligned log records at INFO and above.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# 1) Paths (inputs and outputs all live in <project root>/data)
BASE_DIR   = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
GLOVE_PATH = os.path.join(BASE_DIR, 'data', 'glove.6B.300d.txt')
KW_PATH    = os.path.join(BASE_DIR, 'data', 'industry_keywords.txt')
EMBED_OUT  = os.path.join(BASE_DIR, 'data', 'industry_emb_32d.npy')
VOCAB_OUT  = os.path.join(BASE_DIR, 'data', 'industry_vocab.json')

# 2) spaCy model, used as a fallback for phrases GloVe cannot cover
nlp = spacy.load("en_core_web_lg")

# 3) Load the industry keywords
# The file is parsed as Python source; the literal assigned to the name
# `industry_keywords` is evaluated with ast.literal_eval (nothing is executed).
with open(KW_PATH, 'r', encoding='utf-8') as f:
    src = f.read()
mod = ast.parse(src)
industry_keywords = []
for node in mod.body:
    if not isinstance(node, ast.Assign):
        continue
    # If several statements assign the name, the last one wins.
    if any(getattr(targ, 'id', None) == 'industry_keywords' for targ in node.targets):
        industry_keywords = ast.literal_eval(node.value)

if not industry_keywords:
    logger.error("Failed to load any industry_keywords")
    raise RuntimeError("industry_keywords.txt parsing failed")
logger.info(f"Loaded {len(industry_keywords)} industry keywords")

# 4) Load GloVe vectors
# Only the tokens a keyword lookup can ask for are parsed and kept: each
# keyword itself, its whitespace-separated tokens, and the lower-cased forms
# of both. Every other line is skipped before its floats are converted, which
# avoids holding the whole embedding file in memory for a handful of lookups.
needed = set()
for kw in industry_keywords:
    for form in (kw, kw.lower()):
        needed.add(form)
        needed.update(form.split())

glove = {}
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        # "<token> <v1> <v2> ..." — split off the token first.
        word, _, rest = line.strip().partition(' ')
        if not rest or word not in needed:
            continue  # blank/malformed line, or a token nobody will look up
        glove[word] = np.array(rest.split(), dtype=np.float32)
logger.info(f"Loaded {len(glove)} GloVe vectors")

# 5) 구(phrase) 처리 함수
def get_phrase_vector(phrase: str):
    """Return an embedding for *phrase*, or None when none can be built.

    The phrase is split on whitespace and the GloVe vectors of the tokens that
    are found are averaged; tokens without a vector are ignored. Each token is
    looked up verbatim first and then lower-cased, so capitalised keywords
    still hit a lower-case vocabulary (the glove.6B files are, to my
    knowledge, uncased — TODO confirm for other GloVe variants).

    When no token is covered, the spaCy document vector is used instead; a
    zero-norm spaCy vector is reported as None.

    NOTE(review): GloVe and spaCy vectors come from different embedding
    spaces and must have the same dimensionality to be stacked later —
    confirm this holds for the models in use.
    """
    found = []
    for tok in phrase.split():
        vec = glove.get(tok)
        if vec is None:
            vec = glove.get(tok.lower())
        if vec is not None:
            found.append(vec)
    if found:
        return np.mean(found, axis=0)
    # spaCy fallback
    docvec = nlp(phrase).vector
    return docvec if np.linalg.norm(docvec) > 0 else None

# 6) Collect one vector per keyword
# A keyword that is itself a GloVe token uses that vector directly; anything
# else goes through get_phrase_vector. Keywords without any vector are
# dropped (with a warning), so `words` may be shorter than `industry_keywords`.
words, vecs = [], []
for kw in industry_keywords:
    vec = glove[kw] if kw in glove else get_phrase_vector(kw)
    if vec is None:
        logger.warning(f"No embedding for keyword: {kw}")
        continue
    words.append(kw)
    vecs.append(vec)

if not words:
    logger.error("No keywords matched any embeddings")
    raise RuntimeError("No embeddings to process")

mat = np.vstack(vecs)  # shape [len(words), vector dim]
logger.info(f"Collected embeddings for {len(words)} keywords")

# 7) PCA reduction to 32 dimensions
# PCA cannot extract more components than min(n_samples, n_features). With
# fewer than 32 usable keywords the reduction runs with as many components as
# possible and the result is zero-padded, so the saved matrix always has 32
# columns.
TARGET_DIM = 32
n_components = min(TARGET_DIM, mat.shape[0], mat.shape[1])
pca = PCA(n_components=n_components, random_state=42)
reduced = pca.fit_transform(mat)
if n_components < TARGET_DIM:
    logger.warning(
        f"Only {n_components} PCA components available; "
        f"zero-padding embeddings to {TARGET_DIM} dimensions"
    )
    reduced = np.pad(reduced, ((0, 0), (0, TARGET_DIM - n_components)))
logger.info(f"PCA-reduced shape: {reduced.shape}")

# 8) Save results (row i of the matrix belongs to words[i])
os.makedirs(os.path.dirname(EMBED_OUT), exist_ok=True)
np.save(EMBED_OUT, reduced)
with open(VOCAB_OUT, 'w', encoding='utf-8') as f:
    json.dump(words, f, ensure_ascii=False, indent=2)

logger.info(f"Saved embeddings → {EMBED_OUT}")
logger.info(f"Saved vocab      → {VOCAB_OUT}")


3. 

In [None]:
# make_promise_labels.py

import os
import json
import logging

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.preprocessing import StandardScaler

# ────── Configuration ──────
# Emit timestamped, level-aligned log records at INFO and above.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# 1) Paths
BASE_DIR   = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
DB_PATH    = os.path.join(BASE_DIR, 'data', 'cnn_news.db')
VOCAB_PATH = os.path.join(BASE_DIR, 'data', 'industry_vocab.json')
OUT_CSV    = os.path.join(BASE_DIR, 'data', 'industry_promise.csv')

# 2) Load the stored news table from SQLite
engine = create_engine(f'sqlite:///{DB_PATH}')
df = pd.read_sql_table('cnn_positive_news', engine)
logger.info(f"Loaded {len(df)} positive news articles from DB")

# 3) Load the industry keyword list; list position is used as the industry id
with open(VOCAB_PATH, 'r', encoding='utf-8') as f:
    industries = json.load(f)
logger.info(f"Loaded {len(industries)} industries from vocab")

# 4) Feature computation
# For every industry keyword:
#   freq     – number of articles whose `industry_keys` contains the keyword
#   pos_rate – share of those articles that also list at least one symbol in
#              `positive_orgs` (0.0 when the keyword never occurs)
import re  # needed only for this step

records = []
for idx, kw in enumerate(industries):
    # The keyword is escaped so regex metacharacters in it ("c++", "r&d (ai)")
    # are matched literally. Look-arounds replace \b, which cannot anchor a
    # keyword that starts or ends with a non-word character.
    pattern = rf'(?<!\w){re.escape(kw)}(?!\w)'
    mask    = df['industry_keys'].str.contains(pattern, case=False, na=False)
    df_kw   = df[mask]
    freq    = len(df_kw)
    # positive_orgs holds comma-separated symbols; a non-empty string means
    # the article also mentions a company.
    pos_rate = (df_kw['positive_orgs'].str.len() > 0).mean() if freq > 0 else 0.0
    records.append((idx, freq, pos_rate))

logger.info("Calculated raw features (freq, pos_rate) for each industry")

# 5) Standardise the features and build the label as a weighted sum
arr = np.array([(freq, rate) for _, freq, rate in records], dtype=float)
scaler = StandardScaler()
scaler.fit(arr)
scaled = scaler.transform(arr)

# Example weighting: 0.7 * freq_z + 0.3 * pos_rate_z
weights = np.array([0.7, 0.3])
labels = np.sum(scaled * weights, axis=1)

logger.info("Standardized features and computed promise_label via weighted sum")

# 6) Save to CSV: one row per industry with the label and the raw features
columns = {
    'industry_id': [rec[0] for rec in records],
    'promise_label': labels,
    'feat_freq': arr[:, 0],
    'feat_pos_rate': arr[:, 1],
}
out_df = pd.DataFrame(columns)
out_df.to_csv(OUT_CSV, index=False)
logger.info(f"Saved promise labels to {OUT_CSV} ({len(out_df)} rows)")


4. 

In [None]:
# train_promise_predictor.py

import os
import json
import logging

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import joblib

# ────── Configuration ──────
# Emit timestamped, level-aligned log records at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# 1) Paths
BASE_DIR       = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
EMBED_PATH     = os.path.join(BASE_DIR, 'data', 'industry_emb_32d.npy')
VOCAB_PATH     = os.path.join(BASE_DIR, 'data', 'industry_vocab.json')
LABEL_CSV_PATH = os.path.join(BASE_DIR, 'data', 'industry_promise.csv')
OUT_MODEL_PATH = os.path.join(BASE_DIR, 'models', 'promise_predictor.pkl')

# 2) Load data
logger.info("Loading embeddings and labels...")
embeddings = np.load(EMBED_PATH)  # one row per vocab entry
with open(VOCAB_PATH, 'r', encoding='utf-8') as f:
    vocab = json.load(f)
labels_df = pd.read_csv(LABEL_CSV_PATH)

# The three artefacts are produced by separate scripts; verify they still
# describe the same set of industries before rows are joined by position.
if len(vocab) != embeddings.shape[0]:
    logger.error(
        f"Vocab size {len(vocab)} does not match embedding rows {embeddings.shape[0]}"
    )
    raise RuntimeError("industry vocab and embeddings are out of sync")

# 3) Prepare features (X) and labels (y)
ids = labels_df['industry_id'].astype(int).values
# Negative ids would silently index from the end of the matrix, so reject
# anything outside [0, number of embedding rows).
if ids.size and (ids.min() < 0 or ids.max() >= embeddings.shape[0]):
    logger.error("industry_id values fall outside the embedding matrix")
    raise RuntimeError("industry_promise.csv and embeddings are out of sync")
X_all = embeddings[ids]                    # (M, embedding dim)
y_all = labels_df['promise_label'].values  # (M,)

logger.info(f"Prepared X_all shape={X_all.shape}, y_all shape={y_all.shape}")

# 4) Train/test split (80/20 with a fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    random_state=42,
    test_size=0.2,
)
logger.info(f"Train/Test split: {len(X_train)}/{len(X_test)}")

# 5) Fit the model (Ridge regression); fit() returns the estimator itself
model = Ridge(alpha=1.0).fit(X_train, y_train)
logger.info("Ridge regression model training complete")

# 6) Evaluate on both partitions
y_pred_train = model.predict(X_train)
r2_train     = r2_score(y_train, y_pred_train)
rmse_train   = np.sqrt(mean_squared_error(y_train, y_pred_train))

y_pred_test = model.predict(X_test)
r2_test     = r2_score(y_test, y_pred_test)
rmse_test   = np.sqrt(mean_squared_error(y_test, y_pred_test))

logger.info(f"🏅 Train  R²: {r2_train:.4f}, RMSE: {rmse_train:.4f}")
logger.info(f"🎯 Test   R²: {r2_test:.4f}, RMSE: {rmse_test:.4f}")

# 7) Persist the fitted model, creating the target directory when missing
model_dir = os.path.dirname(OUT_MODEL_PATH)
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, OUT_MODEL_PATH)
logger.info(f"✅ Model saved to {OUT_MODEL_PATH}")


5.

In [None]:
# train_contrastive.py

import os
import json
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from sqlalchemy import create_engine
import pandas as pd
from sklearn.decomposition import PCA

# 1) Paths
BASE_DIR             = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
GLOVE_TXT_PATH       = os.path.join(BASE_DIR, 'data', 'glove.6B.300d.txt')
GLOVE_PCA_EMBED_PATH = os.path.join(BASE_DIR, 'data', 'glove_pca_64d.npy')
GLOVE_PCA_VOCAB_PATH = os.path.join(BASE_DIR, 'data', 'glove_pca_vocab.json')
DB_PATH              = os.path.join(BASE_DIR, 'data', 'cnn_news.db')
OUT_MODEL_PATH       = os.path.join(BASE_DIR, 'models', 'projector.pth')

# 2) GloVe→PCA preprocessing if needed
# The cache consists of TWO files (matrix + vocab); regenerate when either is
# missing so a half-written cache cannot break the load step below.
if not (os.path.exists(GLOVE_PCA_EMBED_PATH) and os.path.exists(GLOVE_PCA_VOCAB_PATH)):
    print("🏗 Generating Glove-PCA embeddings (64d)…")
    words, vecs = [], []
    with open(GLOVE_TXT_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            vals = line.split()
            if len(vals) < 2:
                continue  # blank or malformed line
            words.append(vals[0])
            vecs.append(np.array(vals[1:], dtype=np.float32))
    mat = np.vstack(vecs)
    # Fixed seed so that regenerating the cache yields the same projection.
    pca = PCA(n_components=64, random_state=42)
    reduced = pca.fit_transform(mat)
    os.makedirs(os.path.dirname(GLOVE_PCA_EMBED_PATH), exist_ok=True)
    np.save(GLOVE_PCA_EMBED_PATH, reduced)
    with open(GLOVE_PCA_VOCAB_PATH, 'w', encoding='utf-8') as f:
        json.dump(words, f, ensure_ascii=False)
    print(f"✅ Saved PCA embeddings → {GLOVE_PCA_EMBED_PATH}")

# 3) Load PCA'd GloVe
vectors = np.load(GLOVE_PCA_EMBED_PATH)   # shape (V, D)
with open(GLOVE_PCA_VOCAB_PATH, 'r', encoding='utf-8') as f:
    vocab = json.load(f)                 # length V
# zip() would silently truncate to the shorter side and pair tokens with the
# wrong rows if the two cache files ever got out of sync.
if len(vocab) != len(vectors):
    raise RuntimeError(
        f"GloVe-PCA cache is inconsistent: {len(vocab)} tokens vs {len(vectors)} vectors"
    )
glove_pca = dict(zip(vocab, vectors))

# 4) Load CNN news and vectorize
engine = create_engine(f'sqlite:///{DB_PATH}')
df = pd.read_sql_table('cnn_positive_news', engine)
texts = df['full_text'].dropna().tolist()

def get_text_vector(text):
    """Average the PCA-reduced GloVe vectors of the tokens in *text*.

    The text is split on whitespace and every token is lower-cased before the
    lookup in ``glove_pca``; tokens without an entry are skipped. Returns None
    when no token is covered (including empty or whitespace-only text).
    """
    hits = []
    for token in text.split():
        key = token.lower()
        if key in glove_pca:
            hits.append(glove_pca[key])
    if not hits:
        return None
    return np.mean(hits, axis=0)

# Build the sample matrix: one averaged vector per article, skipping articles
# in which no token has an embedding.
sample_list = [v for v in map(get_text_vector, texts) if v is not None]
# Each training item is an (anchor, positive, negative) triple of three
# consecutive rows, so at least three samples are required. Fail here with a
# clear message instead of an obscure np.stack / DataLoader error later on.
if len(sample_list) < 3:
    raise RuntimeError(
        f"Need at least 3 vectorizable articles for contrastive training, got {len(sample_list)}"
    )
samples_np  = np.stack(sample_list)  # shape (N, D)
print(f"🧪 Total contrastive samples: {len(samples_np)}")

# 5) Dataset & DataLoader
class ContrastiveDataset(Dataset):
    """Sliding-window triplets over the rows of a 2-D array.

    Item ``i`` is ``(row i, row i+1, row i+2)``, used by the training loop as
    (anchor, positive, negative). The array is converted to a float32 tensor
    once at construction time.

    NOTE(review): "positive"/"negative" are defined purely by row adjacency —
    confirm that the row order carries the intended meaning.
    """

    def __init__(self, array: np.ndarray):
        # array: NumPy array of shape (N, D)
        self.vecs = torch.from_numpy(array.astype(np.float32))

    def __len__(self):
        # Two rows are reserved for (positive, negative); clamp at zero
        # because len() must never return a negative number.
        return max(0, self.vecs.size(0) - 2)

    def __getitem__(self, idx):
        """Return the (anchor, positive, negative) rows for item *idx*.

        Negative indices count from the end of the dataset. Raises IndexError
        for indices outside the dataset instead of wrapping around the
        underlying tensor.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError("ContrastiveDataset index out of range")
        return self.vecs[idx], self.vecs[idx + 1], self.vecs[idx + 2]

# Shuffled mini-batches of 32 triplets drawn from the sample matrix.
loader = DataLoader(
    ContrastiveDataset(samples_np),
    shuffle=True,
    batch_size=32,
)

# 6) Model & optimizer
# Embedding dimension, taken from the sample matrix (expected to be 64).
D = samples_np.shape[1]
class ContrastiveProjector(nn.Module):
    """Two-layer MLP (Linear → ReLU → Linear) that keeps the input width.

    ``dim`` is the size of both the input and the output vectors; it defaults
    to the module-level ``D``.
    """

    def __init__(self, dim=D):
        super().__init__()
        # Layer order determines the state_dict keys (net.0 / net.2).
        layers = [
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Project *x* through the MLP."""
        projected = self.net(x)
        return projected

# Use the GPU when one is available, otherwise fall back to the CPU.
device    = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model     = ContrastiveProjector(dim=D).to(device)
optimizer = Adam(model.parameters(), lr=1e-3)
loss_fn   = nn.CosineEmbeddingLoss()

# 7) Training loop
# Per batch: pull anchor and positive together (target +1) and push anchor
# and negative apart (target -1); the two cosine losses are summed.
epochs = 10
for epoch in range(1, epochs + 1):
    total_loss = 0.0
    for batch in loader:
        a, p, n = (t.to(device) for t in batch)
        optimizer.zero_grad()
        za = model(a)
        zp = model(p)
        zn = model(n)
        y_pos = torch.ones(za.size(0), device=device)
        y_neg = -y_pos
        loss = loss_fn(za, zp, y_pos) + loss_fn(za, zn, y_neg)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss of this epoch.
    print(f"Epoch {epoch}/{epochs}, Loss: {total_loss/len(loader):.4f}")

# 8) Save the projector weights (state_dict only)
model_dir = os.path.dirname(OUT_MODEL_PATH)
os.makedirs(model_dir, exist_ok=True)
torch.save(model.state_dict(), OUT_MODEL_PATH)
print(f"✅ Contrastive projector saved → {OUT_MODEL_PATH}")


6.

In [None]:
import numpy as np
import pandas as pd
import json
import joblib
import os

# ====== Path setup ======
BASE_DIR   = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
DB_PATH    = os.path.join(BASE_DIR, 'data', 'cnn_news.db')
VOCAB_PATH = os.path.join(BASE_DIR, 'data', 'industry_vocab.json')
OUT_CSV    = os.path.join(BASE_DIR, 'data', 'industry_promise.csv')
CUR_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
emb_path = os.path.join(CUR_DIR, 'data', "industry_emb_32d.npy")
vocab_path = os.path.join(CUR_DIR, 'data', "industry_vocab.json")
comp_path = os.path.join(CUR_DIR, 'data', "nasdaq_screener_1744184912302.csv")
model_path = os.path.join(CUR_DIR, 'models',"promise_predictor.pkl")
output_path = os.path.join(CUR_DIR, 'results', "company_promise_score.csv")

# ====== Load embeddings & vocab ======
industry_emb = np.load(emb_path)  # (number of industries, embedding dim)
# The vocab is written as UTF-8 with ensure_ascii=False, so it must be read
# back as UTF-8 rather than with the platform's default encoding.
with open(vocab_path, "r", encoding="utf-8") as f:
    industry_vocab = json.load(f)
# Lower-cased industry name → row index in industry_emb.
industry2idx = {v.lower(): i for i, v in enumerate(industry_vocab)}

# ====== Load the company CSV ======
companies = pd.read_csv(comp_path)
# Columns used below: Name, Industry
# ====== 기업 임베딩 생성 함수 ======
def extract_industries(industry_str):
    """Split a comma-separated industry cell into clean lower-case names.

    Each entry is stripped of surrounding whitespace and lower-cased; empty
    entries (from "a,,b", a trailing comma or an empty cell) are dropped.
    Missing values (None/NaN) and any other non-string input yield [].
    """
    if not isinstance(industry_str, str):
        return []
    cleaned = (part.strip().lower() for part in industry_str.split(","))
    return [name for name in cleaned if name]

def get_company_emb(inds):
    """Return the mean industry embedding for the given industry names.

    Names missing from ``industry2idx`` are ignored. When none of the names is
    known, a zero vector with the width of ``industry_emb`` is returned.
    """
    rows = [industry2idx[name] for name in inds if name in industry2idx]
    if not rows:
        return np.zeros(industry_emb.shape[1])
    return np.mean(industry_emb[rows], axis=0)

# ====== Build an embedding for every company ======
# One vector per row of `companies`, in row order.
company_emb_list = [
    get_company_emb(extract_industries(industry))
    for industry in companies['Industry']
]
company_embs = np.stack(company_emb_list)  # (number of companies, embedding dim)

# ====== Predict the promise score ======
# NOTE(review): joblib.load unpickles the file — only load model files that
# this project produced itself.
predictor = joblib.load(model_path)
scores = predictor.predict(company_embs)
companies['promise_score'] = scores

# ====== Save results ======
# Create the output directory first; to_csv does not create missing parents.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
companies.to_csv(output_path, index=False)
print(f'>> 결과 저장 완료: {output_path}')
print(companies[['Name', 'Industry', 'promise_score']].sort_values('promise_score', ascending=False).head(20))
