In [None]:
import re
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import base64
import joblib

# Load the raw message dump; the context manager closes the file handle
# as soon as the contents have been read.
with open('dataset/message.txt', 'r', encoding='utf-8') as message_file:
    raw = message_file.read()

# Extract the Subject value: the first "Subject": "..." pair in the dump.
# The captured text is used verbatim (no escape decoding).
subject_match = re.search(r'"Subject"\s*:\s*"([^"]*)"', raw)
subject = subject_match.group(1) if subject_match else ''

# Extract the body_text value: everything between the opening quote and the
# last '"' that is followed by optional whitespace and '}' (greedy, DOTALL).
body_match = re.search(r'"body_text"\s*:\s*"(.+)"\s*}', raw, re.DOTALL)
if body_match:
    # Decode backslash escapes (\n, \", \uXXXX, ...) in the captured text.
    # Encoding with latin-1 + backslashreplace first rewrites every character
    # outside latin-1 (e.g. Hangul) as an ASCII \uXXXX escape, so the
    # unicode_escape decoder restores it intact. Encoding as UTF-8 instead
    # makes unicode_escape read each UTF-8 byte as a separate latin-1
    # character, which garbles all non-ASCII text.
    escaped_bytes = body_match.group(1).encode('latin-1', 'backslashreplace')
    body = escaped_bytes.decode('unicode_escape')
else:
    body = ''
# Kept from the original: drop a backslash that still precedes a double quote.
body = body.replace('\\"', '"')

# Build a one-row DataFrame holding the raw subject and body.
df_new = pd.DataFrame([{'subject': subject, 'body': body}])

# 데이터 전처리 함수
def clean_text(text):
    """Normalize a raw (possibly HTML) string into lowercase plain text.

    Missing values (anything ``pd.isna`` reports as NA) yield ``''``.
    Otherwise the markup is parsed with BeautifulSoup's ``html.parser`` and
    reduced to its text, every character that is not an ASCII letter, a
    digit, a Hangul syllable or whitespace is replaced by a space, runs of
    whitespace are collapsed to a single space, and the result is stripped
    and lowercased.
    """
    if pd.isna(text):
        return ''

    # Strip markup and keep only the text content.
    plain = BeautifulSoup(text, 'html.parser').get_text()
    # Replace punctuation and other symbols with spaces.
    letters_only = re.sub(r'[^a-zA-Z0-9가-힣\s]', ' ', plain)
    # Collapse whitespace runs, then trim and lowercase.
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip().lower()

def extract_urls(text):
    """Return all http/https URL-like tokens found in ``text``.

    ``text`` is converted with ``str()`` first, so non-string input is
    accepted. A match starts at ``http://`` or ``https://`` and runs up to
    the next whitespace character, so trailing punctuation is included.
    """
    url_pattern = re.compile(r'http[s]?://\S+')
    haystack = str(text)
    return url_pattern.findall(haystack)

def extract_domains(urls):
    """Return the lowercased network location of each URL in ``urls``.

    Parsing is best-effort: entries that ``urlparse`` rejects (for example a
    malformed bracketed host) or that are not string-like are skipped, as are
    URLs with an empty network location. Order and duplicates are preserved.

    Args:
        urls: Iterable of URL strings.

    Returns:
        List of lowercased ``netloc`` values, one per usable URL.
    """
    domains = []
    for url in urls:
        try:
            domain = urlparse(url).netloc
            if domain:
                domains.append(domain.lower())
        except (ValueError, TypeError, AttributeError):
            # Only swallow the errors bad input can cause; a bare ``except``
            # would also hide KeyboardInterrupt and SystemExit.
            continue
    return domains

# Text cleaning and feature generation.
# Cleaned copies of the subject and body.
df_new['clean_subject'] = df_new['subject'].map(clean_text)
df_new['clean_body'] = df_new['body'].map(clean_text)

# Character counts of the cleaned text.
df_new['subject_len'] = df_new['clean_subject'].map(len)
df_new['body_len'] = df_new['clean_body'].map(len)

# URLs are extracted from the raw body, not the cleaned one.
df_new['extracted_urls'] = df_new['body'].map(extract_urls)
df_new['num_urls'] = df_new['extracted_urls'].map(len)

# Network locations of those URLs and how many distinct ones there are.
df_new['url_domains'] = df_new['extracted_urls'].map(extract_domains)
df_new['num_unique_domains'] = df_new['url_domains'].map(
    lambda domain_list: len(set(domain_list))
)

# 7. TF-IDF transform using the previously fitted vectorizer.
# NOTE: joblib.load unpickles the file; only load artifacts from a trusted source.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')
X_tfidf_new = tfidf_vectorizer.transform(df_new['clean_body'])

# 8. Convert the TF-IDF matrix to a dense DataFrame (one column per
# vocabulary term) that shares df_new's index.
tfidf_df_new = pd.DataFrame(
    X_tfidf_new.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=df_new.index,
)
# Keep df_new's own index here as well: pd.concat(axis=1) aligns rows by
# index label, so resetting only one side would misalign rows (and introduce
# NaNs) whenever df_new does not carry a default 0..n-1 index.
num_feats_new = df_new[['subject_len', 'body_len', 'num_urls', 'num_unique_domains']]
X_final_new = pd.concat([tfidf_df_new, num_feats_new], axis=1)

print(X_final_new)


    00   01        02   03   04   05   06   07   08   10  ...  world  would  \
0  0.0  0.0  0.733253  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0   

   wrote  www  you  your  subject_len  body_len  num_urls  num_unique_domains  
0    0.0  0.0  0.0   0.0           34       209        14                   3  

[1 rows x 304 columns]


In [None]:
# Load the trained classifier from disk.
model = joblib.load('phishing_Detecting_model.joblib')

# Predicted class label for each row.
preds = model.predict(X_final_new)

# Per-class probabilities; column 1 is taken as the phishing probability
# (assumes index 1 is the phishing class -- confirm against model.classes_).
class_probabilities = model.predict_proba(X_final_new)
probs = class_probabilities[:, 1]

# Attach both results to the DataFrame.
df_new['label'], df_new['phishing_prob'] = preds, probs

# Show the outcome.
result_columns = ['subject', 'label', 'phishing_prob']
print(df_new[result_columns])

                                  subject  label  phishing_prob
0  (광고) AI 구독클럽 제휴카드 혜택으로 원하는 것만. 필요한 만큼.      0            0.5
