In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

##  1. 数据收集与预处理

In [2]:
def preprocess_text(text):
    # 简单文本清洗
    import re
    text = re.sub(r'\W+', ' ', text)
    return text.lower()

##  2. 关键词提取

In [3]:
def extract_keywords(texts, top_n=5):
    vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        indices = row.toarray().argsort()[0][-top_n:]
        keywords.append([feature_names[i] for i in indices])
    return keywords

## 3. 行业分类映射

In [4]:
def map_keywords_to_industries(keywords, industry_mapping):
    industry_hits = []
    for keyword_list in keywords:
        industries = set()
        for keyword in keyword_list:
            if keyword in industry_mapping:
                industries.update(industry_mapping[keyword])
        industry_hits.append(list(industries))
    return industry_hits

## 示例行业映射表

In [6]:
industry_mapping = {
    "oil": ["Energy", "Oil & Gas"],
    "chip": ["Technology", "Semiconductors"],
    "bank": ["Finance", "Banking"]
}

## 示例新闻数据

In [7]:
news_data = [
    "Oil prices are rising due to geopolitical tensions.",
    "New advancements in chip technology are announced.",
    "Major banks report quarterly earnings."
]

##  4. 分析流程

In [8]:
preprocessed_news = [preprocess_text(news) for news in news_data]
keywords = extract_keywords(preprocessed_news)
industries = map_keywords_to_industries(keywords, industry_mapping)

In [9]:
# 输出结果
for i, news in enumerate(news_data):
    print(f"News: {news}")
    print(f"Keywords: {keywords[i]}")
    print(f"Industries: {industries[i]}")
    print()

News: Oil prices are rising due to geopolitical tensions.
Keywords: ['tensions', 'oil', 'prices', 'rising', 'to']
Industries: ['Oil & Gas', 'Energy']

News: New advancements in chip technology are announced.
Keywords: ['new', 'chip', 'announced', 'technology', 'advancements']
Industries: ['Technology', 'Semiconductors']

News: Major banks report quarterly earnings.
Keywords: ['earnings', 'banks', 'quarterly', 'report', 'major']
Industries: []



## 示例行业映射表

In [10]:

industry_mapping = {
    "歼10c": ["航空航天", "国防军工"],
    "阵风": ["航空航天", "国防军工"],
    "武器": ["国防军工"],
    "巴基斯坦": ["国际关系", "国防军工"],
    "中国": ["国际关系", "综合"]
}

## 示例中文新闻数据

In [11]:
news_data = [
    "巴基斯坦用中国歼10c武器击落阵风飞机。",
    "中国宣布新型芯片技术突破。",
    "国际油价因地缘政治紧张局势上涨。"
]

## 4. 分析流程

In [12]:
preprocessed_news = [preprocess_text(news) for news in news_data]
keywords = extract_keywords(preprocessed_news)
industries = map_keywords_to_industries(keywords, industry_mapping)

## 输出结果

In [13]:
for i, news in enumerate(news_data):
    print(f"新闻: {news}")
    print(f"关键词: {keywords[i]}")
    print(f"关联行业: {industries[i]}")
    print()

新闻: 巴基斯坦用中国歼10c武器击落阵风飞机。
关键词: ['中国宣布新型芯片技术突破', '国际油价因地缘政治紧张局势上涨', '巴基斯坦用中国歼10c武器击落阵风飞机']
关联行业: []

新闻: 中国宣布新型芯片技术突破。
关键词: ['国际油价因地缘政治紧张局势上涨', '巴基斯坦用中国歼10c武器击落阵风飞机', '中国宣布新型芯片技术突破']
关联行业: []

新闻: 国际油价因地缘政治紧张局势上涨。
关键词: ['中国宣布新型芯片技术突破', '巴基斯坦用中国歼10c武器击落阵风飞机', '国际油价因地缘政治紧张局势上涨']
关联行业: []



# 使用FLASK实现

In [14]:
from flask import Flask, request, render_template
from sklearn.feature_extraction.text import TfidfVectorizer


In [15]:
app = Flask(__name__)

# 示例行业映射表
industry_mapping = {
    "歼10c": ["航空航天", "国防军工"],
    "阵风": ["航空航天", "国防军工"],
    "武器": ["国防军工"],
    "巴基斯坦": ["国际关系", "国防军工"],
    "中国": ["国际关系", "综合"]
}

# 关键词提取函数
def extract_keywords(text, top_n=5):
    vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    indices = tfidf_matrix.toarray().argsort()[0][-top_n:]
    return [feature_names[i] for i in indices]

# 行业映射函数
def map_keywords_to_industries(keywords):
    industries = set()
    for keyword in keywords:
        if keyword in industry_mapping:
            industries.update(industry_mapping[keyword])
    return list(industries)

# 路由和逻辑
@app.route("/", methods=["GET", "POST"])
def index():
    if request.method == "POST":
        news = request.form["news"]
        keywords = extract_keywords(news)
        industries = map_keywords_to_industries(keywords)
        return render_template("index.html", news=news, keywords=keywords, industries=industries)
    return render_template("index.html", news="", keywords=[], industries=[])

if __name__ == "__main__":
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (fsevents)
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 654,

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
