In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import chardet 

#定义爬取函数
def crawl_sina_finance_reports(pages):
    """Scrape the "latest research reports" listing on Sina Finance.

    Listing pages 1 through ``pages`` are requested one after another and
    every table row that carries enough cells is turned into one record.

    Args:
        pages: Number of listing pages to fetch, starting at page 1. A value
            below 1 fetches nothing.

    Returns:
        A list of ``[title, kind, date, organization, analysts]`` lists of
        whitespace-stripped strings, in the order the rows were encountered.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` when a
            request fails or exceeds the timeout.
    """
    base_url = "https://stock.finance.sina.com.cn/stock/go.php/vReport_List/kind/lastest/index.phtml"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Cells 1..5 are read for every row, so a usable row needs at least six
    # <td> cells. The previous check (>= 4) let 4- and 5-cell rows through and
    # then failed with IndexError.
    min_columns = 6
    reports = []
    for page in range(1, pages + 1):
        url = f"{base_url}?p={page}"
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        # Detect the encoding from the raw bytes; fall back to GB2312 when
        # chardet cannot determine one.
        detected_encoding = chardet.detect(response.content)['encoding']
        response.encoding = detected_encoding or 'GB2312'
        # Parse the decoded text rather than the raw bytes so the encoding
        # chosen above is the one actually applied. The parser is left
        # unspecified, as before, so bs4 picks the best one installed.
        soup = BeautifulSoup(response.text)

        # Every <tr> except the first one (the table header) is a candidate row.
        for item in soup.find_all('tr')[1:]:
            columns = item.find_all('td')
            if len(columns) < min_columns:
                continue
            # Cell 0 is not used; cells 1-5 hold the fields collected below.
            title, kind, date, organization, analysts = (
                cell.text.strip() for cell in columns[1:6]
            )
            reports.append([title, kind, date, organization, analysts])

    return reports
 
# Crawl the data: fetch the first `pages` listing pages
pages = 10
reports_data = crawl_sina_finance_reports(pages)

# Build the DataFrame; the column labels follow the order of each scraped record
df_reports = pd.DataFrame(reports_data, columns=["标题",'报告类型', "发布日期", "机构","研究员"])

# Save the scraped rows to CSV without the index; utf-8-sig writes a leading BOM
csv_file = "sina_daily.csv"
df_reports.to_csv(csv_file, encoding='utf-8-sig',index=False)

In [3]:
from google.cloud import language_v1
import os
import pandas as pd

# Authentication for the Google Cloud Natural Language API: expose the path of
# the credentials JSON file through GOOGLE_APPLICATION_CREDENTIALS
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': 'civic-network-418910-0aa3df60f95d.json'})

# Create the language-service client used for the sentiment requests
client = language_v1.LanguageServiceClient()

def analyze_sentiment(text):
    """
    Score the sentiment of ``text`` with the Google Cloud Natural Language API.

    The text is sent as a plain-text document through the module-level
    ``client``. The document-level score is then rescaled from the API's
    -1 (negative) .. 1 (positive) range to 0 .. 10 and rounded to one decimal.
    """
    plain_text = language_v1.Document.Type.PLAIN_TEXT
    doc = language_v1.Document(content=text, type_=plain_text)
    response = client.analyze_sentiment(request={'document': doc})
    raw_score = response.document_sentiment.score
    # Shift by one and scale by five: -1 -> 0, 0 -> 5, 1 -> 10
    return round((raw_score + 1) * 5, 1)

# Read the scraped report list back from disk and drop exact duplicate rows
df = pd.read_csv('sina_daily.csv').drop_duplicates()

# Cast the title column to str so every value passed to the API is text
df = df.assign(**{'标题': df['标题'].astype(str)})

# Score every title and store the result in a new sentiment-score column
df = df.assign(**{'情绪得分': df['标题'].apply(analyze_sentiment)})

# Write the scored table, without the index, to a new CSV file
df.to_csv('sina_result_with_sentiments.csv', encoding='utf_8_sig', index=False)

print("Sentiment scoring completed and results saved to 'sina_result_with_sentiments.csv'.")

Sentiment scoring completed and results saved to 'sina_result_with_sentiments.csv'.
