In [5]:
import logging
import os
import jieba.posseg as psg
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from sklearn import preprocessing
import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import glob
import pandas as pd
import numpy as np
import pyLDAvis.gensim
import matplotlib.pyplot as plt

def loadStops(filename):
    stops = []
    with open(filename, encoding='utf-8') as fr:
        fr.readline()
        for line in fr:
            stops.append(line.strip())
    return stops



def goTrainLDA(data,n_topic=10):

    dct = Dictionary(data)
    joblib.dump(dct, f'./models/dct{n_topic}.m')
    print('词库保存完毕！')
#     corpus = [dct.doc2bow(_) for _ in data]
    print(len(dct))
#     ldamodel = LdaModel(corpus=corpus, num_topics=n_topic, id2word=dct,eta=0.0001,passes=50)
#     joblib.dump(ldamodel, f'./models/lda_model{n_topic}.m')
    print('LDA模型保存完毕！')

class LoadModel:
    def __init__(self,n_topic):
        self.ldamodel = joblib.load(f'./models/lda_model{n_topic}.m')
        self.dct = joblib.load(f'./models/dct{n_topic}.m')

    def batchPredict(self,data):
            #Print the 8 topic and largest weighted words in topic
#             topic_list=self.ldamodel.print_topics(8, 30)
            for topic in self.ldamodel.print_topics(20, 40):
                print(topic)
            
            for line in data:
                corpus = [self.dct.doc2bow(_) for _ in data]
                doc_lda = self.ldamodel[corpus[0]] 
                doc_lda_max = sorted(doc_lda, key=lambda k: k[1], reverse=True)[0]
                info = self.ldamodel.print_topic(doc_lda_max[0], 6)
                
                

                feather = [x.strip().replace('"', '').split('*') for x in info.strip().split('+')]
                weight_feather = [float(y[0]) for y in feather]
                word = [y[1] for y in feather]
#                 print("context:",line)
#                 print('LDA:', info)
#                 print('word:', word)
#                 print('weights:', weight_feather)
                self.ldamodel.print_topics(8, 30)
                break

    def topic_analysie(self,data):
            da = [self.dct.doc2bow(_) for _ in data]
            doc_lda = self.ldamodel[da]
            doc_lda_max = [sorted(w, key=lambda k: k[1], reverse=True)[0] for w in doc_lda]
            topic_list=[w for w,t in doc_lda_max]

            return topic_list

In [6]:
# Preprocesssing

filenames = glob.glob(r"stopwords/*")
stopwords = []
for filename in filenames:
    stopwords.extend(loadStops(filename))
    
filenames1 = glob.glob(r"lib/*")
lib = []
for filename in filenames1:
    lib.extend(loadStops(filename))    



In [7]:
filenames_news = glob.glob(r"news_spider/*")
temp={}
for filename in filenames_news:
     print(filename)
     temp[filename.split(".")[0].split("\\")[1]]= pd.read_csv(filename, encoding="utf-8")
    
    

news_spider\交易所公告.csv
news_spider\交易所动态.csv
news_spider\国内新闻.csv
news_spider\国际新闻.csv
news_spider\省市动态.csv


In [17]:
news=pd.DataFrame()
for key in temp.keys():
    news=pd.concat([news,temp[key]],axis=0)

In [25]:
news.set_index(pd.to_datetime(news["日期"]),inplace=True)
news.drop("日期",inplace=True,axis=1)

In [29]:
news["context"]=news["标题"].str.cat(news["内容"],na_rep="")
news.drop(["标题","内容","板块"],axis=1,inplace=True)
news

Unnamed: 0_level_0,context
日期,Unnamed: 1_level_1
2020-09-26,关于2020年国庆节、中秋节休市安排的公告尊敬的各交易参与人：根据《国务院办公厅关于2020...
2020-08-28,2020年茂名市石狗塘村等5个省定贫困村分布式光伏发电碳普惠项目（PHCER）竞价情况根据《...
2020-08-18,广东省省级碳普惠制核证减排量（PHCER）项目竞价公告根据《广东省碳排放管理试行办法》（省政...
2020-07-13,2020年7月6日企业委托配额（GDEA）竞价成交情况广州碳排放权交易中心（以下简称“广碳所...
2020-07-06,2020年7月6日广东省碳排放配额（GDEA）竞价情况根据《广东省碳排放管理试行办法》（省政...
...,...
2012-11-27,近百名环保志愿者走进社区 倡导生态文明24日，珠海市环保局、市文明办组织了近百人的环保志愿服...
2012-11-26,环保嘉年华启动 快乐享低碳盛宴11月24日上午，天空飘着毛毛细雨，但丝毫不能浇灭少年宫广场上...
2012-11-22,惠州市召开2012年污染减排工作推进会11月20日，惠州市召开2012年污染减排工作推进会，...
2012-11-22,坪山打造水生态示范区记者获悉，坪山新区拟以坪山河流域启动区的汤坑水水环境综合治理工程为突破，...


In [58]:
filenames2 = glob.glob(r"envir_lib/*")
envir_words = []
import jieba
import re
for filename in filenames2:
    envir_words.extend(loadStops(filename)) 

stopwords.extend(["年","月","日","新","时","说"])    
for x in envir_words:
    jieba.add_word(x)
for x in lib:
    jieba.add_word(x)    
    
def cut_words(intxt):

    return " ".join([w for w in jieba.cut(re.sub(r'[0-9]+',"",str(intxt))) if w not in stopwords])

In [31]:
news["cut"]= news.context.apply(cut_words)

In [64]:
news.to_csv("news_onexchange_cut_lda.csv",encoding="utf-8")

In [8]:
news=pd.read_csv("news_onexchange_cut_lda.csv",encoding="utf-8")
news.set_index(pd.to_datetime(news["日期"]),inplace=True)
news.drop("日期",inplace=True,axis=1)


In [9]:
news.head()

Unnamed: 0_level_0,context,cut
日期,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-06-06,广州碳排放权交易中心有限公司公开声明尊敬的会员及客户：广州碳排放权交易中心有限公司（以下简称...,广州 碳排放权 交易中心 有限公司 公开 声明 尊敬 会员 客户 广州 碳排放权 交易中心 ...
2017-12-27,关于2018年元旦休市安排的公告尊敬的各交易参与人：根据《国务院办公厅关于2018年部分节假...,元旦 休市 安排 公告 尊敬 交易 参与 国务院办公厅 节假日 安排 通知 国办发 明电 号...
2017-10-16,广州碳排放权交易所第二届会员大会邀请函尊敬的会员：为加强会员交流，凝聚会员合力，携手推动广东...,广州 碳排放权 交易所 第二届 会员大会 邀请函 尊敬 会员 会员 交流 凝聚 会员 合力 ...
2017-09-26,关于2017年国庆中秋休市安排的公告尊敬的各交易参与人：根据《国务院办公厅关于2017年部分...,国庆 中秋 休市 安排 公告 尊敬 交易 参与 国务院办公厅 节假日 安排 通知 国办发 明...
2017-09-04,CCER购买需求发布某公司拟购买不少于100万吨CCER，详细信息如下表所示：公告编号：GM...,CCER 购买 需求 发布 公司 拟 购买 万吨 CCER 详细信息 表 所示 公告 编号 ...


In [10]:
stopwords_tan = loadStops(r"stop.txt")
print(stopwords_tan)

['去年', '提出', '屋顶', '约', '号', '广东建设报', '科技报', '经济报', '全市', '绿色', '碳', '增长', '.%', '广东', '广州', '领域', '国家', '广东省', '珠三角', '项目', '加快', '区', '提高', '顺德', '高', '纯', '家', '关注', '…', '学会', '国内', '东莞', '深圳', '佛山', '海珠区', '湛江', '大鹏', '省', '占', '地区', '达', '珠海', '全省', '中心', '提升', '升级', '惠州', '肇庆', '我省', '格力', '低', '我国', '提供']


In [11]:
stopwords_tan.extend(["村","中","工作","相关","更","广东","量","委" ,"一行","碳市场","碳交易","碳排放","碳排放权","有限公司","简称","排放","请","未","%","建设","城市","权","镇","寨说","南方日报","记者","副","二","元","年月日","三","吨","万吨","情况","年度","本次","广州市","应"])
def refine(intxt):
    return " ".join([w for w in str(intxt).split() if w not in stopwords_tan])

In [12]:
tan=pd.DataFrame([[2010,2.497946263],
[2011,2.406106919],
[2012,3.571845513],
[2013,4.302691177],
[2014,4.037716008],
[2015,4.552173336],
[2016,3.366226293],
[2017,2.528045957]
],columns=["date","vol"])
tan.set_index(["date"],inplace=True)

In [13]:
def cor(n_topic,corpus,df,df_index):
    print(f"=============={n_topic}===============")
    df_index.set_index(pd.to_datetime(df_index.index),inplace=True)
    df_env=pd.DataFrame(index=pd.to_datetime(df.index))
    lm = LoadModel(n_topic)
    lm.batchPredict(corpus)
    topic_list=lm.topic_analysie(corpus)
    df_env["topic"]=np.array(topic_list)
    print(df_env.topic)
#     count=df_env.groupby(df_env.index.year)["topic"].count()
    count=df_index.groupby(df_index.index.year)["cut"].count()
    print(count)
    group=df_env.groupby([df_env.index.year,"topic"])["topic"].count()/count
    group=group.unstack(level=0)
    print(group)
    print(f"==============corr===============")
    print(group.corrwith(tan.vol,axis=1))

In [19]:
news = news[~news.index.year.isin([2018,2019,2020])]
news.shape

(625, 2)

In [20]:
corpus = []
for line in news.cut:
            corpus.append([str(w) for w in str(line).split() if w not in stopwords_tan])

In [16]:
 
def run_tfidf(df,min_df):    
    def reduce_stop(intxt):
        return " ".join([w for w in intxt.split(" ") if w not in stopwords_tan])
    df['cut']=df.cut.apply(reduce_stop)
    data_=list(df.cut)
    #训练tfidf 向量
   
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=(1,1), min_df=min_df)
    features = vectorizer.fit_transform(data_)
    
    terms = vectorizer.get_feature_names()

    # sum tfidf frequency of each term through documents
    sums = features.sum(axis=0)
    
    # connecting term to its sums frequency
    data = []
    for col, term in enumerate(terms):
        data.append( (term, sums[0,col] ))
    print(data[0])
    def takeSecond(elem):
        return elem[1]
    data.sort(key=takeSecond)
    print(data[:101])

In [17]:
grouped=news.groupby([news.index.year])
for index in range(2012,2018):
    print(f"========================={index}=================")
    run_tfidf(grouped.get_group(index),5)

('一种', 0.3938490021464971)
[('处于', 0.19385200567563882), ('燃料', 0.22720378422399462), ('变化', 0.2339469631501993), ('道路', 0.23418863985621152), ('各国', 0.24214978541837406), ('条件', 0.2515029145109764), ('努力', 0.28746697871623844), ('经济发展', 0.29028091583221977), ('快速', 0.2975286324861184), ('举措', 0.3005577075970008), ('能力', 0.30986289794936966), ('结构调整', 0.31417576396325764), ('低于', 0.314925692469988), ('不利', 0.3150753734448856), ('单位', 0.3208955382631144), ('责任', 0.32581561124154607), ('保证', 0.32702995458262596), ('能源结构', 0.3296333250040147), ('高于', 0.3298081887477961), ('负责', 0.34957923941624525), ('最终', 0.35399749878221465), ('购买', 0.36326494662511616), ('做出', 0.36487490130303296), ('强制', 0.3678632387019837), ('评价', 0.3725066150769177), ('在内', 0.37290007658623014), ('提交', 0.3736794577023479), ('制度', 0.37510002053657726), ('特别', 0.3806937904221701), ('调整', 0.38385128648838523), ('发展和', 0.3852813672305854), ('予以', 0.39059631048464577), ('时间', 0.3906441370352102), ('一种', 0.393849002146497

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)


('ccer', 0.8087363826045049)
[('三中全会', 0.12845750430981392), ('十八', 0.12845750430981392), ('状况', 0.14671193940489294), ('供需', 0.15404014581216766), ('政府部门', 0.1610272436536153), ('适时', 0.17133370496374076), ('大部分', 0.17170039309597065), ('总体', 0.17372665252406377), ('潜力', 0.17950233344747774), ('发生', 0.18010262885050024), ('趋势', 0.18160412489681804), ('月底', 0.18535243617760686), ('事实上', 0.1853718636223962), ('化工', 0.18670159519145366), ('着手', 0.19622547552943384), ('政府的', 0.19768430908501697), ('检验', 0.2005119504864321), ('纺织', 0.20128088392511168), ('有序', 0.2077085125275982), ('扣除', 0.21252852127535432), ('树立', 0.21898106017177385), ('尚未', 0.22121478155414917), ('加速', 0.22192029827851134), ('按程序', 0.22623787571356505), ('市场机制', 0.226654263157383), ('激励', 0.22680910356326117), ('约束', 0.2268752247344507), ('统筹', 0.22783744338597978), ('效应', 0.22805616144296464), ('陶瓷', 0.22972010494946976), ('运作', 0.23569151758327853), ('学习', 0.23670529818583091), ('淘汰', 0.23732346240687707), ('相继', 0.2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)


In [177]:
grouped=news.groupby([news.index.year])
for index in range(2012,2018):
    print(f"========================={index}=================")
    run_tfidf(grouped.get_group(index),10)

('中国', 5.0605068857378175)
[('北京', 0.9921224676560655), ('介绍', 1.250755535803497), ('建立', 1.2531844486310535), ('研究', 1.3640292700090328), ('重点', 1.373165028418162), ('机制', 1.382192310579088), ('十二五', 1.414348163537173), ('主任', 1.4744150005475114), ('推动', 1.5533284917917816), ('指出', 1.5667405017502534), ('影响', 1.6482511870610999), ('改革', 1.6719694864745824), ('资源', 1.7187521764506482), ('行业', 1.8005541131597944), ('全球', 1.8062962292557552), ('未来', 1.808413780629253), ('投资', 1.8213984440429816), ('包括', 1.834304844349645), ('启动', 1.8557043606666497), ('理念', 1.9691467535040053), ('生态文明', 2.0202642757653253), ('全国', 2.078514322042671), ('组织', 2.1833851641909003), ('会议', 2.190895763113583), ('方案', 2.2199293955198907), ('措施', 2.259549424927195), ('政策', 2.350099057024605), ('气候', 2.515674658028237), ('实施', 2.534816721210739), ('经济', 2.5543960700097226), ('低碳', 2.560905258065133), ('减少', 2.5637294255601635), ('政府', 2.592949972017477), ('标准', 2.6151477736233915), ('区域', 2.6590264795460024), ('环

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

('ccer', 2.9877557714374734)
[('采用', 0.379018812830404), ('钢铁', 0.43877547778188), ('第三', 0.4713705482379097), ('免费', 0.4734438274617517), ('尝试', 0.4768239065615613), ('举措', 0.498429343436544), ('健康', 0.5257802791957615), ('体现', 0.5310171200580365), ('连续', 0.5407079727647036), ('粤府', 0.5502097749761747), ('选择', 0.5523217513823762), ('月底', 0.5640252832012487), ('设定', 0.5837510856715104), ('进展', 0.6120655911675674), ('规范', 0.613364652163648), ('市场化', 0.620929779681084), ('咨询', 0.631909388492502), ('各项', 0.6409479039416102), ('事项', 0.6459549396921068), ('石化', 0.6477773562003005), ('深化', 0.6485549905105598), ('资本', 0.6709878460458552), ('意见', 0.6766867510490097), ('试行', 0.6796398028365639), ('增加', 0.6800094788105348), ('第三方', 0.688634839964839), ('贡献', 0.6891259042216197), ('统计', 0.6928932538575269), ('一系列', 0.6945614411838591), ('产业', 0.6957481041200129), ('流程', 0.6981392705440895), ('减碳', 0.7015457924620557), ('市场机制', 0.704063420694276), ('新能源', 0.7126344758304446), ('超过', 0.720607248878

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cut']=df.cut.apply(reduce_stop)


In [79]:
#训练LDA 模型，并保存
for i in range(2,8):
    goTrainLDA(corpus,i)

词库保存完毕！
LDA模型保存完毕！
词库保存完毕！
LDA模型保存完毕！
词库保存完毕！
LDA模型保存完毕！
词库保存完毕！
LDA模型保存完毕！
词库保存完毕！
LDA模型保存完毕！
词库保存完毕！
LDA模型保存完毕！


In [18]:
for i in range(2,8):
    cor(i,corpus,news,news)

(0, '0.010*"创新" + 0.008*"产业" + 0.006*"企业" + 0.005*"经济" + 0.005*"规划" + 0.004*"科技" + 0.004*"生态" + 0.004*"服务" + 0.004*"重点" + 0.003*"实施" + 0.003*"合作" + 0.003*"亿元" + 0.003*"区域" + 0.003*"环境" + 0.003*"全国" + 0.003*"打造" + 0.003*"文化" + 0.003*"改革" + 0.003*"管理" + 0.003*"社会" + 0.003*"人才" + 0.003*"环保" + 0.002*"体系" + 0.002*"资源" + 0.002*"转型" + 0.002*"金融" + 0.002*"政府" + 0.002*"战略" + 0.002*"投资" + 0.002*"国际" + 0.002*"建立" + 0.002*"平台" + 0.002*"综合" + 0.002*"目标" + 0.002*"工程" + 0.002*"完善" + 0.002*"制度" + 0.002*"旅游" + 0.002*"基地" + 0.002*"创业"')
(1, '0.014*"企业" + 0.008*"中国" + 0.007*"产业" + 0.007*"技术" + 0.006*"市场" + 0.005*"产品" + 0.004*"公司" + 0.004*"制造" + 0.004*"经济" + 0.004*"行业" + 0.004*"汽车" + 0.004*"投资" + 0.004*"亿元" + 0.004*"生产" + 0.004*"装备" + 0.003*"全球" + 0.003*"制造业" + 0.003*"新能源" + 0.003*"建筑" + 0.003*"创新" + 0.003*"国际" + 0.003*"节能" + 0.003*"智能" + 0.003*"研发" + 0.002*"合作" + 0.002*"工业" + 0.002*"能源" + 0.002*"政策" + 0.002*"世界" + 0.002*"设计" + 0.002*"全国" + 0.002*"未来" + 0.002*"研究" + 0.002*"转型" + 0.002*"集团" + 0.002*"数据" + 

IndexError: index 133223 is out of bounds for axis 1 with size 115825