In [None]:
#!flask/bin/python
# -*- coding: utf-8 -*-

# for mysql & mongo
#import mysql.connector
#from mysql.connector import errorcode
from pymongo import MongoClient

# for api
from flask import Flask, jsonify, abort, make_response, request, render_template, Markup
from flask_cors import CORS
import json
import collections
import datetime
from dateutil.relativedelta import relativedelta
import urllib

# for country_info_templates
#import templates


app = Flask(__name__, template_folder='templates')

# for CORS
cors = CORS(app, resources={r"/api/*": {"origins": "*"}})

@app.route('/', methods=['GET'])
def index():
    return "Hello, world! This is for August POC"

@app.route('/api/test/', methods=['GET'])
def get_test():
	name = request.args.get('name', '', type=str); #type=str要去除掉，否則會有編碼問題
	print name;
	return name;

# potential projects
@app.route('/api/landing-map-markers/landing-map-markers/potential-projects/', methods=['GET'])
def get_potential_projects():
    # connect to mysql
    cnx = mysql.connector.connect(user='root', password='obor3',
                              host='obor-mysql',
                              database='tpdb',
                              charset='utf8')
    cursor = cnx.cursor()
    res = list()

    # get the parameter from args
    country_id = request.args.get('country_id', '', type=str).replace('"','').replace("'","")

    # sql script
    query_all = "SELECT tender_no, description, category, institute, location, country_id, longitude, latitude, advertised_date, closing_date, related_link FROM POTENTIALITEM;"
    #query2 = 'select tender_no, description, category, institute, location, country_id, longitude, latitude, advertised_date, closing_date, related_link FROM POTENTIALITEM where country_id =  %s;' % (country_id)
    query_country = "SELECT tender_no, description, category, institute, location, country_id, longitude, latitude, advertised_date, closing_date, related_link FROM POTENTIALITEM where country_id ='{0}';".format(country_id)
    
    # check the country_id value and then query the data
    if country_id:
        cursor.execute(query_country)
        alldata = cursor.fetchall()
        cursor.close()
        cnx.close()
        if alldata:
            return jsonify(alldata) 
        else:
            return jsonify({'error': 'County Not found'})
    else:
        cursor.execute(query_all)
        alldata = cursor.fetchall()
        cursor.close()
        cnx.close()
        return jsonify(alldata)

# potential projects
@app.route('/api/potential_item/', methods=['GET'])
def get_potential_items():
    # connection's parameter
    config = {
        'host': 'obor-mysql',
        'user': 'root', 
        'password': 'obor3',
        'charset': 'utf8'
    }

    # connect db
    try:
        cnx = mysql.connector.connect(**config)
    except mysql.connector.Error as e:
        return jsonify({'error': 'Connect failed! -- ' + str(e)})
    else:
        pass

    # get the parameter from args
    country_id = request.args.get('country_id', '', type=str).replace('"','').replace("'","")

    # use request.args.get() got empty for chinese
    if 'industry_id' in request.args:
        industry_id = request.args['industry_id'].replace('"','').replace("'","").encode('utf8')

        # if industry_id == industry code, use mapping table
        industry_table = {
            "0": "制造业",
            "1": "交通运输",
            "10": "水利环境和公共",
            "11": "建筑业",
            "12": "批发和零售业",
            "13": "军事",
            "14": "卫生",
            "2": "金融",
            "3": "房地产",
            "4": "餐饮和住宿",
            "5": "居民服务",
            "6": "教育",
            "7": "文体",
            "8": "农林牧渔业",
            "9": "电力热力燃气",
            "999": "其他"
        }
        if industry_id.isdigit():
            if industry_id in industry_table:
                industry_id = industry_table[industry_id]
            else:
                return jsonify({"result": "industry code not founded"})

        # use two words for fuzzy query
        industry_id = unicode(industry_id.decode('utf-8'))[:2].encode('utf8') if len(unicode(industry_id.decode('utf-8'))) > 2 else industry_id
        
        # for 農林牧漁業
        if industry_id == '农林':
            industry_id = '农、'
    else:
        industry_id = ''

    # create query script
    query_filter = ''
    if country_id and industry_id:
        query_filter = "WHERE country.ID='{0}' AND pot.industry LIKE '%{1}%'".format(country_id, str(industry_id))
    else:
        if country_id:
            query_filter = "WHERE country.ID= '{0}'".format(country_id)
        if industry_id:
            query_filter = "WHERE pot.industry LIKE '%{0}%'".format(str(industry_id))
        
    query_script = """
        SELECT pot.item_name, pot.release_date, pot.item_type, pot.invest_type, pot.industry, pot.location, \
        pot.duration, pot.amount, pot.attract_investment_amount, pot.mark, pot.description, pot.person_name, \
        pot.organization, pot.position, pot.phone, pot.email, country.LATITUDE, country.LONGITUDE \
        FROM tpdb.POTENTIALITEMNEW AS pot \
        LEFT JOIN ciip.COUNTRY AS country \
        ON pot.location = country.NAME \
        %s \
        ORDER BY pot.release_date DESC \
    """ % (query_filter) 
            
    # return default change to dictionary
    cur = cnx.cursor(buffered=True, dictionary=True)

    # query from db
    cur.execute(query_script)
    res = cur.fetchall()
    cur.close()
    cnx.close()
    if res:
        return jsonify(res)
    else:
        return jsonify({"result": "NO DATA"})


# country info
@app.route('/api/country_info/<country>/<industry>')
def get_country_info(country, industry):
    # connect to mongodb and query data
    client = MongoClient('obor-mongo', 27017)
    data = client['obor']['obor_countryInfo'].find_one({"country_code": country})


    # check country exist
    if data is None:
        return jsonify({'error': 'County Not Found'})

    # check industry exist
    if industry not in data:
        return jsonify({'error': 'Industry Not Found'})

    # for disease
    def article_title_counts(keyword, location, limit_months=2):
        '''Return counts from articles which contain the keyword'''
        two_month_ago = (datetime.datetime.now()-relativedelta(months=limit_months)).strftime("%Y-%m-%d")
        count = client['obor']['obor_article'].count({
            "$and":[
                {
                    # filter title keywords
                    "_source.atitle": {
                        "$regex": ".*%s.*" % keyword, 
                        "$options": "-i"  # ignore case
                    }
                },
                
                {
                    # filter datetime
                    "_source.dateTime": {"$gt": "%s" % two_month_ago}
                },
                {
                    # filter country
                    "$or": [
                        {
                            "_source.location.country.wikiUri": {
                                "$regex": ".*%s.*" % location, 
                                "$options": "-i"  # ignore case
                            }
                        },
                        {
                            "_source.location.wikiUri": {
                                "$regex": ".*%s.*" % location, 
                                "$options": "-i"  # ignore case
                            }
                        },
                    ]
                }
            ]
        })
        return count
    # only society contain a dynamic infomation (disease)
    if  industry == "society":
        keywords = {
            "epidemic": u"疫情", 
            "diarrhea": u"腹泻", 
            "infectious": u"感染​​性疾病",
            "Hepatitis": u"肝炎",
            "Typhoid": u"伤寒", 
            "dengue": u"登革热",
            "malaria": u"疟疾", 
            "rabies": u"狂犬病",
            "poliomyelitis": u"脊髓灰质炎"
        }
        disease_res = list()

        # if counts > 0, we claim the disease existed
        for k, v in keywords.items():
            counts = article_title_counts(k, "pakistan")
            if counts:
                disease_res.append(v)
        data['society']['major_infectious_diseases_current'] = unicode(', '.join(disease_res))

    # map key with zh-simple
    with open("data/label_map.json", "r") as f:
        label_title = json.loads(f.read())

    # order the data for html layout
    ord_data = collections.OrderedDict(sorted(data[industry].items()))
    client.close()
    return render_template('countryInfo.html', data=ord_data, label = label_title)

# countryback info
@app.route('/api/countryback_info/<country>')
def get_countryback_info(country):
    # connection's parameter
    config = {
        'host': 'obor-mysql',
        'user': 'root', 
        'password': 'obor3',
        'database': 'tpdb',
        'charset': 'utf8'
    }

    # connect db
    try:
        cnx = mysql.connector.connect(**config)
    except mysql.connector.Error as e:
        return jsonify({'error': 'Connect failed! -- ' + str(e)})
    else:
        pass
    # return default change to dictionary
    cur = cnx.cursor(buffered=True, dictionary=True)
    # query from db
    cur.execute('SELECT * FROM COUNTRYBACK WHERE country_id = "%s";'% country)
    res = cur.fetchall()
    cur.close()
    cnx.close()

    # CHECK query data exist
    if res:
        return Markup(res[0]['background'])
    else:
        return jsonify({'error': 'Country Not Found'}) 


@app.errorhandler(404)
def not_found(error):
    '''Handle error code'''
    return make_response(jsonify({'error': 'Not Found'}), 404)



if __name__ == '__main__':
    # app.debug=True
    app.run(host="0.0.0.0")


# Practice

In [2]:
from pymongo import MongoClient
from flask import Flask, jsonify, abort, make_response, request, render_template, Markup
from flask_cors import CORS
import json

app = Flask(__name__, template_folder='templates')


    
@app.route('/', methods=['GET'])
def index():
    #return "Hello, world! This is for August PO5"
    return render_template('index.html')

@app.route('/api/test/', methods=['POST','GET'])
def get_test():
    content = request.form.get('content', '預設值')
    #content = request.args.get('content', '') #type=str要去除掉，否則會有編碼問題
    print content;
    return content;

if __name__ == '__main__':
    # app.debug=True
    app.run(host="0.0.0.0")

哈哈
			


# Mongodb 連線

In [23]:
import re
from pymongo import MongoClient 

#預設就是自己
client = MongoClient('127.0.0.1', 27017)
database = client['test']
collection =database['news']
client.close()

import jieba
jieba.set_dictionary('D:/TextMining/dict.txt.big.txt')  #切換至中文繁體字庫
jieba.load_userdict("D:/TextMining/dict_keyw.txt")       #加入自建詞庫
jieba.load_userdict("D:/TextMining/ptt.txt")       #加入PTT詞庫

Building prefix dict from D:\TextMining\dict.txt.big.txt ...
Loading model from cache c:\users\ytchen\appdata\local\temp\jieba.ufb55ea6623e696143f43e248198acd74.cache
Loading model cost 1.355 seconds.
Prefix dict has been built succesfully.


# 抓取資料，製作模型

In [24]:
#給關鍵字----------------------------------------
tag1 = "我"
# tag2 = "台灣"
#-----------------------------------------------

date=[]
title=[]
content =[]
all_article = []
#把資料庫東西抓出來
for post in collection.find(
    {"$and":[                   
            {"content":{"$regex":tag1}},
#             {"content":{"$regex":tag2}},
            #{"date":{"$regex":"2016"}},
            ]},{"_id":0}).limit(200): 
    summary = post['content']
    all_article.append(summary)
    content.append(' '.join(jieba.cut(summary)))
    title.append(post['title'])
    date.append(post['date'])
    
client.close()

print title[0]

#文章數
newsNumber = len(title)
print "文章數:" + str(newsNumber)

# 將使用者輸入文章，塞入第一篇內容
mysummary = '''大陸著名民俗專家王作楫在一段影片中指出，人們把「福」字倒貼是「絶對原則性的錯誤」，雖然傳統文化中的確有倒貼福字的地方，但是是在垃圾桶、水桶等器具上。「福字為什麼不能倒貼」話題成了昨天微博熱搜首位，兩派人馬就此展開了激烈討論。
北京青年報報導，對於「福」字不應倒貼的問題，其實王作楫以前就多次講過，他的理由主要有三個。
其一，「福」是中國人的文化符號，中國有兩個傳統文化符號，一個是「圓」一個是「方」，就是古話講「天圓地方」。「圓」有圓圓滿滿、團團圓圓的含義；「方」的含義更重要，因為中國文字是方塊字，我們用方塊字記載了中國五千年的文明史，所以倒貼「福」字等於是把中國的傳統文化符號倒過來使用，「這不是顛倒黑白嗎，這是對文字和中國文化的不尊重」。
其二，「福」字除了部首「示字部」，右邊則由「一、口、田」三個字組成。按照古代說文解字的說法，「一」代表房子的房梁，「口」是房子裏邊住著的人口，「如果把它倒過來，你這房子和人口不就等於跑到田地下面去了嗎，你想想那是什麼」。
其三，大家認為把福倒過來貼就意味著「福到了」，他說，這是一種誤解。因為倒貼的這個倒是「倒掉」的意思，不是「來到」的「到」，兩者是有區別的。
王作楫進一步指出，傳統文化中的確有倒貼福字的地方，但是是在垃圾桶、水桶等器具上。
因為垃圾代表著災害和貧窮，所以需要倒掉，過去人們把倒垃圾稱為「扔災」。而福字倒貼在垃圾桶上，當倒掉垃圾時，垃圾桶底朝天，倒貼的福字就變正了，意思是把災和貧窮扔掉，福才會來。'''
all_article.insert(0, summary)
content.insert(0, ' '.join(jieba.cut(summary)))
title.insert(0, 'fortest')
date.insert(0,'')

超級7歲娃扛家顧盲母3弟妹-煮飯-洗衣-換燈管
文章數:200


In [25]:
print title[0]

fortest


# TD-IDF 權重計算

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(content)  # titile 放文本
weight = X.toarray() #裝關鍵字的權重值

#特徵值總共有多少個
features = vectorizer.get_feature_names()     # 拿到所有的關鍵詞  
print "特徵值數量:",len(features)

特徵值數量: 14583


## 拿到每篇文章的TOP TD-IDF

In [27]:
import time
tic = time.clock()

top_features = []
final_top_features = []
for n in range(0,newsNumber):  #迴圈參考上面的總文章數
    #裝每天文章的feature & score
    map_top_features = []
    
    #對特徵詞的index進行排序
    indices = np.argsort(weight[n])[::-1] #weight[n] 是第幾篇新聞的意思
    
    #拿到所有的特徵詞
    features = vectorizer.get_feature_names()
    
    #截取TOP多少的詞
    top_n = 40
    
    ###################################把每篇的TOP TF-IDF 與其分數存起來###############################################
    #將每天文章的TOP-TFIDF存起來
    top_features.append([features[i] for i in indices[:top_n]])
    
    a=0
    for i in top_features[n]:
#         print i,weight[n][indices[a]]
        data = {
            'feature':i,
            'score':weight[n][indices[a]]
        }
        #print data['score'],data['feature']
        a=a+1
        
        map_top_features.append(data)
    final_top_features.append(map_top_features)
# print final_top_features[0]
    #############################################################################################
    
toc = time.clock()
print "執行時間:",(toc - tic),"秒"

執行時間: 4.22381045705 秒


## 文章摘要運算

In [28]:
#對所有的新聞文章進行 "句子切詞 "
import re
for article_index in range(0,1):
    article = all_article[article_index] #一篇文章
    setenceList = re.split("，|。|",article.encode('utf-8')) #根據"逗號" & "句號" 進行句子斷開
    
    #裝一篇文章的句子 & 句子的分數
    sen_score_map = []
    article_score = 0 #文章的總分數
    avg_score = 0 #總文章的平均分數
    
    #將每篇文章的句子逐一取出進行分數計算
    for sen in setenceList:
        #初始化句子分數
        score = 0 
        
        #逐一抓出各文章的 map_top_features 特徵詞與分數，與句子進行比對，若該特徵詞存在於句子中，則分數加上去
        for i in final_top_features[article_index]:
            feature = i['feature'].encode('utf-8')
            if feature in sen:
                score += i['score']
        
        #存取句子與分數
        data = {
            'sentence' : sen,
            'score' : score,
        }
        sen_score_map.append(data)
        
        #文章總分數
        article_score += score
        
    #文章資訊
    article_sen_size = len(setenceList) #文章句子總數
    avg_score = article_score / article_sen_size #平均句子分數
    
    #文章摘要
    articleSummarize = ''
    skip = 0
    for i in sen_score_map:
        #第一句一定要有，加完就continue換下一個迴圈，不用跑下面的判斷
        if skip == 0:
            skip+=1
            articleSummarize += '，'+i['sentence']
            continue
        #進行判斷(句子分數 > 平均分數才加入摘要)
        if i['score'] > avg_score*1.5:
            articleSummarize += '，'+i['sentence']
    #修整文章摘要-最後面的逗號改成句號
    articleSummarize = articleSummarize.replace('，','',1)+"。" #把最前面的逗號替換掉
            
    print '平均分數=' + str(avg_score)
    print '---------------------------------------------'
    print '新聞標題：'+title[article_index].encode('utf-8')
    print '------------'
    print '文章摘要：'
    print articleSummarize
    print '------------'
    print '文章原文：'
    print article
    print '---------------------------------------------'    
    

平均分數=0.428970476272
---------------------------------------------
新聞標題：fortest
------------
文章摘要：
【鮮明、高堂堯╱連線報導】成立三十一年玉管處開出首張違規露營罰單！玉山國家公園管理處與警方前天在塔塔加遊客中心停車場取締違規露營族，依《國家公園法》開罰一千五百元罰鍰；玉管處表示，前天開出的罰單為成立以來第一張違規露營告發單，今年對違規露營民眾已開出五十八張勸導單，前天約七至八成停車格遭露營族霸佔，三人回嗆：「不管啦！我們就是要在這邊過夜！」警員依《國家公園法》開出一千五百元罰單，首張違規露營罰單」，明訂未經核准禁止在停車場及指定以外之地區露營、野炊、炊事、燃火、搭設帳篷、放置桌椅、大聲喧鬧及舉行營火等活動，一般違規露營民眾遇警方或管理處人員勸導，停車場一向是露營熱門地點，遊客中心停車場擠滿露營民眾，禁止民眾以露營車露營野炊，進行露營野炊等主管機關禁止行為，第二次以上違規處3000元罰鍰★生火需在合法地點。
------------
文章原文：
【鮮明、高堂堯╱連線報導】成立三十一年玉管處開出首張違規露營罰單！玉山國家公園管理處與警方前天在塔塔加遊客中心停車場取締違規露營族，三名二十多歲年輕人佔據停車格紮營，玉管處及警方廣播、口頭三度勸離無效，依《國家公園法》開罰一千五百元罰鍰；玉管處表示，前天開出的罰單為成立以來第一張違規露營告發單，今年對違規露營民眾已開出五十八張勸導單。玉山國家公園管理處副處長林文和表示，塔塔加遊客中心周邊停車場約四十個停車格，前天約七至八成停車格遭露營族霸佔，除拿出炊具煮食還著手紮營。其中來自彰化的周姓等三名年約二十三至二十四歲年輕男子，騎機車載蒙古包等露營器具搭設，警方先廣播驅離，下午五時二度口頭勸離，未料三人晚間八時許仍在原地，警員三度要求離開，三人回嗆：「不管啦！我們就是要在這邊過夜！」警員依《國家公園法》開出一千五百元罰單，林文和指「這是玉管處成立三十一年來，首張違規露營罰單」。林文和強調，玉管處有鑑於長期勸導未見具體成效，前年修訂玉山國家公園區域內公告禁止事項，明訂未經核准禁止在停車場及指定以外之地區露營、野炊、炊事、燃火、搭設帳篷、放置桌椅、大聲喧鬧及舉行營火等活動，第一次違規罰一千五百元罰鍰，第二次以上違規罰三千元罰鍰。內政部營建署國家