In [1]:
import numpy as np
import pandas as pd
import jieba
from collections import Counter
import word2vec

In [2]:
features = pd.read_csv("EDM_features_for_demo.csv", encoding='BIG5')

In [3]:
features.head()

Unnamed: 0,Subject
0,遊日就是要刷JCB! 指定實體商店消費最高3%回饋
1,★千里尋他千百度，刷卡分期我來助★
2,【年底也想讓房子煥然一新嗎】3大裝潢主題邀你一同體驗，實現你對幸福生活的想像！
3,線上快速試算您的房屋價值及增貸空間，手續費8折優惠
4,富邦icash聯名卡大首筆！7-ELEVEN X-STORE首刷送100%，新卡週週送10%！


In [4]:
content = list(features['Subject'])
contenttxt = ''.join(str(e) for e in content)
#把全部要分析的主旨灌進一個string

In [5]:
#把結巴沒斷好的拿掉
garbage = ['怪字','享紅']#demo拿掉了很多實際案例了

In [6]:
# 計算詞頻
def get_words(txt, num=150):
    seg_list = jieba.cut(txt, cut_all=False)
    temp = list(seg_list)
    temp = [x.strip() for x in temp]
    temp = [x for x in temp if x not in garbage]
    c = Counter()
    for x in temp:
        if len(x)>1 and x != '\r\n':
            c[x] += 1
    return c.most_common(num)

In [7]:
frq = get_words(contenttxt)
print(frq)
print(type(frq))

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/ps/4m0fjtss0nv0lbhy26n3224m0000gn/T/jieba.cache
Loading model cost 1.074 seconds.
Prefix dict has been built succesfully.


[('房子', 2), ('線上', 2), ('快速', 2), ('房屋', 2), ('手續費', 2), ('定期', 2), ('定額', 2), ('1.68', 2), ('小時', 2), ('免費', 2), ('專屬', 2), ('遊日', 1), ('就是', 1), ('JCB', 1), ('指定', 1), ('實體', 1), ('商店', 1), ('消費', 1), ('最高', 1), ('3%', 1), ('回饋', 1), ('千里', 1), ('尋他', 1), ('千百度', 1), ('刷卡', 1), ('分期', 1), ('我來助', 1), ('年底', 1), ('煥然', 1), ('一新', 1), ('大裝潢', 1), ('主題', 1), ('一同', 1), ('體驗', 1), ('實現', 1), ('幸福生活', 1), ('想像', 1), ('試算', 1), ('價值', 1), ('增貸', 1), ('空間', 1), ('優惠富邦', 1), ('icash', 1), ('聯名', 1), ('卡大首筆', 1), ('ELEVEN', 1), ('STORE', 1), ('首刷', 1), ('100%', 1), ('新卡週', 1), ('週送', 1), ('10%', 1), ('一路', 1), ('限時', 1), ('申購', 1), ('契約', 1), ('當月', 1), ('聚焦', 1), ('優質', 1), ('基金', 1), ('折遠', 1), ('銀卡友', 1), ('上理', 1), ('財服務', 1), ('全年', 1), ('無休', 1), ('房貸線', 1), ('試算別', 1), ('錯過', 1), ('分鐘', 1), ('步驟', 1), ('立即', 1), ('得到', 1), ('可貸', 1), ('額度', 1), ('利率', 1), ('新年', 1), ('換件', 1), ('新衣', 1), ('居家', 1), ('裝潢', 1), ('百寶箱', 1), ('大主題', 1), ('打造', 1), ('獨特', 1), ('生活', 1), ('品味', 1), ('來來', 1)

In [8]:
#計算出用字詞頻，以利未來製作feature標籤
df_frq = pd.DataFrame(frq, columns=["words","frq"])
df_frq

Unnamed: 0,words,frq
0,房子,2
1,線上,2
2,快速,2
3,房屋,2
4,手續費,2
5,定期,2
6,定額,2
7,1.68,2
8,小時,2
9,免費,2


In [9]:
#feature標籤不適合有中文欄位，透過googletrans翻譯
#Guildance: https://pypi.org/project/googletrans/
from googletrans import Translator
translator = Translator(service_urls=['translate.google.com','translate.google.com'])

In [10]:
import re
output = []
for a in range(df_frq.shape[0]):
        translations = translator.translate(df_frq.loc[a,'words'], dest='en')
        #print(translations.origin, ' -> ', translations.text)
        #output.append(translations.text)
        output.append(re.sub('\s','',translations.text))#去空格
        output[a] = output[a].rstrip('%')#去除右邊的%不然會jupyter會亂碼
        print(output[a])

house
on-line
fast
houses
Handlingfee
Regularly
Fixedamount
1.68
hour
FREE
Exclusive
Sunrise
Is
JCB
Designation
entity
store
consumption
highest
3
Feedback
Thousandmiles
Lookforhim
ThousandBaidu
Swipe
Staging
Iamhelping
Endofyear
refresh
Renewal
Bigdecoration
theme
together
Experience
achieve
Happylife
imagine
Trialcalculation
value
Increaseloan
space
DiscountFubon
icash
Jointname
Cardfirst
ELEVEN
STORE
Firstbrush
100
Newcardweek
Weeklydelivery
10
Alltheway
Timelimit
Purchase
Agreement
Themonth
Focus
highquality
fund
Foldaway
Silvercardfriend
Rational
Financialservices
annual
Norest
Mortgageline
Byestimate
miss
minute
step
immediately
get
Loanable
Amount
interestrate
newYear
Replacement
Newclothes
Home
Decoration
Treasurebox
Bigtheme
Build
unique
life
taste
Comeon
lottery
Loanexemption
Goingout
sun
Athome
Skeleton
Youcan
Grant
Application
Canalso
Dyson
Coolandwarm
air
Cleaner


In [11]:
out = pd.DataFrame({"words":df_frq["words"],"frq":df_frq["frq"], "English":output})

In [12]:
out

Unnamed: 0,words,frq,English
0,房子,2,house
1,線上,2,on-line
2,快速,2,fast
3,房屋,2,houses
4,手續費,2,Handlingfee
5,定期,2,Regularly
6,定額,2,Fixedamount
7,1.68,2,1.68
8,小時,2,hour
9,免費,2,FREE
