<a href="https://colab.research.google.com/github/victorgau/Python_ML_DL/blob/master/4-03%20文字特徵萃取.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 文字特徵萃取

參考連結：

* [Bag-of-words model](https://en.wikipedia.org/wiki/Bag-of-words_model)
* [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

## CountVectorizer

In [None]:
# Two short English sample documents used for the CountVectorizer examples
edocs = ['John likes to watch movies. Mary likes movies too.','Mary also likes to watch football games.']

In [None]:
# Fit a default CountVectorizer on the English samples; dv is the
# document-term count matrix returned by fit_transform
vr = CountVectorizer()
dv = vr.fit_transform(edocs)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps tokens a plain list of str as before.
tokens = vr.get_feature_names_out().tolist()

In [None]:
# Densify the count matrix and wrap it in a DataFrame (one row per document)
df = pd.DataFrame(dv.toarray())

In [None]:
# Label each column with the token it counts
df.columns = tokens

In [None]:
# Display the token-count table
df

In [None]:
# Same samples, but drop scikit-learn's built-in English stop words and
# keep the original letter case (lowercase=False)
vr = CountVectorizer(stop_words="english", lowercase=False)
dv = vr.fit_transform(edocs)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps tokens a plain list of str as before.
tokens = vr.get_feature_names_out().tolist()

In [None]:
# Build the dense count table with token-labelled columns in one step
df = pd.DataFrame(dv.toarray(), columns=tokens)

In [None]:
# Display the count table built without English stop words
df

## 中文特徵萃取

In [None]:
# Two short Chinese sample documents used for the word-segmentation examples
cdocs = ['我要去學校。','我想要去看電影。']

In [None]:
# Preview jieba's segmentation of the first document, tokens joined by spaces
' '.join(jieba.cut(cdocs[0]))

In [None]:
def tokenizer(text):
    """Clean ``text`` and split it into tokens with ``jieba.cut``.

    Three substitutions run in order, each deleting its matches: every
    character listed in ``punctuation``, newline characters, and runs made
    of digits, ``.`` and ``%``.  The cleaned string is then passed to
    ``jieba.cut`` and the resulting tokens are returned as a list.
    """
    # NOTE(review): the fullwidth comma "，" is not in this set, so it is
    # not stripped here -- confirm whether that is intentional.
    punctuation = """。！？｡＂＃＄％＆＇（）＊＋－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏"""
    cleanup_patterns = (
        r"[{}]".format(punctuation),  # any single listed punctuation character
        r"\n",                        # line breaks
        r"[\d\.%]+",                  # digit / dot / percent runs such as "3.5%"
    )
    cleaned = text
    for pattern in cleanup_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return [token for token in jieba.cut(cleaned)]

In [None]:
# The custom tokenizer replaces the default token regex; token_pattern=None
# stops scikit-learn from warning that token_pattern is set but unused
vr = CountVectorizer(analyzer="word", tokenizer=tokenizer, token_pattern=None)

In [None]:
# Check what the custom tokenizer returns for the first Chinese document
tokenizer(cdocs[0])

In [None]:
# Learn the vocabulary from the Chinese samples and build their count matrix
dv = vr.fit_transform(cdocs)

In [None]:
# The fitted vectorizer is bound to `vr`; `cv` is not defined anywhere in
# this notebook and raised NameError. get_feature_names() was also removed
# in scikit-learn 1.2, so use get_feature_names_out(); tolist() keeps
# tokens a plain list of str.
tokens = vr.get_feature_names_out().tolist()

In [None]:
# Display the vocabulary learned from the Chinese samples
tokens

In [None]:
# Densify the count matrix and wrap it in a DataFrame (one row per document)
df = pd.DataFrame(dv.toarray())

In [None]:
# Label each column with the token it counts
df.columns = tokens

In [None]:
# Display the token-count table for the Chinese samples
df

## TfidfVectorizer

TF (Term Frequency)
    
    這個字在這篇文章中的出現次數。

IDF (Inverse Document Frequency)
    
    衡量這個字在所有文章中的稀有程度：先算出有幾篇文章包含這個字（Document Frequency），再取其倒數（通常再取對數），所以出現在越少文章中的字，IDF 越高。

In [None]:
# TF-IDF variant of the vectorizer, using the same custom tokenizer;
# token_pattern=None stops scikit-learn from warning that token_pattern
# is set but unused when a tokenizer is supplied
vr = TfidfVectorizer(analyzer="word", tokenizer=tokenizer, token_pattern=None)

In [None]:
# Learn the vocabulary from the Chinese samples and build their TF-IDF matrix
dv = vr.fit_transform(cdocs)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps tokens a plain list of str as before.
tokens = vr.get_feature_names_out().tolist()

In [None]:
# Display the vocabulary learned by the TF-IDF vectorizer
tokens

In [None]:
# Densify the TF-IDF matrix and wrap it in a DataFrame (one row per document)
df = pd.DataFrame(dv.toarray())

In [None]:
# Label each column with the token it weights
df.columns = tokens

In [None]:
# Display the TF-IDF weight table
df

## 讀取訓練資料

In [None]:
# Read up.txt as UTF-8 into a list of lines (each keeps its line ending)
with open("up.txt", encoding="UTF-8") as f:
    up = list(f)

In [None]:
# Read down.txt as UTF-8 into a list of lines (each keeps its line ending)
with open("down.txt", encoding="UTF-8") as f:
    down = list(f)

In [None]:
import numpy as np

In [None]:
# Label vector: 1 for every line read from up.txt, then 0 for every line
# read from down.txt, in the same order as the concatenation up + down
y = np.array([1 for _ in up] + [0 for _ in down])

In [None]:
# Display the label vector
y

In [None]:
# Use CountVectorizer on the combined training lines.
# The vectorizer is bound to `vr`; the original called fit_transform on an
# undefined `cv` and raised NameError. token_pattern=None stops scikit-learn
# from warning that token_pattern is unused when a tokenizer is supplied.
# NOTE: the next cell rebinds vr and dv with TF-IDF features, so when the
# cells run top to bottom the models below are trained on those instead.
vr = CountVectorizer(analyzer="word", tokenizer=tokenizer, token_pattern=None)
dv = vr.fit_transform(up+down)

In [None]:
# Use TfidfVectorizer on the combined training lines (the original comment
# said CountVectorizer). This rebinds vr and dv, replacing the count
# features from the previous cell. token_pattern=None stops scikit-learn
# from warning that token_pattern is unused when a tokenizer is supplied.
vr = TfidfVectorizer(analyzer="word", tokenizer=tokenizer, token_pattern=None)
dv = vr.fit_transform(up+down)

## 建立模型

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
# Only a cell's last expression is echoed, so the mean 10-fold CV score
# was computed and then discarded; print it explicitly.
print(cross_val_score(clf, dv, y, cv=10).mean())
# Fit on the full data set so clf can be used for prediction afterwards
clf.fit(dv, y)

In [None]:
from sklearn.svm import SVC
# probability=True enables predict_proba on the fitted SVC
clf = SVC(probability=True, random_state=0)
# Only a cell's last expression is echoed, so the mean 10-fold CV score
# was computed and then discarded; print it explicitly.
print(cross_val_score(clf, dv, y, cv=10).mean())
# Fit on the full data set so clf can be used for prediction afterwards
clf.fit(dv, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)
# Only a cell's last expression is echoed, so the mean 10-fold CV score
# was computed and then discarded; print it explicitly.
print(cross_val_score(clf, dv, y, cv=10).mean())
# Fit on the full data set so clf can be used for prediction afterwards
clf.fit(dv, y)

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# Only a cell's last expression is echoed, so the mean 10-fold CV score
# was computed and then discarded; print it explicitly.
print(cross_val_score(clf, dv, y, cv=10).mean())
# Fit on the full data set so clf can be used for prediction afterwards
clf.fit(dv, y)

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values from 1e-4 to 1e3, shared by C and gamma
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Two sub-grids: the linear kernel searches C only, rbf searches C x gamma
linear_grid = dict(kernel=['linear'], C=param_range)
rbf_grid = dict(kernel=['rbf'], gamma=param_range, C=param_range)
param_grid = [linear_grid, rbf_grid]
svc = SVC()
gs = GridSearchCV(svc, param_grid)
gs.fit(dv, y)
# Report the best cross-validated score, then the parameters that produced it
for result in (gs.best_score_, gs.best_params_):
    print(result)

## 實作情緒指標分析

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# cnyes.com listing page fetched below (the path suggests Taiwan stock news)
url = "https://news.cnyes.com/news/cat/tw_stock_news"

In [None]:
# Fetch the listing page. Without a timeout requests can wait indefinitely,
# so cap it; raise_for_status() turns HTTP error responses into an exception
# instead of silently parsing an error page.
r = requests.get(url, timeout=10)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

In [None]:
# Select the <h3> elements under div elements carrying both the _2bFl and
# theme-list classes
# NOTE(review): _2bFl looks like a generated class name and may change when
# the site is rebuilt -- verify the selector still matches anything
news_list = soup.select('div._2bFl.theme-list h3')

In [None]:
# Collect the text of every selected headline element
data = [news.text for news in news_list]

In [None]:
# Vectorize the headlines with the already-fitted vectorizer (transform only, no refit)
nv = vr.transform(data)

In [None]:
# Per-headline class probabilities from the fitted classifier
# NOTE(review): clf is whichever model was fitted last; running the cells
# above in order leaves the MultinomialNB model -- confirm that is intended
se = clf.predict_proba(nv)

In [None]:
# Print each headline next to the value in column 1 of its predict_proba row
for headline, proba in zip(data, se):
    print(headline,"==>",proba[1])