# 前期處理

* 輸入套件

In [3]:
import pandas as pd
import numpy as np
import monpa
import re
from datetime import datetime, timedelta
from collections import Counter

def clearlist(list1):  # 清除無意義字元
    for j in range(len(list1)):
        for i in range(len(list1[j])):
            list1[j][i] = re.sub(r'[\W\dA-Za-z]','',list1[j][i])
    return(list1)

+---------------------------------------------------------------------+
  Welcome to MONPA: Multi-Objective NER POS Annotator for Chinese
+---------------------------------------------------------------------+
已找到 model檔。Found model file.


* 資料匯入與初期整理（資料型態、空值、合併資訊）

In [4]:
bbs = pd.read_csv("bbs.csv",encoding="utf-8")
news = pd.read_csv("news.csv",encoding="utf-8")
forum = pd.read_csv("forum.csv",encoding="utf-8")

In [5]:
# Drop掉無意義的欄位
news.drop(news.columns[[0,1,2,3,6]], axis = 1, inplace = True)
bbs.drop(bbs.columns[[0,1,2,3,4,7]], axis = 1, inplace = True)
forum.drop(forum.columns[[0,1,2,3,4,7]], axis = 1, inplace = True)

# 將post_time欄位的data type由object改為datetime
news["post_time"] = pd.to_datetime(news["post_time"], format = "%Y/%m/%d")
bbs["post_time"] = pd.to_datetime(bbs["post_time"], format = "%Y/%m/%d")
forum["post_time"] = pd.to_datetime(forum["post_time"], format = "%Y/%m/%d")

# 把全部的新聞、部落格、與論壇文章合併為單一個dataframe，取名為articles
articles = pd.concat([news, bbs, forum])
articles["post_time"] = articles["post_time"].dt.date
articles.dropna(inplace = True)
articles.reset_index(drop = True, inplace = True)

# 匯入股價資訊
stockprice = pd.read_excel("大立光股價2016-2018.xlsx", sheet_name = "Total Price")
stockprice["年月日"] = stockprice["年月日"].dt.date

# 第一題：各挑選出看漲及看跌的一批文章，從中取出關鍵字列表， 建構向量空間

### Step1: 找出股價波動超過±σ的前n日日期，並抓出該些日期中有包含主題的文章

* 先挑出σ在3%以上的日期

In [6]:
Up_date = stockprice[stockprice["波動(漲跌幅)"] >= 0.03]["年月日"].values
Down_date = stockprice[stockprice["波動(漲跌幅)"] <= -0.03]["年月日"].values

* 接著利用上述所查詢到的157個日期，回推該日期前n日內的所有新聞，在這我們取n = 2

In [7]:
# 挑選前n日內所有文章
n = int(input("挑選n日內所有文章，n ="))  # 註：當日文章請輸入0
Up_publish_date = []
Down_publish_date = []

for i in range(len(Up_date)):
    for j in range(n):
        Up_publish_date.append(Up_date[i] - pd.Timedelta(days = j+1))
for i in range(len(Down_date)):
    for j in range(n):
        Down_publish_date.append(Down_date[i] - pd.Timedelta(days = j+1))
        
Up_title = []  # 上漲文章標題
Up_content = []  # 上漲文章內容
Down_title = []  # 下跌文章標題
Down_content = []  # 下跌文章內容

for i in range(len(articles)):
    if articles["post_time"][i] in Up_publish_date:
        Up_title.append(articles["title"][i])
        Up_content.append(articles["content"][i])
        
    if articles["post_time"][i] in Down_publish_date:
        Down_title.append(articles["title"][i])
        Down_content.append(articles["content"][i])

print("上漲文章有：" + str(len(Up_title)) + "篇")
print("下跌文章有：" + str(len(Down_title)) + "篇")

挑選n日內所有文章，n =2
上漲文章有：48486篇
下跌文章有：49122篇


* 抓出大立光漲跌超過2%前兩日裡，與大立光相關的文章

In [8]:
target = input("輸入要尋找的主題")  # 輸入要尋找的主題
Up_indexlist = []  # 上漲文章index
Up_artlist = []  # 所有上漲文章
Down_indexlist = []  # 下跌文章index
Down_artlist = []  # 所有下跌文章

for i in range(len(Up_title)):
    count = 0
    if target in str(Up_title[i]):
        count += 1
    elif target in str(Up_content[i]):
        count += 1

    if count >= 1:
        Up_indexlist.append(i)
        Up_artlist.append(Up_content[i])  

for i in range(len(Down_title)):
    count = 0
    if target in str(Down_title[i]):
        count += 1
    elif target in str(Down_content[i]):
        count += 1

    if count >= 1:
        Down_indexlist.append(i)
        Down_artlist.append(Down_content[i])  

for idx,i in enumerate(Up_artlist):
    Up_artlist[idx] = i.split("，")
clearlist(Up_artlist)  
for idx,i in enumerate(Down_artlist):
    Down_artlist[idx] = i.split("，")
clearlist(Down_artlist)

print("關於『"+target+"』的上漲文章總共出現『"+str(len(Up_indexlist))+"』篇")
print("關於『"+target+"』的下跌文章總共出現『"+str(len(Down_indexlist))+"』篇")

輸入要尋找的主題大立光
關於『大立光』的上漲文章總共出現『1872』篇
關於『大立光』的下跌文章總共出現『1664』篇


### Step2. 使用monpa切出適當字詞，並計算分析指標

In [9]:
def LongCut(long_sentence):  # Monpa切字
    seg = []
    for item in long_sentence.split(" "):
        if item != "\n": 
            seg.extend(monpa.cut(item))
    return seg


def GramCounter(Content):  # 計算TF、DF
    data_list = []
    data_monpa = []

    for i in range(len(Content)):
        data_monpa = []
        for j in Content[i]:
            data_monpa.extend(LongCut(j))
        data_list.append(data_monpa)

    m = data_list
    c = Counter()
    dfcount = Counter()
    for i in range(len(m)):
        for j in range(len(m[i])):
            c[m[i][j]]+=1

    for i in range(len(m)):
        k = list(set(m[i]))
        for j in range(len(k)):
            dfcount[k[j]]+=1 

    df = pd.DataFrame(dfcount.items(),columns = ["Term","DF"])    
    dfc = pd.DataFrame(c.items(),columns = ["Term","TF"])
    final = pd.merge(dfc,df, on="Term")
    
    return final


def OtherInfo(gram): # 計算W(t,f)、idft、TF-IDF
    N = 12735  # 全部有關大立光的文章
    
    WtdList = []   # 算W(t,d)
    Wtd = 1 + np.log10(np.array(gram["TF"])) 
    for i in Wtd:
        wtd = float("%0.3f" %i)
        WtdList.append(wtd)
        
    IDFtList = []   # 算 idf(t)
    IDFt = np.log10(N / np.array(gram["DF"]))   
    for j in IDFt:
        idft = float("%0.3f" %j)
        IDFtList.append(idft)
     
    TFIDFList = []   # 算TF-IDF
    TFIDF = np.array(WtdList) * np.array(IDFt)
    for k in TFIDF:
        tfidf = float("%0.3f" %k)
        TFIDFList.append(tfidf)
    
    gram["Wtf"] = WtdList
    gram["IDFt"] = IDFtList
    gram["TF-IDF"] = TFIDFList
    
    return gram

* 以下四段為切字與計算TF-IDF等指標，運算會消耗約25分鐘，我已經把結果存成兩個csv檔了，可以直接跳到輸入cvs檔的部分繼續運行

In [None]:
Up_gram = GramCounter(Up_artlist)
Down_gram = GramCounter(Down_artlist)

In [1]:
Up_gram = OtherInfo(Up_gram)
Up_gram

NameError: name 'OtherInfo' is not defined

In [40]:
Down_gram = OtherInfo(Down_gram)
Down_gram

Unnamed: 0,Term,TF,DF,Wtf,IDFt,TF-IDF
0,台積電,1494,882,4.174,1.160,4.840
1,奈米,74,40,2.869,2.503,7.181
2,完勝,1,1,1.000,4.105,4.105
3,三星,111,56,3.045,2.357,7.176
4,製程,57,37,2.756,2.537,6.991
5,強壓,1,1,1.000,4.105,4.105
6,對手,25,21,2.398,2.783,6.673
7,的,8234,1369,4.916,0.969,4.762
8,蘋果,1438,616,4.158,1.315,5.470
9,代工,71,60,2.851,2.327,6.634


* 先存為Excel檔，方便未來存取

In [41]:
# Up_gram.to_excel(r"monpa上漲字串.xlsx",index = False,header = True,encoding="utf-8")
# Down_gram.to_excel(r"monpa下跌字串.xlsx",index = False,header = True,encoding="utf-8")

In [10]:
Up_gram = pd.read_excel("monpa上漲字串.xlsx")
Down_gram = pd.read_excel("monpa下跌字串.xlsx")

### Step3. 刪除頻繁重複出現在上漲與下跌的字詞

* 為了加速計算，我們先設一個DF > 2的門檻，將出現次數極低的字詞先刪除

In [11]:
Up_gram = Up_gram[Up_gram["DF"] > 10].reset_index(drop = True)
Down_gram = Down_gram[Down_gram["DF"] > 10].reset_index(drop = True)

* 利用inner merge，找出重複字再分別從兩個字詞集刪除

In [12]:
Repeat = pd.merge(Up_gram, Down_gram, on = ["Term"], how = "inner")
Repeat

Unnamed: 0,Term,TF_x,DF_x,Wtf_x,IDFt_x,TF-IDF_x,TF_y,DF_y,Wtf_y,IDFt_y,TF-IDF_y
0,蘋果,1665,712,4.221,1.253,5.287,1438,616,4.158,1.315,5.470
1,銷售,321,229,3.507,1.745,6.120,284,208,3.453,1.787,6.170
2,無力,79,75,2.898,2.230,6.462,72,61,2.857,2.320,6.627
3,加上,1026,663,4.011,1.283,5.148,934,609,3.970,1.320,5.242
4,大陸,243,181,3.386,1.847,6.255,220,146,3.342,1.941,6.486
...,...,...,...,...,...,...,...,...,...,...,...
3172,心得,62,53,2.792,2.381,6.647,45,44,2.653,2.462,6.530
3173,填寫,43,38,2.633,2.525,6.649,26,25,2.415,2.707,6.538
3174,充實,20,14,2.301,2.959,6.808,11,11,2.041,3.064,6.253
3175,覺得,37,20,2.568,2.804,7.201,22,16,2.342,2.901,6.794


In [13]:
for i in range(len(Up_gram)):
    if Up_gram["Term"][i] in Repeat["Term"].values:
        Up_gram.drop([i], inplace = True)
Up_gram = Up_gram.sort_values(by = "TF-IDF", ascending = False)
Up_gram= Up_gram.reset_index(drop=True)
Up_gram

Unnamed: 0,Term,TF,DF,Wtf,IDFt,TF-IDF
0,周轉率,64,20,2.806,2.804,7.868
1,沖,40,13,2.602,2.991,7.783
2,林恩舟,35,12,2.544,3.026,7.698
3,舜宇,32,13,2.505,2.991,7.493
4,定期,29,12,2.462,3.026,7.450
...,...,...,...,...,...,...
620,扮,11,11,2.041,3.064,6.253
621,跳漲,11,11,2.041,3.064,6.253
622,光聯,11,11,2.041,3.064,6.253
623,富邦金國泰金,11,11,2.041,3.064,6.253


In [14]:
for i in range(len(Down_gram)):
    if Down_gram["Term"][i] in Repeat["Term"].values:
        Down_gram.drop([i], inplace = True)
Down_gram = Down_gram.sort_values(by = "TF-IDF", ascending = False)
Down_gram = Down_gram.reset_index(drop = True)
Down_gram

Unnamed: 0,Term,TF,DF,Wtf,IDFt,TF-IDF
0,億萬,73,20,2.863,2.804,8.028
1,週六,37,13,2.568,2.991,7.681
2,佔到,32,11,2.505,3.064,7.674
3,蔡明翰,28,14,2.447,2.959,7.240
4,貼息,26,13,2.415,2.991,7.223
...,...,...,...,...,...,...
268,區塊,11,11,2.041,3.064,6.253
269,連漲,11,11,2.041,3.064,6.253
270,服務業,11,11,2.041,3.064,6.253
271,映泰,11,11,2.041,3.064,6.253


* 現在我們確定了出現在上漲以及下跌字詞集裡的字了，分別有2237與1461個，我們將他們合併為一個叫做Words的List

In [15]:
# LPC = pd.read_excel("大立光新聞.xlsx")
Up_word = Up_gram["Term"][:].values.tolist()
Down_word = Down_gram["Term"][:].values.tolist()
word = Up_word + Down_word

In [16]:
print("Model Dimension:", len(word))

Model Dimension: 898


### Step 4. 篩選出大立光的訓練與測試集文章

In [17]:
# 找有股價漲跌達3%以上的前兩日日期
Up_publish_date = []
Down_publish_date = []

for i in range(len(Up_date)):
    for j in range(n):
        Up_publish_date.append(Up_date[i] - pd.Timedelta(days = j+1))
for i in range(len(Down_date)):
    for j in range(n):
        Down_publish_date.append(Down_date[i] - pd.Timedelta(days = j+1))

Up_publish_date = list(set(Up_publish_date))
Down_publish_date = list(set(Down_publish_date))

LPC = articles[(articles["content"].str.contains("大立光") | articles["title"].str.contains("大立光"))]  # 所有大立光的文章
LPC = LPC[LPC["post_time"].isin(Up_publish_date + Down_publish_date)].reset_index(drop = True)  # 所有大立光出現在有2%漲幅前兩日的文章
LPC_stock = stockprice[stockprice["年月日"].isin(list(set(LPC["post_time"].tolist())))]

# # # 把股價資料集和新聞資料集做合併，刪除不需要的columns
LPC_stock.rename(columns={"年月日":"post_time"}, inplace = True)
LPC = pd.merge(LPC, LPC_stock, on = ["post_time"], how = "inner")
LPC = LPC.dropna().reset_index(drop = True)
LPC = LPC.iloc[:, [0,1,2,14]]

# # # 將股價波動轉為dummy，只取波動3％以上，其餘刪除
LPC.loc[LPC["波動(漲跌幅)"] >= 0.03, "波動(漲跌幅)"] = 1
LPC.loc[LPC["波動(漲跌幅)"] <= -0.03, "波動(漲跌幅)"] = 0
LPC = LPC[LPC["波動(漲跌幅)"].isin([0,1])]
LPC = LPC.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [18]:
LPC

Unnamed: 0,post_time,title,content,波動(漲跌幅)
0,2016-01-06,《各報要聞》蘋果不甜，大立光12月營收掉3成,蘋果iPhone 6s/6s Plus銷售無力，加上大陸在內的其餘品牌業績乏善可陳，全球手機...,0.0
1,2016-01-06,台股盤前－兩岸貨貿+陸客中轉選前釋利多 可望收復8100,台股昨(5)日跌破8100點關卡，下跌39點，指數收在8075點，成交值802億元。歐美股市...,0.0
2,2016-01-06,《外資》台股電子股，麥格理喊進6檔,麥格理資本證券昨（5）日發表2016年大中華科技產業研究報告，認為產品轉型與新產品週期將重塑...,0.0
3,2016-01-06,《今日焦點新聞》央行穩匯市，掛三大保證,時報-今日焦點新聞<BR> 國內頭條：<BR> 1.蘋果不甜，大立光12月營收掉3成。(工商...,0.0
4,2016-01-06,【Ｙ早報】央行穩匯市 掛三大保證,（開盤日9:00出刊）美股止跌，道瓊小漲9點；選前政策放利多，陸客來台中轉放行，航空、免稅店...,0.0
...,...,...,...,...
1063,2018-02-06,[請益] 跌停家數117家，來聊聊錯殺股吧！,今天跌停家數117家，\n-5%內都算強勢股了\n\n新聞，事後理由一推。但還是覺得崩的莫名...,0.0
1064,2018-02-06,外資落井下石，股王股價將會如何反應?,當昨天美股大跌，<BR>台灣股市跟跌，<BR>股王大立光也大跌4.5%，<BR>–180元，...,0.0
1065,2018-02-07,Re: [標的] 個人進場 2330台積電 @237.5,先來一張盤後圖接回安心留倉的期貨部位後..\n（沒po文）\nhttps://i.imgur...,1.0
1066,2018-02-07,[閒聊] 2018/02/07 盤後閒聊,台 股 10551.54 ▲ 147.54 (1.42%) 1735.44億\n台指0...,1.0


* 接著我們將這1068篇的文章，分別以字串方式切字斷句

In [None]:
news = LPC["content"].tolist()
news_split = []
for i in range(len(news)):
    news1 = LongCut(news[i])
    str1 = " ".join(news1)
    news_split.append(str1)
news_split

* 切詞部分需要跑大約10分多鐘，所以我把檔案存成"大立光訓練與測試集文章.txt"，之後要跑直接open file就不用重新切字

In [18]:
# with open('大立光訓練與測試集文章.txt', 'w') as f:
#     for item in news_split:
#         f.write("%s\n" % item)

In [19]:
with open("大立光訓練與測試集文章.txt", "r", encoding = "utf-8") as text:
    news_split = text.readlines()
text.close()

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(vocabulary = word)
vectorizer.fit_transform(word)
tf = vectorizer.transform(news_split)
vec = pd.DataFrame(tf.toarray(), columns = vectorizer.get_feature_names()) 
vec.head()

Unnamed: 0,周轉率,沖,林恩舟,舜宇,定期,定額,專利,光電類,平均值,停產,...,南亞科華邦電,落定,初期,塵埃,曇花一現,區塊,連漲,服務業,映泰,指口櫃
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


* 我們先從完全不降維開始做

## ＊第二題：將前述兩批文章作為訓練資料及測試資料，使用監督式學習之分類演算法，評估分類模型之準確率

* 我們要用來分析的大立光漲跌文章為以下1755篇與大立光相關的文章：

In [21]:
from sklearn.model_selection import train_test_split
X = vec
Y = LPC["波動(漲跌幅)"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

* 針對文字矩陣，我們試用Variance Threshold刪除掉變異數過小的特徵值，降低維度

### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty = "l2", solver = "lbfgs", max_iter = 100, random_state = 101)
lr.fit(X_train, Y_train)
y_pred = lr.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

Accuracy： 0.7523364485981309
Recall： 0.4782608695652174
Precision： 0.66


array([[128,  17],
       [ 36,  33]])

### KNN

In [23]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_test)

print(pd.DataFrame(y_pred).iloc[:, 0].value_counts())

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

0.0    201
1.0     13
Name: 0, dtype: int64
Accuracy： 0.7009345794392523
Recall： 0.13043478260869565
Precision： 0.6923076923076923


array([[141,   4],
       [ 60,   9]])

### SVM

In [24]:
from sklearn.svm import SVC
svc = SVC(C = 1, kernel = "rbf", gamma = "scale", random_state = 101)
svc.fit(X_train, Y_train)
y_pred = svc.predict(X_test)

print(pd.DataFrame(y_pred).iloc[:, 0].value_counts())

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

0.0    201
1.0     13
Name: 0, dtype: int64
Accuracy： 0.7383177570093458
Recall： 0.18840579710144928
Precision： 1.0


array([[145,   0],
       [ 56,  13]])

### Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion = "gini", random_state = 101)
dtc.fit(X_train, Y_train)
y_pred = dtc.predict(X_test)

print(pd.DataFrame(y_pred).iloc[:, 0].value_counts())

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

0.0    134
1.0     80
Name: 0, dtype: int64
Accuracy： 0.6308411214953271
Recall： 0.5072463768115942
Precision： 0.4375


array([[100,  45],
       [ 34,  35]])

### Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 100, criterion = "gini", random_state = 101)
rfc.fit(X_train, Y_train)
y_pred = rfc.predict(X_test)

print(pd.DataFrame(y_pred).iloc[:, 0].value_counts())

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score
print("Accuracy：",accuracy_score(Y_test, y_pred))
print("Recall：", recall_score(Y_test, y_pred))
print("Precision：", precision_score(Y_test, y_pred))
confusion_matrix(Y_test, y_pred)

0.0    134
1.0     80
Name: 0, dtype: int64
Accuracy： 0.6682242990654206
Recall： 0.5652173913043478
Precision： 0.4875


array([[104,  41],
       [ 30,  39]])

* 在沒有降維的情況下，準確度最高的是Logistic和Random Foreset，準確度約7成，接著我們嘗試用Variance Threshold做feature selection降低維度，通過對dataframe做describe可以找出變異數的最大與最小值落在(0.046, 0)，所以我們會針對這個區間內的值做微調，找出最佳維度

In [27]:
var_max = max((vec.describe().iloc[2, :]) ** 2)
var_min = min((vec.describe().iloc[2, :]) ** 2)

05/11/2020 20:03:39 - INFO - numexpr.utils - NumExpr defaulting to 4 threads.


In [28]:
np.arange(var_min, var_max, 0.002)

array([0.   , 0.002, 0.004, 0.006, 0.008, 0.01 , 0.012, 0.014, 0.016,
       0.018, 0.02 , 0.022, 0.024, 0.026, 0.028, 0.03 , 0.032, 0.034,
       0.036, 0.038, 0.04 , 0.042, 0.044, 0.046, 0.048, 0.05 , 0.052,
       0.054, 0.056, 0.058, 0.06 , 0.062, 0.064, 0.066, 0.068, 0.07 ,
       0.072, 0.074, 0.076, 0.078, 0.08 , 0.082, 0.084, 0.086, 0.088,
       0.09 , 0.092, 0.094, 0.096, 0.098, 0.1  , 0.102, 0.104, 0.106,
       0.108, 0.11 , 0.112, 0.114, 0.116, 0.118, 0.12 , 0.122, 0.124,
       0.126, 0.128, 0.13 , 0.132, 0.134])

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression(penalty = "l2", solver = "lbfgs", max_iter = 100, random_state = 101)
knn = KNeighborsClassifier(n_neighbors = 10)
svc = SVC(C = 1, kernel = "rbf", gamma = "scale", random_state = 101)
dtc = DecisionTreeClassifier(criterion = "gini", random_state = 101)
rfc = RandomForestClassifier(n_estimators = 100, criterion = "gini", random_state = 101)

def ModelAccuracy(X_train, X_test, Y_train, Y_test):
    lr.fit(X_train, Y_train)
    knn.fit(X_train, Y_train)
    svc.fit(X_train, Y_train)
    dtc.fit(X_train, Y_train)
    rfc.fit(X_train, Y_train)
    
    LR_acc = accuracy_score(Y_test, lr.predict(X_test))
    KNN_acc = accuracy_score(Y_test, knn.predict(X_test))
    SVC_acc = accuracy_score(Y_test, svc.predict(X_test))
    DTC_acc = accuracy_score(Y_test, dtc.predict(X_test))
    RFC_acc = accuracy_score(Y_test, rfc.predict(X_test))         
    
    return LR_acc, KNN_acc, SVC_acc, DTC_acc, RFC_acc

In [79]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

LR_Best_Dimension = 3000
LR_Best_Score = 0
KNN_Best_Dimension = 3000
KNN_Best_Score = 0
SVC_Best_Dimension = 3000
SVC_Best_Score = 0
DTC_Best_Dimension = 3000
DTC_Best_Score = 0
RFC_Best_Dimension = 3000
RFC_Best_Score = 0

for i in np.arange(var_min, var_max, 0.002):
    vt = VarianceThreshold(threshold = i)
    vt_vec = vt.fit_transform(vec)
    
    X = vt_vec
    Y = LPC["波動(漲跌幅)"]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
    
    LR_acc, KNN_acc, SVC_acc, DTC_acc, RFC_acc = ModelAccuracy(X_train, X_test, Y_train, Y_test)
    if  LR_acc >= LR_Best_Score:
        LR_Best_Score = LR_acc
        LR_Best_Dimension = vt_vec.shape[1]
    if  KNN_acc >= KNN_Best_Score:
        KNN_Best_Score = KNN_acc
        KNN_Best_Dimension = vt_vec.shape[1]
    if  SVC_acc >= SVC_Best_Score:
        SVC_Best_Score = SVC_acc
        SVC_Best_Dimension = vt_vec.shape[1]
    if  DTC_acc >= DTC_Best_Score:
        DTC_Best_Score = DTC_acc
        DTC_Best_Dimension = vt_vec.shape[1]
    if  RFC_acc >= RFC_Best_Score:
        RFC_Best_Score = RFC_acc
        RFC_Best_Dimension = vt_vec.shape[1]
        
print("Logistic Regression Dimension:", LR_Best_Dimension, "Accuracy:", LR_Best_Score)
print("KNN Dimension:", KNN_Best_Dimension, "Accuracy:", KNN_Best_Score)
print("SVM Dimension:", SVC_Best_Dimension, "Accuracy:", SVC_Best_Score)
print("Decision Tree Dimension:", DTC_Best_Dimension, "Accuracy:", DTC_Best_Score)
print("Random Forest: Dimension =", RFC_Best_Dimension, ", Accuracy =", RFC_Best_Score)

Logistic Regression Dimension: 330 Accuracy: 0.7616822429906542
KNN Dimension: 46 Accuracy: 0.705607476635514
SVM Dimension: 245 Accuracy: 0.7383177570093458
Decision Tree Dimension: 28 Accuracy: 0.6962616822429907
Random Forest: Dimension = 28 , Accuracy = 0.7102803738317757


* 結論來看，Random Forest還是有最佳的準確度，且相對於其他模型能夠使用更高維度的資料（因為Random Forest本身就有降維的作用，所以這樣的結果也算合理）