In [48]:
import pickle
import numpy as np
import random
import pandas as pd
from pprint import pprint
from pyod.models.iforest import IForest
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.lscp import LSCP
from pyod.models.abod import ABOD
from tqdm.notebook import tqdm

In [88]:
class EnsembleAD():
    def __init__(self, name, threshold=0.5):
        print("Initializing...")
        
        vec_path = './data/doc_vec/title_' + name + '_vec.np'
        csv_path = './data/company/' + name + '.csv'
        title_path = './data/company/title_' + name + '.txt'
        
        # load titles' vec
        with open(vec_path, 'rb') as f:
            self.X = pickle.load(f)
        
        # number of samples
        self.N = len(self.X)
        # dimension of feature
        self.D = len(self.X[0])
        #Iteration rounds
#         self.T = T
            
        #load csv data
        self.df = pd.read_csv(csv_path)
        
        #load titles
        self.titles = []
        with open(title_path, "r" ) as f:
            for line in f:
                self.titles.append(line)
                
#         #Ensemble Anomaly Detection
#         self.iforest_prob = self.AD_iforest()
#         self.LOF_prob = self.AD_LOF()
#         self.KNN_prob = self.AD_KNN()
        
#         #Adding Anomaly probs together
#         self.PROB = self.iforest_prob + self.LOF_prob + self.KNN_prob
#         self.PROB = self.PROB[:, 1]
        
        self.PROB = self.AD_LSCP()[:, 1]
        
        #Getting Anomaly and Negative-sentiment index
        self.ano_idx = [i for i, val in enumerate(self.PROB) if val > threshold]
        self.neg_idx = self.Detect_negetive()
        print("Anomaly idx:{}".format(self.ano_idx))
        print("Negtive senti:{}".format(self.neg_idx))
        
        self.Print_Neg_Anomaly()
                
    def Generate_indicies(self):
        d = random.randint(self.D//2, self.D-2)
        indicies = random.sample(range(0, self.D - 1), d)
        return sorted(indicies)
    
    def Generate_corped_x(self):
        indices = self.Generate_indicies()
        return self.X[:, indices]
    
    def AD_iforest(self):
        print("detecting with iForest...")
        PR_iforest = []
        for t in tqdm(range(self.T)):
            X_corped = self.Generate_corped_x()
            clf_iforest = IForest().fit(X_corped)
            Anomaly_proba = clf_iforest.predict_proba(X_corped)
            PR_iforest.append(Anomaly_proba)
        PR_iforest = np.array(PR_iforest)
        iforest_prob = np.mean(PR_iforest, axis=0)
        
        return iforest_prob
    
    def AD_LOF(self):
        print("detecting with LOF...")
        PR_LOF = []
        for t in tqdm(range(self.T)):
            X_corped = self.Generate_corped_x()
            clf_LOF = LOF(n_neighbors=5).fit(X_corped)
            Anomaly_proba = clf_LOF.predict_proba(X_corped)
            PR_LOF.append(Anomaly_proba)
        PR_LOF = np.array(PR_LOF)
        LOF_prob = np.mean(PR_LOF, axis=0)
        
        return LOF_prob
    
    def AD_KNN(self):
        print("detecting with KNN...")
        PR_KNN = []
        for t in tqdm(range(self.T)):
            X_corped = self.Generate_corped_x()
            clf_KNN = KNN(contamination=0.1, n_neighbors=5, method='largest').fit(X_corped)
            Anomaly_proba = clf_KNN.predict_proba(X_corped)
            PR_KNN.append(Anomaly_proba)
        PR_KNN = np.array(PR_KNN)
        KNN_prob = np.mean(PR_KNN, axis=0)
        
        return KNN_prob
    
    def AD_LSCP(self):
        print("detecting with LSCP...")
        PR_LSCP = []
        detector_list = [LOF(), IForest(), KNN()]
#         for t in tqdm(range(self.T)):
#             X_corped = self.Generate_corped_x()
#             clf_LSCP = LSCP(detector_list, n_bins=3).fit(X_corped)
#             Anomaly_proba = clf_LSCP.predict_proba(X_corped)
#             PR_LSCP.append(Anomaly_proba)
#         PR_LSCP = np.array(PR_LSCP)
#         LSCP_prob = np.mean(PR_LSCP, axis=0)
        local_region_size =  30 if self.N > 30 else self.N // 2
        clf_LSCP = LSCP(detector_list, n_bins=3, local_region_size=local_region_size).fit(self.X)
        Anomaly_proba = clf_LSCP.predict_proba(self.X)
        LSCP_prob = np.array(Anomaly_proba)
        
        return LSCP_prob
    
        
    
    def Detect_negetive(self):
        return self.df.index[self.df['sentiment'] == -1].tolist()
    
    def Print_Neg_Anomaly(self):
        Neg_Ano_idx = np.intersect1d(self.ano_idx, self.neg_idx)
        print("Anomaly and Negavitve sentiment:{}".format(Neg_Ano_idx))
        for idx in Neg_Ano_idx:
            pprint(self.titles[idx])
        

In [89]:
apple = EnsembleAD("apple", threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly idx:[3, 7, 8, 10, 15, 20, 22, 24, 26, 32, 35, 39, 43, 49, 52, 53, 58, 63, 75, 77, 83, 88, 119, 135, 172, 192, 202, 206, 213, 217, 218, 229, 230, 232, 242, 244, 248, 253, 260]
Negtive senti:[0, 5, 9, 19, 20, 23, 27, 44, 49, 57, 59, 60, 62, 63, 65, 69, 71, 74, 81, 83, 84, 94, 96, 108, 111, 119, 121, 129, 136, 140, 146, 154, 155, 156, 161, 163, 175, 181, 216, 217, 220, 225, 228, 229, 232, 238, 241, 245, 265, 268]
Anomaly and Negavitve sentiment:[ 20  49  63  83 119 217 229 232]
'苹果困局：股价飙涨背后的转型难题\n'
'苹果Q4每股收益及营收超出预期\n'
'苹果供应商遭遇冰火两重天：iPhone供应商业绩变脸 穿戴设备供应商增长明显\n'
'JDI连续第11个季度录得净亏损：苹果认栽了\n'
'苹果：自2009年以来公司每年在维修业务上都是亏钱的\n'
'传苹果或削减高端iPhone产量\n'
'苹果股价盘后一度跳水 跌幅近5%\n'
'苹果多头减持幅度较大\n'


In [54]:
apple = EnsembleAD("apple", 10, threshold=0.5)

Initializing...
detecting with LSCP...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Anomaly idx:[3, 7, 8, 10, 15, 20, 24, 26, 32, 35, 39, 43, 44, 49, 52, 53, 58, 63, 66, 75, 77, 83, 88, 115, 118, 119, 135, 137, 160, 165, 172, 192, 202, 206, 213, 217, 218, 227, 229, 230, 232, 236, 242, 244, 248, 250, 253, 260]
Negtive senti:[0, 5, 9, 19, 20, 23, 27, 44, 49, 57, 59, 60, 62, 63, 65, 69, 71, 74, 81, 83, 84, 94, 96, 108, 111, 119, 121, 129, 136, 140, 146, 154, 155, 156, 161, 163, 175, 181, 216, 217, 220, 225, 228, 229, 232, 238, 241, 245, 265, 268]
Anomaly and Negavitve sentiment:[ 20  44  49  63  83 119 217 229 232]
'苹果困局：股价飙涨背后的转型难题\n'
'苹果市场份额下滑致全年预亏 百邦科技上市逾一年业绩“变脸”\n'
'苹果Q4每股收益及营收超出预期\n'
'苹果供应商遭遇冰火两重天：iPhone供应商业绩变脸 穿戴设备供应商增长明显\n'
'JDI连续第11个季度录得净亏损：苹果认栽了\n'
'苹果：自2009年以来公司每年在维修业务上都是亏钱的\n'
'传苹果或削减高端iPhone产量\n'
'苹果股价盘后一度跳水 跌幅近5%\n'
'苹果多头减持幅度较大\n'


In [47]:
apple = EnsembleAD("apple", 10, threshold=0.5)

Initializing...
detecting with LSCP...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Anomaly idx:[3, 7, 8, 15, 20, 24, 26, 32, 39, 43, 49, 52, 53, 58, 75, 77, 83, 88, 119, 135, 172, 202, 206, 213, 217, 218, 229, 230, 232, 248, 253, 260]
Negtive senti:[0, 5, 9, 19, 20, 23, 27, 44, 49, 57, 59, 60, 62, 63, 65, 69, 71, 74, 81, 83, 84, 94, 96, 108, 111, 119, 121, 129, 136, 140, 146, 154, 155, 156, 161, 163, 175, 181, 216, 217, 220, 225, 228, 229, 232, 238, 241, 245, 265, 268]
Anomaly and Negavitve sentiment:[ 20  49  83 119 217 229 232]
'苹果困局：股价飙涨背后的转型难题\n'
'苹果Q4每股收益及营收超出预期\n'
'JDI连续第11个季度录得净亏损：苹果认栽了\n'
'苹果：自2009年以来公司每年在维修业务上都是亏钱的\n'
'传苹果或削减高端iPhone产量\n'
'苹果股价盘后一度跳水 跌幅近5%\n'
'苹果多头减持幅度较大\n'


In [79]:
microsoft = EnsembleAD("microsoft", 10, threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly idx:[0, 10, 21, 22, 24, 34, 38, 51, 57, 59, 75, 85, 88, 89, 90]
Negtive senti:[12, 16, 18, 24, 25, 30, 42, 44, 45, 53, 55, 58, 59, 72, 74, 76, 77, 78, 82, 92]
Anomaly and Negavitve sentiment:[24 59]
'花旗：预计Azure 2020年增速放缓 予微软（MSFT.US）“中性”评级\n'
'美股科技股表现分化 微软刷新历史新高\n'


In [80]:
zzd = EnsembleAD("zzd", 10, threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly idx:[3]
Negtive senti:[3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 23, 25, 26, 27]
Anomaly and Negavitve sentiment:[3]
'【图解季报】獐子岛2019年前三季度净利润-3403万元 同比下降245.53%\n'




In [35]:
zzd = EnsembleAD("zzd", 10, threshold=1.2)

Initializing...
detecting with iForest...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


detecting with LOF...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


detecting with KNN...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Anomaly idx:[3]
Negtive senti:[3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 23, 25, 26, 27]
Anomaly and Negavitve sentiment:[3]
'【图解季报】獐子岛2019年前三季度净利润-3403万元 同比下降245.53%\n'


In [81]:
vanke = EnsembleAD("vanke", 10, threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly idx:[9, 14, 15, 18, 19, 24, 25, 26, 30, 31, 34, 43]
Negtive senti:[12, 13, 14, 15, 17, 20, 21, 28, 30, 32, 35, 36, 37, 38, 40, 42, 43]
Anomaly and Negavitve sentiment:[14 15 30 43]
'宝能系减持万科6.49亿元\n'
'华侨城拟转让万科合资公司50%股权及债权 挂牌底价23.3亿元\n'
'【万科A：钜盛华所持4700万股公司A股解除质押】万科A(000002)12月13日晚间公告，12月12日，钜盛华将持有并质押给招商证券的万科4700万股无限售流通A股办理解除质押；此次解押股数占钜盛华所持股份的10.01%，占万科总股本的0.42%。截至目前，钜盛华及其一致行动人前海人寿累计质押股份4.45亿股，占其所持股份的63.78%，占公司总股本的3.94%。（证券时报）\n'
'宝能系减持万科股票5.65亿股 持股比例不足5%\n'


In [36]:
vanke = EnsembleAD("vanke", 10, threshold=1.2)

Initializing...
detecting with iForest...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


detecting with LOF...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


detecting with KNN...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Anomaly idx:[2, 3, 5, 6, 9, 14, 15, 18, 19, 24, 25, 26, 27, 30, 31, 34, 39, 43]
Negtive senti:[12, 13, 14, 15, 17, 20, 21, 28, 30, 32, 35, 36, 37, 38, 40, 42, 43]
Anomaly and Negavitve sentiment:[14 15 30 43]
'宝能系减持万科6.49亿元\n'
'华侨城拟转让万科合资公司50%股权及债权 挂牌底价23.3亿元\n'
'【万科A：钜盛华所持4700万股公司A股解除质押】万科A(000002)12月13日晚间公告，12月12日，钜盛华将持有并质押给招商证券的万科4700万股无限售流通A股办理解除质押；此次解押股数占钜盛华所持股份的10.01%，占万科总股本的0.42%。截至目前，钜盛华及其一致行动人前海人寿累计质押股份4.45亿股，占其所持股份的63.78%，占公司总股本的3.94%。（证券时报）\n'
'宝能系减持万科股票5.65亿股 持股比例不足5%\n'


In [82]:
tesla = EnsembleAD("tesla", 10, threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly idx:[6, 7, 12, 16, 17, 35, 38, 41, 45, 46, 53, 56, 61, 82, 147, 148, 158, 162, 163, 167, 169, 173, 174, 177]
Negtive senti:[1, 2, 7, 29, 38, 39, 47, 56, 73, 77, 83, 89, 94, 98, 106, 107, 111, 113, 114, 127, 139, 146, 149, 152, 153, 154, 161, 164, 183]
Anomaly and Negavitve sentiment:[ 7 38 56]
'10月18日特斯拉板块跌幅达2%\n'
'特斯拉第三季度在美销售额暴跌39% 中国市场大增64%\n'
'11月11日特斯拉板块跌幅达2%\n'


In [37]:
tesla = EnsembleAD("tesla", 10, threshold=1.2)

Initializing...
detecting with iForest...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


detecting with LOF...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


detecting with KNN...


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Anomaly idx:[6, 12, 17, 32, 35, 38, 39, 41, 45, 46, 50, 53, 61, 77, 97, 110, 146, 151, 158, 162, 163, 167, 173, 174, 177]
Negtive senti:[1, 2, 7, 29, 38, 39, 47, 56, 73, 77, 83, 89, 94, 98, 106, 107, 111, 113, 114, 127, 139, 146, 149, 152, 153, 154, 161, 164, 183]
Anomaly and Negavitve sentiment:[ 38  39  77 146]
'特斯拉第三季度在美销售额暴跌39% 中国市场大增64%\n'
'特斯拉第三季度销量下降39% 其美国增长将面临强劲挑战\n'
'特斯拉新泽西州一超级充电站起火 起火原因尚不清楚\n'
'特斯拉法律总顾问Jonathan Chang离职 上任不到一年\n'


In [83]:
jd = EnsembleAD("jd", 10, threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly idx:[3, 11, 13, 20, 23, 35, 38]
Negtive senti:[7, 18, 25, 33, 34, 36, 37]
Anomaly and Negavitve sentiment:[]
