In [48]:
import pickle
import numpy as np
import random
import pandas as pd
from pprint import pprint
from pyod.models.iforest import IForest
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.lscp import LSCP
from pyod.models.abod import ABOD
from tqdm.notebook import tqdm

In [106]:
class EnsembleAD():
    """Anomaly detection over one company's title vectors, crossed with sentiment.

    On construction the object loads three per-company files, fits an LSCP
    ensemble (LOF + IForest + KNN) on the title vectors, ranks the rows whose
    anomaly probability exceeds ``threshold`` and prints a short report.

    Attributes set by ``__init__``:
        X          -- object unpickled from the ``*_vec.np`` file (indexable,
                      one feature vector per title).
        N, D       -- ``len(X)`` and ``len(X[0])``.
        threshold  -- probability cut-off used to call a row anomalous.
        df         -- DataFrame read from the company CSV (needs a
                      ``sentiment`` column).
        titles     -- one string per line of the titles file, without the
                      line terminator.
        Ano_Prob   -- column 1 of the array returned by ``AD_LSCP``.
        neg_idx    -- index labels of rows with ``sentiment == -1``.
        ano        -- ``(row, probability)`` pairs above the threshold,
                      highest probability first.
        ano_idx    -- the row numbers of ``ano`` in the same order.
    """

    def __init__(self, name, threshold=0.5, data_dir='./data', random_state=None):
        """Load the data for ``name``, run detection and print the report.

        Args:
            name: company key used to build the three file names.
            threshold: rows with anomaly probability strictly greater than
                this value are reported as anomalies.
            data_dir: root folder holding ``doc_vec/`` and ``company/``.
                The default reproduces the previously hard-coded paths.
            random_state: optional seed forwarded to the stochastic
                detectors so a re-run can reproduce the same ranking.
                ``None`` keeps the unseeded behaviour.
        """
        print("Initializing...")

        vec_path = data_dir + '/doc_vec/title_' + name + '_vec.np'
        csv_path = data_dir + '/company/' + name + '.csv'
        title_path = data_dir + '/company/title_' + name + '.txt'

        # load titles' vec
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at vector files you produced yourself.
        with open(vec_path, 'rb') as f:
            self.X = pickle.load(f)

        # number of samples
        self.N = len(self.X)
        # dimension of feature
        self.D = len(self.X[0])
        # Anomaly prob threshold
        self.threshold = threshold
        # seed for the stochastic detectors (None = unseeded)
        self.random_state = random_state

        # load csv data
        self.df = pd.read_csv(csv_path)

        # load titles
        # Explicit UTF-8: the titles are non-ASCII text and the platform
        # default encoding is not UTF-8 everywhere.
        # Blank lines are kept on purpose so that list positions stay aligned
        # with the row numbers used for X and df.
        with open(title_path, "r", encoding="utf-8") as f:
            self.titles = [line.rstrip('\r\n') for line in f]

        # Getting Anomaly Probabilities
        self.Ano_Prob = self.AD_LSCP()[:, 1]

        # Getting Anomaly and Negative-sentiment index
        self.neg_idx = self.Get_negetive()
        # Build ano / ano_idx explicitly instead of relying on the printing
        # method to create them as a side effect.
        self._rank_anomalies()

        self.Print_Neg_Anomaly()
        print("Anomaly idx:{}".format(self.ano_idx))
        print("Negtive senti:{}".format(self.neg_idx))

    def AD_LSCP(self):
        """Fit LSCP on ``self.X`` and return its ``predict_proba`` output.

        Returns:
            numpy array built from ``predict_proba(self.X)``; ``__init__``
            reads column 1 of it as the anomaly probability.
        """
        print("detecting with LSCP...")
        # getattr keeps this usable on instances created without __init__.
        seed = getattr(self, 'random_state', None)
        detector_list = [LOF(), IForest(random_state=seed), KNN()]
        # Use a neighbourhood of 30 when there are more than 30 samples,
        # otherwise half of the sample count.
        local_region_size = 30 if self.N > 30 else self.N // 2
        clf_LSCP = LSCP(detector_list, n_bins=3,
                        local_region_size=local_region_size,
                        random_state=seed)
        clf_LSCP.fit(self.X)

        return np.array(clf_LSCP.predict_proba(self.X))

    def Get_negetive(self):
        """Return the index labels of ``self.df`` rows whose sentiment is -1."""
        return self.df.index[self.df['sentiment'] == -1].tolist()

    def _rank_anomalies(self):
        """Set ``self.ano`` and ``self.ano_idx`` from ``Ano_Prob`` and ``threshold``."""
        above = [(i, val) for i, val in enumerate(self.Ano_Prob) if val > self.threshold]
        # descending sort by anomaly probability (stable for ties)
        self.ano = sorted(above, key=lambda pair: pair[1], reverse=True)
        self.ano_idx = [i for (i, _) in self.ano]

    def Print_Neg_Anomaly(self):
        """Print the titles that are both anomalous and negative.

        Re-ranks the anomalies first, so the report reflects the current
        ``threshold``. When no anomaly has negative sentiment, the titles of
        up to the three most probable anomalies are printed instead.
        """
        self._rank_anomalies()
        # intersect1d returns sorted unique values, so the titles below come
        # out in row order rather than probability order.
        Neg_Ano_idx = np.intersect1d(self.ano_idx, self.neg_idx)
        print("Anomaly and Negavitve sentiment:{}".format(Neg_Ano_idx))
        if Neg_Ano_idx.size == 0:
            for idx in self.ano_idx[:3]:
                pprint(self.titles[idx])
        else:
            for idx in Neg_Ano_idx:
                pprint(self.titles[idx])


In [107]:
# Apple: constructing the detector loads the data, runs LSCP and prints the report.
apple = EnsembleAD(name="apple", threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly and Negavitve sentiment:[ 49  83 119 229 232]
'苹果Q4每股收益及营收超出预期\n'
'JDI连续第11个季度录得净亏损：苹果认栽了\n'
'苹果：自2009年以来公司每年在维修业务上都是亏钱的\n'
'苹果股价盘后一度跳水 跌幅近5%\n'
'苹果多头减持幅度较大\n'
Anomaly idx:[24, 135, 52, 206, 58, 15, 32, 253, 43, 172, 3, 213, 75, 202, 7, 260, 229, 83, 39, 77, 230, 53, 49, 232, 88, 8, 119, 35]
Negtive senti:[0, 5, 9, 19, 20, 23, 27, 44, 49, 57, 59, 60, 62, 63, 65, 69, 71, 74, 81, 83, 84, 94, 96, 108, 111, 119, 121, 129, 136, 140, 146, 154, 155, 156, 161, 163, 175, 181, 216, 217, 220, 225, 228, 229, 232, 238, 241, 245, 265, 268]


In [110]:
# Microsoft: constructing the detector loads the data, runs LSCP and prints the report.
microsoft = EnsembleAD(name="microsoft", threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly and Negavitve sentiment:[24 25 59 82]
'花旗：预计Azure 2020年增速放缓 予微软（MSFT.US）“中性”评级\n'
'微软Azure增速下滑不是重点 智能云业务存在持续扩张空间\n'
'美股科技股表现分化 微软刷新历史新高\n'
'超4400万微软用户在其它平台使用相同密码遭泄露\n'
Anomaly idx:[59, 88, 24, 0, 90, 85, 38, 21, 75, 57, 10, 51, 22, 34, 89, 47, 19, 17, 67, 7, 25, 82, 15, 50, 33]
Negtive senti:[12, 16, 18, 24, 25, 30, 42, 44, 45, 53, 55, 58, 59, 72, 74, 76, 77, 78, 82, 92]


In [111]:
# "zzd" dataset: constructing the detector loads the data, runs LSCP and prints the report.
zzd = EnsembleAD(name="zzd", threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly and Negavitve sentiment:[3]
'【图解季报】獐子岛2019年前三季度净利润-3403万元 同比下降245.53%\n'
Anomaly idx:[3]
Negtive senti:[3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 23, 25, 26, 27]




In [112]:
# Vanke: constructing the detector loads the data, runs LSCP and prints the report.
vanke = EnsembleAD(name="vanke", threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly and Negavitve sentiment:[14 15 30 43]
'宝能系减持万科6.49亿元\n'
'华侨城拟转让万科合资公司50%股权及债权 挂牌底价23.3亿元\n'
'【万科A：钜盛华所持4700万股公司A股解除质押】万科A(000002)12月13日晚间公告，12月12日，钜盛华将持有并质押给招商证券的万科4700万股无限售流通A股办理解除质押；此次解押股数占钜盛华所持股份的10.01%，占万科总股本的0.42%。截至目前，钜盛华及其一致行动人前海人寿累计质押股份4.45亿股，占其所持股份的63.78%，占公司总股本的3.94%。（证券时报）\n'
'宝能系减持万科股票5.65亿股 持股比例不足5%\n'
Anomaly idx:[25, 19, 34, 9, 18, 24, 43, 15, 26, 31, 14, 30]
Negtive senti:[12, 13, 14, 15, 17, 20, 21, 28, 30, 32, 35, 36, 37, 38, 40, 42, 43]


In [113]:
# Tesla: constructing the detector loads the data, runs LSCP and prints the report.
tesla = EnsembleAD(name="tesla", threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly and Negavitve sentiment:[ 7 38 56]
'10月18日特斯拉板块跌幅达2%\n'
'特斯拉第三季度在美销售额暴跌39% 中国市场大增64%\n'
'11月11日特斯拉板块跌幅达2%\n'
Anomaly idx:[162, 7, 56, 46, 41, 82, 147, 148, 169, 53, 12, 167, 61, 174, 38, 6, 163, 97, 45, 177, 158, 35]
Negtive senti:[1, 2, 7, 29, 38, 39, 47, 56, 73, 77, 83, 89, 94, 98, 106, 107, 111, 113, 114, 127, 139, 146, 149, 152, 153, 154, 161, 164, 183]


In [114]:
# JD: constructing the detector loads the data, runs LSCP and prints the report.
jd = EnsembleAD(name="jd", threshold=0.5)

Initializing...
detecting with LSCP...
Anomaly and Negavitve sentiment:[]
'中金：预期京东（JD.US）3季度业绩超预期 维持其“跑赢行业”评级\n'
'京东2019三季报：净赚1348亿元\n'
'募资超100亿！港股年内第二大IPO来了 曾获京东领投21亿\n'
Anomaly idx:[11, 23, 3, 13, 20, 38, 35]
Negtive senti:[7, 18, 25, 33, 34, 36, 37]
