In [148]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd  
from apyori import apriori  

# Here we first load co-occurrence data which has been proprocessed to remove noise
cooccurrence_data = pd.read_csv('./data/preprocessed-cooccurrence.csv')
rownum = len(cooccurrence_data.iloc[:,0])
colnum = len(cooccurrence_data.columns)
print("The number of records in the data: "+ str(rownum))
print("The maximum items in one record: "+ str(colnum))
records = []  
for i in range(0, rownum):  
    records.append([str(cooccurrence_data.values[i,j]) for j in range(0, colnum)])

The number of records in the data: 7312
The maximum items in one record: 38


In [145]:
# Here we print out the first 5 records. in the table, non-data parts are filled with NaN
# we can see in record 1 and 2, there is only one stock symbols in a news article
# in record 3, there are 3 stocks occurr in the same news article
cooccurrence_data.head(10)  

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,etfc,,,,,,,,,,...,,,,,,,,,,
1,fb,,,,,,,,,,...,,,,,,,,,,
2,nflx,,,,,,,,,,...,,,,,,,,,,
3,mu,amat,lrcx,,,,,,,,...,,,,,,,,,,
4,msft,symc,flt,de,lyb,,,,,,...,,,,,,,,,,
5,googl,fb,,,,,,,,,...,,,,,,,,,,
6,pm,bti,bti,,,,,,,,...,,,,,,,,,,
7,twtr,,,,,,,,,,...,,,,,,,,,,
8,amzn,,,,,,,,,,...,,,,,,,,,,
9,aapl,,,,,,,,,,...,,,,,,,,,,


In [127]:
# Here we use apriori algorithm to retrieve association rules
association_rules = apriori(records, min_support=0.002, min_confidence=0.1, min_lift=2, min_length=2) 
# the rules are converted to a list for reading
association_results = list(association_rules)  

In [128]:
# Here we print out these rules
print(association_results)  

[RelationRecord(items=frozenset({'aaba', 'vz'}), support=0.002735229759299781, ordered_statistics=[OrderedStatistic(items_base=frozenset({'aaba'}), items_add=frozenset({'vz'}), confidence=0.5263157894736842, lift=31.54443485763589), OrderedStatistic(items_base=frozenset({'vz'}), items_add=frozenset({'aaba'}), confidence=0.16393442622950818, lift=31.544434857635892)]), RelationRecord(items=frozenset({'aal', 'dal'}), support=0.0030087527352297594, ordered_statistics=[OrderedStatistic(items_base=frozenset({'aal'}), items_add=frozenset({'dal'}), confidence=0.5945945945945946, lift=94.51468860164512), OrderedStatistic(items_base=frozenset({'dal'}), items_add=frozenset({'aal'}), confidence=0.47826086956521735, lift=94.51468860164512)]), RelationRecord(items=frozenset({'luv', 'aal'}), support=0.002324945295404814, ordered_statistics=[OrderedStatistic(items_base=frozenset({'aal'}), items_add=frozenset({'luv'}), confidence=0.4594594594594595, lift=101.8050778050778), OrderedStatistic(items_base

In [129]:
# Here we build a dictionary on association rules which are used for recommendation later
stocks_related = {}
for item in association_results:    
    pair = item[0] 
    items = [x for x in pair]
    if len(items) > 2:
        continue
    stockA = items[0]
    stockB = items[1]
    # get association rule
    print("Rule: " + stockA + " -> " + stockB)     
    # get support value
    support = item[1]
    print("Support: " + str(support))
    # get confidence value
    confidence = item[2][0][2]
    print("Confidence: " + str(confidence))
    # get lift value
    lift = item[2][0][3]
    print("Lift: " + str(lift))
    print("=====================================")
    
    # record for later recommendation
    if stockA not in stocks_related:
        stocks_related[stockA] = [] 
    stock_related = (stockB, support, confidence, lift)
    stocks_related[stockA].append(stock_related)
        

Rule: aaba -> vz
Support: 0.002735229759299781
Confidence: 0.5263157894736842
Lift: 31.54443485763589
Rule: aal -> dal
Support: 0.0030087527352297594
Confidence: 0.5945945945945946
Lift: 94.51468860164512
Rule: luv -> aal
Support: 0.002324945295404814
Confidence: 0.4594594594594595
Lift: 101.8050778050778
Rule: ual -> aal
Support: 0.0030087527352297594
Confidence: 0.5945945945945946
Lift: 114.41251778093884
Rule: avgo -> aapl
Support: 0.006291028446389497
Confidence: 0.3739837398373984
Lift: 2.476964769647697
Rule: aapl -> fb
Support: 0.03569474835886215
Confidence: 0.2364130434782609
Lift: 2.0827134625458354
Rule: fit -> aapl
Support: 0.0031455142231947486
Confidence: 0.6216216216216217
Lift: 4.117117117117118
Rule: goog -> aapl
Support: 0.022428884026258207
Confidence: 0.14855072463768118
Lift: 2.8067258360483844
Rule: googl -> aapl
Support: 0.028036105032822757
Confidence: 0.18568840579710144
Lift: 2.688621036016645
Rule: msft -> aapl
Support: 0.02735229759299781
Confidence: 0.18115

In [139]:
# Here we start to recommend
# We first input stock symbol of google (googl) and the system will return a list of stocks 
# with high cooccurrence in financial news
# You are free to modify to other stock symbol to test

stock_input = "googl"
print("Stock you have input: " + stock_input)
print("---------------------------------------------------")
count = 0
if stock_input in stocks_related:
    print("Here are a list of stocks you may be interested in:\n")
    found_stocks = stocks_related[stock_input]
    for found_stock in found_stocks:
        count += 1
        print("Stock-" + str(count) + ": " + found_stock[0])
        print("Support: " + str(support))
        print("Confidence: " + str(confidence))
        print("Lift: " + str(lift))
        print("=====================================")
else:
    print("Sorry, we can not make any recommendation based on your input")

Stock you have input: googl
---------------------------------------------------
Here are a list of stocks you may be interested in:

Stock-1: aapl
Support: 0.004102844638949671
Confidence: 0.5882352941176471
Lift: 35.255544840887175
Stock-2: amzn
Support: 0.004102844638949671
Confidence: 0.5882352941176471
Lift: 35.255544840887175
Stock-3: crm
Support: 0.004102844638949671
Confidence: 0.5882352941176471
Lift: 35.255544840887175
Stock-4: csco
Support: 0.004102844638949671
Confidence: 0.5882352941176471
Lift: 35.255544840887175
Stock-5: fb
Support: 0.004102844638949671
Confidence: 0.5882352941176471
Lift: 35.255544840887175
Stock-6: ibm
Support: 0.004102844638949671
Confidence: 0.5882352941176471
Lift: 35.255544840887175
Stock-7: msft
Support: 0.004102844638949671
Confidence: 0.5882352941176471
Lift: 35.255544840887175
Stock-8: nflx
Support: 0.004102844638949671
Confidence: 0.5882352941176471
Lift: 35.255544840887175
Stock-9: orcl
Support: 0.004102844638949671
Confidence: 0.5882352941176