In [1]:
from common import *
from given import *
import re

In [2]:
pd.set_option('display.max_colwidth', 115)

In [3]:
_N_ = pd.read_pickle(big_data/'trans/N10.pkl') #get_N()
_N_['headlineTagId'] = _N_.headlineTag.map(headlineTagIdAssign)

In [36]:
N = _N_small = _N_[('2015-01-01'<=_N_.time)&(_N_.time<'2015-04-01')]

In [10]:
from akamap import akamap
xAssetNames = set(assetNameIdAssign.series)
_flag = False
for x in akamap:
    if x not in xAssetNames:
        _flag = True
        print(x)
assert not _flag

# Helpers

In [12]:
ljust = lambda s: s.str.ljust(s.str.len().max())
def ljust_c(columns):
    if isinstance(columns, str):
        columns = [columns]
    def ljust_df(df):
        df = df.copy()
        for c in columns:
            df[c] = ljust(df[c])
        return df
    return ljust_df

In [13]:
def sample(k):
    def sample_k(df):
        n = len(df)
        samps = sorted(random.sample(range(n), k))
        return df.iloc[samps]
    return sample_k

# Notes and TODOs

In [14]:
'STCKa.O vs STCKb.O';

In [15]:
'''
stock matching algorithm:
    match whole
    loop take away one token from right and try match
    repeat above with symbols removed (add no space)
''';

In [16]:
'''
preprocessing pipeline:
    replace \ => [delete]
    replace ` => '
    convert numbers
    convert stock ticker symbols
    replace ... => #ellipsis
    replace . => [delete]
    replace -- => #dash
    replace [symbol not backslash] => [same symbol with spaces on either side]
    lowercase
    split
    remove stock endings words
    match stock name
        match canonical name in decreasing num words right to left
            also try get rid of symbol words inside decreasing word loop # also commas before Inc, Ltd
        repeat prev 2 lines with all aka names
            order of which choose aka names as follows:
                if aka maps to list, do canon name first, then the list in order
                if aka maps to one str, see if wordified the aka str is sublist of canon name,
                    if so, do canon name first, then aka name
                    else, do aka name first, then canon
    remove CORPORATION/INCORPORATED and the likes from after the stock mention #TODO
''';

# Preprocessing

In [17]:
def make_replacer(key_values):
    try:
        replace_dict = dict(key_values)
    except (TypeError, ValueError):
        replace_dict = {a: '' for a in key_values}
    replacement_function = lambda match: replace_dict[match.group(0)]
    pattern = re.compile("|".join([re.escape(k) for k, v in replace_dict.items()]), re.M)
    return lambda string: pattern.sub(replacement_function, string)

## real code

#### define some constants

In [20]:
corpFull = {
    'Incorporated': 'Inc',
    ',Incorporated': ',Inc',
    'Corporation': 'Corp',
    'Company': 'Co',
    'Limited': 'Ltd',
    ',Limited': ',Ltd',
    ',Limited Liability Company': 'LLC',
    ',Limited Liability Company': ',LLC',
    'Public Limited Company': 'PLC',
    ',Public Limited Company': ',PLC',
    'Limited Partnership': ['LP', 'L P'],
    ',Limited Partnership': [',LP', ', L P'],
    'Companies': 'Cos',
    'Societe anonyme': ['SA','S A'],
    'Naamloze vennootschap': ['NV', 'N V'],
    # AS A/S ASA full names don't show up in news
    'Aktiengesellschaft': 'AG',
}
corpAbbrMapFull = {}
for full, abbrs in corpFull.items():
    if isinstance(abbrs, str):
        abbrs = [abbrs]
    for abbr in abbrs:
        corpAbbrMapFull[abbr] = full
corpFull = list(corpFull.keys())
corpAbbr = 'Inc ,Inc Corp Co Ltd ,Ltd LLC ,LLC PLC ,PLC LP ,LP Cos SA ,SA NV ,NV ASA A/S AS PAO AG'.split()
corpAbbr += ['S A',', S A','L P','P L C','N V','SA de CV',',SA de CV']
corpSpec = 'co company cos companies'.split()

#### simple subs

In [21]:
if TEST:
    re_money = re.compile(r'(?<!\w)\$[0-9]+(,[0-9][0-9][0-9])*.?[0-9]*')
    re_money.search('hi :$5,666.00').group(0)
    re_year = re.compile(r'(?<![0-9])('+'|'.join(str(i) for i in range(1999, 2022))+')(?![0-9])')
    re_year.search('hi what 2009e  ').group()
    re_percent = re.compile(r'(?<!\w)[0-9]+(,[0-9][0-9][0-9])*\.?[0-9]*\%')
    re_percent.search('hi 66%').group()

In [22]:
class resub(metaclass=staticclass):
    @classmethod
    def replace(cls, text):
        return cls.re.sub(cls.put, text)

# stupid fucking re.sub interprets backslash+<char> as escape sequences IN THE REPLACEMENT STRING!
# like, bitch, I give you a string when I could have equally well given you a function, I want you to take it as it fck'ng is
    
class HPL(metaclass=staticclass): # headline @pipeline #TODO test the regexes
    class Money(resub):
        re = re.compile(r'[\$\¤\¥\£\€]\ ?[0-9]+(\,[0-9][0-9][0-9])*\.?[0-9]*')
        put = r'\\money'+'\\'*4
    class Percent(resub): # a percent number - if connected to words don't include
        re = re.compile(r'(?<!\w)[0-9]+(\,[0-9][0-9][0-9])*\.?[0-9]*\%')
        put = r'\\decimal'+'\\'*4
    class Year(resub): # any digits that look like dates will be replaced
        re = re.compile(r'(?<![0-9])('+'|'.join(str(i) for i in range(2008, 2020))+'|'
                        +'|'.join(fr"\'{i:02}" for i in range(8,20))+')(?![0-9])')
        put = r'\\year'+'\\'*4
    class Decimal(resub): # a number with decimals - if connected to words don't include
        re = re.compile(r'(?<!\w)[0-9]+(,[0-9][0-9][0-9])*\.[0-9]+(?!\w)')
        put = r'\\decimal'+'\\'*4
    class Count(resub): # a natural number - if connected to words don't include
        re = re.compile(r'(?<!\w)[0-9]+(,[0-9][0-9][0-9])*(?!\w)')
        def put(match):
            guy = match.group().replace(',', '')
            sz = str(len(guy) if guy[0] not in '01' else len(guy)-1)
            return '\\count\\' + sz + r'\\'
        
    class Misc(resub): # add spaces around symbols that are not backslash
        re = re.compile(r'[^\w\.\s\\]')
        put = lambda match: ' ' + match.group() + ' '
    
    # post wordify
    
    class Quarter(resub):
        re = re.compile(r'\ '+
            r'(q1|q2|q3|q4|(first|1st|second|2nd|third|3rd|fourth|4th)\ \ quarter|quarter\ \ \\count\\(0|1)\.)'+
        r'\ ')
        put = r' \\quarter\\labelled. '
    class Month(resub):
        re = re.compile(r'\ (january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul|august|aug|'+
                        r'september|sep|sept|october|oct|november|nov|december|dec)\ ')
        put = r' \\month. '
    
# put into HPL context: HPL.self (an assetNameId) and HPL.assets (container of assetNameIds)
    
@inside(HPL)
class Ticker(resub):
    re_lowercase = re.compile(r'[a-z]')
    def norm(ticker):
        ticker = Ticker.re_lowercase.sub('', ticker)
        return ticker[:ticker.index('.')] if '.' in ticker else ticker
    
    re = re.compile(r'<[A-Z]+[a-z]*.?[A-Z]*>')
    
    def put(match):
        try:
            codes = assetNameMapCodes[HPL.self]
        except KeyError:
            return r' \ticker\\ '
        normed = Ticker.norm(match.group()[1:-1]) # match.group() is of form <AAPL.O>
        if any(Ticker.norm(x)==normed for x in assetCodeIdAssign.series[list(codes)]):
            return r' \ticker\self\\ '
        else:
            return r' \ticker\\ '

In [23]:
@inside(HPL)
def listify(text):
    text = text.replace('\\','').replace('`',"'")
    text = compose(*[getattr(getattr(HPL,a),'replace') for a in ['Year','Money','Percent','Decimal','Count']])(text)
    text = HPL.Ticker.replace(text)
    text = text.replace('...',r' \ellipsis\\ ').replace('.','').replace('\\\\','.').replace('--','—')
    text = HPL.Misc.replace(text).lower()
    return text.split()

@inside(HPL)
def wordify(text):
    return ' '+'  '.join(HPL.listify(text))+' '

@inside(HPL)
def postproc(text):
    return compose(*[getattr(getattr(HPL,a),'replace') for a in ['Quarter','Month']])(text)

In [24]:
# store the wordified things inside one place

@inside(HPL, name='words')
class words(O()):
    cAbbr = list(map(HPL.wordify, corpAbbr))
    cFull = list(map(HPL.wordify, corpFull))
    cSpec = list(map(HPL.wordify, corpSpec))

_replace_corpwords = make_replacer(HPL.words.cAbbr)
_rp = lambda x: _replace_corpwords(HPL.wordify(x))
HPL.words.assetNameSeries = assetNameIdAssign.series.map(_rp)
HPL.words.akamap = {x: (list([_rp(y)]) if isinstance(y, str) else list(map(_rp, y))) for x,y in dict.items(akamap)}


# store the regex of wordified things in one place

@memoized
def _make_stock_ending():
    maybe = '('+'|'.join(re.escape(' '+a+' ') for a in HPL.words.cSpec)+')?'
    corps = '('+'|'.join(re.escape(a) for a in HPL.words.cAbbr+HPL.words.cFull)+')?'
    return maybe+corps

def make_regex_for_stock(words): #TODO save time by considering only relevant stock endings
    lst = words.split()
    def rec_make_sub_chain(lst):
        return re.escape(' '+lst[0]+' ') if len(lst)==1 else re.escape(' '+lst[0]+' ')+'('+rec_make_sub_chain(lst[1:])+')?'
    return re.compile(rec_make_sub_chain(lst)+_make_stock_ending())


@inside(HPL, name='regex')
class regex(O()):
    assetNameSeries = HPL.words.assetNameSeries.map(make_regex_for_stock)
    akamap = {x: list(map(make_regex_for_stock, y)) for x,y in dict.items(HPL.words.akamap)}

In [25]:
# @inside(HPL) 
# def match_stocks(words):
#     for asset in HPL.assets:
#         put = r'\asset\self.' if asset==HPL.self else r'\asset.'
#         assetWords = HPL.words.assetNameSeries.loc[asset]
#         for i in range(len(assetWords), 0, -1):
#             useWords = assetWords[:i]
#             k = 0
#             while k+i <= len(words):
#                 if words[k:k+i] == useWords:
#                     words[k:k+i] = [put]
#                 k += 1
#     return words

@inside(HPL)
def match_stocks(words):
    for asset in [HPL.self] + [a for a in HPL.assets if a!=HPL.self]:
        put = r' \\asset\\self. ' if asset==HPL.self else r' \\asset. '
        
        assetCanon = assetNameIdAssign.series.loc[asset]
        assetWords, assetRegex = HPL.words.assetNameSeries.loc[asset], HPL.regex.assetNameSeries.loc[asset]
        akaWords, akaRegex = HPL.words.akamap.get(assetCanon, []), HPL.regex.akamap.get(assetCanon, [])
        if len(akaWords)==0:
            assetRegexList = [assetRegex]
        elif len(akaWords)==1 and next(iter(akaWords)) not in assetWords:
            assetRegexList = [next(iter(akaRegex)), assetRegex]
        else:
            assetRegexList = [assetRegex] + list(akaRegex)
        
        for assetRegex in assetRegexList:
            words = assetRegex.sub(put, words)
    return words

In [26]:
def transform_headlines(*, N):
    ret = [None] * len(N)
    # idk probably save time looking up N's hash table:
    sourceIds, assetNameIds, inHeadline = N.sourceId.values, N.assetNameId.values, (N.firstMentionSentence==1).values
    hTags, hTagIds = N.headlineTag.values, N.headlineTagId.values
    new_i = 0 # this tracks grouping of sourceIds
    # basically, I see that same sourceIds are always consecutive. so every time I see new news item I walk down table
    # to see where the same news item ends, and collect all the assetNameIds along the way and keep them for use
    for i, text in enumerate(N.headline.values):
        if i == new_i: # starting a new sourceId group, have to look ahead to see where it ends and gather assetNameIds
            HPL.assets = set()
            while new_i<len(sourceIds) and sourceIds[new_i]==sourceIds[i]:
                if inHeadline[new_i]:
                    HPL.assets.add(assetNameIds[new_i])
                new_i += 1
        HPL.self = assetNameIds[i]
        
        tag = hTags[i]
        text = text.strip()
        if tag and text[:len(tag)]==tag:
            _has_tag = True
            text = text[len(tag):].lstrip()
            if text[0] in '-–—:':
                text = text[1:]
                text = text.lstrip()
        else:
            _has_tag = False
            
        ret[i] = compose(HPL.wordify,HPL.match_stocks,HPL.postproc)(text)
        
        if _has_tag:
#             ret[i] = [fr'\htag\{hTagIds[i]}.', r'\htag\conn.'] + ret[i]
            ret[i] = fr' \htag\{hTagIds[i]}.  \htag\conn. ' + ret[i]
        
    N['headlineWords'] = ret

#### go do it

In [37]:
%%time
N = _N_small.pipe(lambda N: N[N.assetNameId.isin(assetNameMapCodes)])
transform_headlines(N=N)

CPU times: user 3min 5s, sys: 1.18 s, total: 3min 6s
Wall time: 3min 6s


In [50]:
_N_.to_pickle(big_data/'trans/N1010.pkl')

#### let's see the results

In [38]:
seeN = N.pipe(sample(10))

In [39]:
seeN[['assetName','assetCodes','headlineTag','headline']]

Unnamed: 0,assetName,assetCodes,headlineTag,headline
7300577,Covidien Ltd,"{'COV.BE', 'COV.N'}",,REG - Arrowgrass Capital Covidien plc - ISE Only - Covidien PLC <COV.N>
7309262,Celgene Corp,"{'CELG.O', 'CELG.OQ', 'CGNC.DE'}",,"Hogging the Limelight: New Research on Netlist, Celgene, Jamba, Citigroup and Tilly's"
7314338,Bank of America Corp,"{'BAC', 'BAC.N'}",UPDATE 2,UPDATE 2-Citigroup profit falls on legal and restructuring charges
7362110,DexCom Inc,"{'DXCM.OQ', 'DXCM.O'}",,DexCom to Present at 4th Annual Leerink Partners Global Healthcare Conference <DXCM.O>
7368600,Xencor Inc,"{'XNCR.OQ', 'XNCR.O'}",U.S. RESEARCH ROUNDUP,"U.S. RESEARCH ROUNDUP- Alibaba, Google, Amazon"
7509370,GlaxoSmithKline PLC,"{'GSK.L', 'GSK.F', 'GSK.DE', 'GSKy.DE', 'GSK.N'}",,SANOFI <SASY.PA>: CREDIT SUISSE RAISES PRICE TARGET TO 104 EUROS FROM 94 EUROS; RATING OUTPERFORM
7513857,Interoil Corp,{'IOC.N'},,INTEROIL ANNOUNCES POSITIVE 2014 RESULTS
7514585,Macerich Co,{'MAC.N'},,MACERICH BOARD ANNOUNCES GOVERNANCE CHANGES TO PROTECT STOCKHOLDER VALUE
7518041,Pfizer Inc,"{'PFE.N', 'PFE.DE', 'PFE.F'}",,"NIMBUS THERAPEUTICS SAYS COMPANY'S SERIES A INVESTORS, INCLUDING ATLAS VENTURE, SR ONE, LILLY VENTURES AND BILL..."
7548929,Auspex Pharmaceuticals Inc,{'ASPX.O'},,Reuters Insider - Stocks climb more than one percent


In [40]:
seeN.headlineWords

7300577                              reg  -  arrowgrass  capital  \asset\self.  -  ise  only  -  \asset\self.  \ticker\self. 
7309262      hogging  the  limelight  :  new  research  on  netlist  ,  \asset\self.  ,  jamba  ,  \asset.  and  tilly  '  s 
7314338                               \htag\10.  \htag\conn.  \asset.  profit  falls  on  legal  and  restructuring  charges 
7362110         \asset\self.  to  present  at  4th  annual  leerink  partners  global  healthcare  conference  \ticker\self. 
7368600                                                              \htag\151.  \htag\conn.  \asset.  ,  google  ,  \asset. 
7509370     sanofi  \ticker.  :  credit  suisse  raises  price  target  to  \count\2.  euros  from  \count\2.  euros  ;  r...
7513857                                                                   \asset\self.  announces  positive  \year.  results 
7514585                                 \asset\self.  board  announces  governance  changes  to  protect  stockholder 

#### see more crap

In [377]:
N.headlineTag.value_counts()

                          39501
BRIEF                      4013
UPDATE 1                   3602
US STOCKS                  2160
UPDATE 2                   1964
US RESEARCH SUMMARY        1600
RPT                        1266
STOCKS NEWS US             1054
UPDATE 3                   1035
FACTBOX                     645
ANALYSIS                    626
HEADLINE STOCKS             602
PRESS DIGEST                539
TEXT                        535
SNAPSHOT                    524
CORRECTED                   480
UPDATE 4                    439
CANADA STOCKS               390
US STOCKS SNAPSHOT          378
WRAPUP 1                    322
TAKE A LOOK                 307
INTERVIEW                   305
WRAPUP 2                    272
Japan Hot Stocks            263
REFILE                      243
WRAPUP 3                    207
PREVIEW                     201
ADR Report                  161
STOCKS NEWS EUROPE          159
UPDATE 5                    134
DEALTALK                    127
Swiss st

In [395]:
theN = _N_
for tag, head in zip(theN.headlineTag, theN.headline):
    if tag and head.strip()[:len(tag)+1] not in [tag+'-',tag+':'] and head.strip()[:len(tag)+2] != tag+' -':
        print(f'TAG{{{tag}}} HEAD{{{head}}}')
        print(head.strip()[:len(tag)+2], '!=', tag+' -')

TAG{NYMEX} HEAD{NYMEX <NMX.N>- crude oil futures volume record}
NYMEX < != NYMEX -
TAG{NYMEX} HEAD{NYMEX <NMX.N> -- To buy stake in Optionable}
NYMEX < != NYMEX -
TAG{New Issue} HEAD{New Issue  -American General Finance sells $600 mln}
New Issue   != New Issue -
TAG{UPDATE} HEAD{UPDATE – UPCOMING DEADLINE: LEVI & KORSINSKY, LLP Notifies Investors of Class Action Against Alcobra Ltd. and Its Board of Directors and a Lead Plaintiff Deadline of January 20, 2015 -- ADHD <ADHD.O>}
UPDATE – != UPDATE -
TAG{UPDATE} HEAD{UPDATE – UPCOMING DEADLINE: LEVI & KORSINSKY, LLP Notifies Investors of Class Action Against Vivint Solar, Inc. and Its Board of Directors and a Lead Plaintiff Deadline of January 20, 2015 <VSLR.N>}
UPDATE – != UPDATE -
TAG{UPDATE} HEAD{UPDATE – UPCOMING DEADLINE: LEVI & KORSINSKY, LLP Notifies Investors of Class Action Against FireEye, Inc. and Its Board of Directors and a Lead Plaintiff Deadline of January 26, 2015 <FEYE.O>}
UPDATE – != UPDATE -
TAG{UPDATE} HEAD{UPDATE – DEA

In [403]:
len(N)

65509

In [402]:
((_N_.firstMentionSentence==1) & (_N_.assetNameId.isin(assetNameMapCodes))).sum(), _N_.shape[0]

(3872830, 9328750)

In [404]:
(3872830/65509)*40 / 60

39.41270156263516

In [271]:
# the row index is 1547548
row

Unnamed: 0,time,sourceTimestamp,firstCreated,sourceId,headline,urgency,takeSequence,provider,subjects,audiences,bodySize,companyCount,headlineTag,marketCommentary,sentenceCount,wordCount,assetCodes,assetName,firstMentionSentence,relevance,sentimentClass,sentimentNegative,sentimentNeutral,sentimentPositive,sentimentWordCount,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D,assetNameId,assetNamesCount,firstMentionFixed,groupbyTime
1547548,2009-01-15 18:43:11+00:00,2009-01-15 18:43:11+00:00,2009-01-15 18:30:09+00:00,70570c369ddd5b1a,"GENERAL MOTORS <GM.N> SAYS ITS U.S. HOURLY MANUFACTURING COSTS HAVE DECLINED TO AN ESTIMATED $8 BLN IN 2008, ES...",1,16,RTRS,"{'MRG', 'AUTO', 'MTG', 'CYCS', 'DBT', 'BACT', 'CDM', 'JOB', 'US', 'USC', 'AUT', 'RTRS', 'CARM', 'DEAL', 'DRV', ...","{'PCO', 'T', 'PCU', 'U', 'D', 'M', 'RNP', 'E', 'NAT'}",0,1,,False,1,27,"{'GM.N', 'GRM.N', 'GMS.N', 'GPM.N'}",Motors Liquidation Co,1,1.0,-1,0.811499,0.12971,0.058791,27,0,0,0,0,0,33,36,137,312,364,7203,1,1,2009-01-15 22:00:00+00:00


#### see which companies haven't found match

In [104]:
N = _N_

In [52]:
N['headlineList'] = N.headlineWords.map(str.split)
N['inHeadline'] = N.headlineList.map(lambda x: r'\asset\self.' in x)

CPU times: user 25.7 s, sys: 3.11 s, total: 28.8 s
Wall time: 28.7 s


In [105]:
Nnomatch = N.pipe(lambda N: N[(N.firstMentionSentence==1) & ~N.inHeadline & N.assetNameId.isin(assetNameMapCodes)])

In [107]:
nomatch = Nnomatch.assetNameId.value_counts()
nomatch.index.name, nomatch.name = nomatch.name, 'Count'

In [233]:
nomatch.reset_index(drop=True)#.plot();

0       15336
1        9985
2        9964
3        9878
4        9344
5        9164
6        8224
7        7780
8        7593
9        7467
10       7369
11       6457
12       6091
13       6079
14       5730
15       5679
16       5582
17       5064
18       4818
19       4651
20       4384
21       4012
22       3859
23       3770
24       3769
25       3494
26       3275
27       3229
28       3164
29       3039
30       2952
31       2930
32       2921
33       2867
34       2863
35       2833
36       2753
37       2696
38       2669
39       2560
40       2473
41       2461
42       2412
43       2270
44       2255
45       2204
46       2151
47       2007
48       1993
49       1983
50       1944
51       1940
52       1885
53       1875
54       1875
55       1821
56       1781
57       1779
58       1683
59       1618
60       1599
61       1593
62       1548
63       1490
64       1488
65       1460
66       1455
67       1446
68       1424
69       1385
70       1372
71    

In [357]:
(Nnomatch, nomatch, lookup) = pd.read_pickle('stock-help.pkl')

In [390]:
nomatch_i = 99
asset = nomatch.index[nomatch_i]
seeN = Nnomatch.pipe(lambda N: N[N.assetNameId==asset])
print(assetNameIdAssign.series.loc[asset])
seeN.headline.pipe(ljust)

Flex Ltd


53903      Next Inning Technology Previews Earnings for SiRF Technology, Flextronics International,... <SLAB.O><FLEX.O><HR...
54659      FLEXTRONICS INTERNATIONAL LTD.                                                                                 ...
54661      FLEXTRONICS INTERNATIONAL LTD <FLEX.O> Q3 NON-GAAP SHR $0.20                                                   ...
54668      FLEXTRONICS INTERNATIONAL LTD <FLEX.O> Q3 SHR $0.23 EXCLUDING ITEMS                                            ...
54675      Flextronics Announces Third Quarter Record Results <FLEX.O>                                                    ...
54679      FLEXTRONICS INTERNATIONAL LTD <FLEX.O> REUTERS ESTIMATES Q3 SHR VIEW $0.22                                     ...
54693      RPT-FLEXTRONICS INTERNATIONAL LTD <FLEX.O> Q3 SHR $0.23 EXCLUDING ITEMS                                        ...
54705      FLEXTRONICS INTERNATIONAL LTD <FLEX.O> SEES Q4 REVENUE AT ABOUT $4.8 BILLION                               

In [377]:
what = 24536
_N_.loc[what-1:what+1+1]

Unnamed: 0,time,sourceTimestamp,firstCreated,sourceId,headline,urgency,takeSequence,provider,subjects,audiences,bodySize,companyCount,headlineTag,marketCommentary,sentenceCount,wordCount,assetCodes,assetName,firstMentionSentence,relevance,sentimentClass,sentimentNegative,sentimentNeutral,sentimentPositive,sentimentWordCount,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D,assetNameId,assetNamesCount,firstMentionFixed,groupbyTime,headlineTagId
24535,2007-01-16 22:31:18+00:00,2007-01-16 22:31:18+00:00,2007-01-16 22:30:57+00:00,7d9666055def0ccf,ONEOK PARTNERS LP <OKS.N> SAYS HAS INCREASED ITS QUARTERLY CASH DISTRIBUTION TO 98 CENTS PER UNIT,1,2,RTRS,"{'US', 'ELG', 'NGS', 'RTRS', 'LEN'}","{'ELN', 'O', 'E', 'U'}",0,2,,False,1,16,{'OKS.N'},ONEOK Partners LP,1,1.0,0,0.202568,0.600913,0.196519,16,2,2,2,2,2,2,2,2,2,2,2057,2,1,2007-02-01 22:00:00+00:00,-1
24536,2007-01-16 22:31:18+00:00,2007-01-16 22:31:18+00:00,2007-01-16 22:30:57+00:00,7d9666055def0ccf,ONEOK PARTNERS LP <OKS.N> SAYS HAS INCREASED ITS QUARTERLY CASH DISTRIBUTION TO 98 CENTS PER UNIT,1,2,RTRS,"{'US', 'ELG', 'NGS', 'RTRS', 'LEN'}","{'ELN', 'O', 'E', 'U'}",0,2,,False,1,16,{'OKE.N'},ONEOK Inc,1,1.0,0,0.202568,0.600913,0.196519,16,2,2,2,2,2,2,2,2,2,2,935,2,1,2007-02-01 22:00:00+00:00,-1
24537,2007-01-16 22:31:34+00:00,2007-01-16 22:31:34+00:00,2007-01-16 22:30:20+00:00,17b3ae118093971f,RENASANT CORP <RNST.O> REUTERS ESTIMATES Q4 SHR VIEW $0.43,1,4,RTRS,"{'FIN', 'RESF', 'US', 'RES', 'BNK', 'RTRS', 'LEN'}","{'E', 'U'}",0,1,,False,1,10,"{'RNST.OQ', 'RNST.O'}",Renasant Corp,1,1.0,0,0.016425,0.806599,0.176976,10,2,2,2,2,2,3,3,3,3,3,3452,1,1,2007-02-01 22:00:00+00:00,-1
24538,2007-01-16 22:31:58+00:00,2007-01-16 22:31:58+00:00,2007-01-16 22:31:58+00:00,5c23311c475412af,Spherion Updates Fourth Quarter 2006 Earnings Guidance and Schedules Conference Call <SFN.N>,3,1,PRN,"{'NEWR', 'RESF', 'BACT', 'RES', 'DPR', 'LEN'}",{'PRN'},7331,1,,False,45,1147,{'SFN.N'},SFN Group LLC,1,1.0,1,0.208569,0.328809,0.462622,700,0,0,0,0,0,0,0,0,0,0,8241,1,1,2007-02-01 22:00:00+00:00,-1


In [381]:
oks, oke = M[M.assetCode=='OKS.N'], M[M.assetCode=='OKE.N']

In [387]:
oks.set_index('time').returnsOpenNextMktres10.corr(oke.set_index('time').returnsOpenNextMktres10)

0.6580509598009644

## just looking

In [183]:
for name in _N_.assetName.unique():
    print(name)

PetroChina Co Ltd
Travelers Companies Inc
Wal-Mart Stores Inc
Google Inc
XM Satellite Radio Holdings Inc
Sirius XM Radio Inc
Walt Disney Co
Microsoft Corp
Yahoo! Inc
Tenet Healthcare Corp
Siebert Financial Corp
Monsanto Co
Constellation Brands Inc
Rite Aid Corp
Cyren Ltd
Qualcomm Inc
Siemens AG
Internet Gold Golden Lines Ltd
Goldman Sachs Group Inc
Merrill Lynch & Co Inc
Citigroup Inc
General Electric Co
Time Warner Inc
Twenty-First Century Fox Inc
Viacom Inc
Sony Corp
BHP Billiton Ltd
Ryanair Holdings PLC
Fomento Economico Mexicano SAB de CV
Sanofi SA
AU Optronics Corp
Taiwan Semiconductor Manufacturing Co Ltd
Royal Bank of Scotland Group PLC
Hutchison Telecommunications International Ltd
Indosat Tbk PT
Korea Electric Power Corp
Nasdaq Inc
Kookmin Bank
Woori Finance Holdings Co Ltd
Honda Motor Co Ltd
LG Display Co Ltd
Apple Inc
Adobe Systems Inc
China Mobile Ltd
HSBC Holdings PLC
Willis Towers Watson PLC
Banco Santander SA
Daimler AG
Titanium Metals Corp
Companhia Siderurgica Nacional

Tellurian Inc
Del Monte Foods Co
Borders Group Inc
Allergan Finance LLC
Medifast Inc
KP Pharmaceuticals LLC
Jack in the Box Inc
Cigna Corp
Kelly Services Inc
SFN Group LLC
Hope Bancorp Inc
McEwen Mining Inc
SQBG Inc
FEI Co
Endo Health Solutions Inc
Hungarian Telephone And Cable Corp
Newmont Mining Corp
Here Holding Corp
Mair Holdings Inc
NiSource Inc
Astora Women's Health Holdings LLC
Mountain Province Diamonds Inc
Oil States International Inc
Lifetime Brands Inc
Schmitt Industries Inc
Fleetwood Enterprises Inc
A. O. Smith Corp
Western Union Co
Imperial Oil Ltd
Pepco Holdings LLC
Ingredion Inc
Unisys Corp
SAP AG
BEA Systems Inc
Sonde Resources Corp
Occidental Petroleum Corp
Buca Inc
Abaxis Inc
VF Corp
Grupo Aeroportuario del Pacifico SAB de CV
Colonial Properties Trust
Mentor Graphics Corp
LSI Industries Inc
Rexnord-Zurn Holdings Inc
Microsemi Corp Memory and Storage Solutions
Taseko Mines Ltd
Associated Banc-Corp
Potlatch Corp
Compania Cervecerias Unidas SA
Polymet Mining Corp
Central

Butler National Corp
Hemagen Diagnostics Inc
Gungnir Resources Inc
Avigen Inc
Electro Rent Corp
MGT Capital Investments Inc
Ultrapetrol (Bahamas) Ltd
NII Holdings Inc
C&F Financial Corp
Edwards Lifesciences Corp
Donaldson Company Inc
Veramark Technologies Inc
SRC Liquidation LLC
DNP Select Income Fund Inc
American Medical Alert Corp
Rand Capital Corp
Minrad International Inc
Glen Rose Petroleum Corp
Delcath Systems Inc
Cleco Corporate Holdings LLC
Genlyte Group Inc
Alliance HealthCare Services Inc
Yadkin Financial Corp
LL&E Royalty Trust
Tortoise Energy Infrastructure Corp
Tortoise North American Energy Corp
Camco Financial Corp
Quality Distribution Inc
Britton & Koontz Capital Corp
Zilog Inc
Champion Enterprises Inc
Sturm Ruger & Company Inc
TIB Financial Corp
Datawatch Corp
Integrated Silicon Solution Inc
MTM Technologies Inc
Magyar Bancorp Inc
Farmers National Bancorp
Smart & Final LLC
North Central Bancshares Inc
Catalina Marketing Corp
Eagle Rock Energy Partners LP
Devcon Internat

Northwest Airlines Corp
Clean Energy Fuels Corp
LDK Solar Co Ltd
Fuwei Films Holdings Co Ltd
PimCo Municipal Advantage Fund InCorp
PIMCO Income Strategy Fund II
PIMCO Income Strategy Fund
Allianzgi Convertible & Income Fund
Allianzgi Convertible & Income Fund II
PIMCO New York Municipal Income Fund II
PIMCO California Municipal Income Fund II
PIMCO Municipal Income Fund III
PIMCO California Municipal Income Fund Iii
Pimco New York Municipal Income Fund III
PIMCO Municipal Income Fund
PIMCO California Municipal Income Fund
PIMCO New York Municipal Income Fund
Nuveen Core Equity Alpha Fund
Nuveen Diversified Currency Opportunities Fund
Alpine Global Dynamic Dividend Fund
Gpc Biotech AG
TomoTherapy Inc
Neuro-Hitech Inc
Response Genetics Inc
Starent Networks LLC
WaferGen Bio-Systems Inc
Towerstream Corp
DGT Holdings Corp
Florida Gaming Corp
Infinera Corp
Deutsche Global High Income Fund Inc
FBR & Co
Yingli Green Energy Holding Co Ltd
Limelight Networks Inc
Boss Holdings Inc
Chang-On Intern

Palo Alto Networks Inc
Chuy's Holdings Inc
Natural Grocers By Vitamin Cottage Inc
Northern Tier Energy LP
E2open LLC
Horizon Therapeutics Inc
Del Frisco's Restaurant Group Inc
Globus Medical Inc
Howard Bancorp Inc
Eloqua Inc
Xylem Inc
Bloomin' Brands Inc
Peregrine Semiconductor Corp
Tronox Ltd
Manchester United PLC
Performant Financial Corp
Engility Holdings Inc
Hi Crush Partners LP
CU Bancorp
Sandstorm Gold Ltd
IF Bancorp Inc
Seven Arts Entertainment Inc
Trulia LLC
National Bank Holdings Corp
Capital Bank Financial Corp
Emerald Oil Inc
Corts Trust Weyerhauser Co
Pareteum Corp
Qualys Inc
Summit Midstream Partners LP
ADT Corp
Kraft Foods Group Inc
LifeLock Inc
JAVELIN Mortgage Investment Corp
Tile Shop Holdings Inc
W. P. Carey Inc
Berry Global Group Inc
Regulus Therapeutics Inc
Fleetmatics Group Ltd
Blackrock Municipal 2030 Target Term Trust
Ryman Hospitality Properties Inc
Amira Nature Foods Ltd
Realogy Holdings Corp
RLJ Entertainment Inc
Shutterstock Inc
Kythera Biopharmaceuticals Inc

In [297]:
_N_.pipe(lambda N: N.headline[N.assetNameId==7760].pipe(ljust))

20772     Asia Netcom in $636 mln trans-Pacific cable project                                                            ...
91927     PRESS DIGEST - Singapore newspapers - Feb 14                                                                   ...
290017    Singapore Hot Stocks-STATS ChipPAC in focus amid bid speculation                                               ...
321336    Pacific Internet to restate results from 2000 to 2006                                                          ...
322182    CONNECT HOLDINGS SAYS TO DATE, THERE ARE NO ALTERNATIVE OFFERS OR CONCRETE PROPOSALS AVAILABLE TO ALL SHAREHOLD...
322183    CONNECT HOLDINGS SAYS REVISED OFFER PRICE FOR PACIFIC INTERNET <PCNTF.O> REPRESENTS AN INCREASE OF 10 PCT FROM ...
324658    PRESS DIGEST - Singapore newspapers - June 8                                                                   ...
474706    PACIFIC INTERNET LTD <PCNTF.O> SAYS CEO PHEY TECK MOH TO STEP DOWN - SEC FILING                                ...


# Explore

In [94]:
/pd.DataFrame sorted(set(zip(M.assetCode,M.assetName)))

Unnamed: 0,0,1
0,A.N,Agilent Technologies Inc
1,AA.N,Alcoa Corp
2,AAI.N,AirTran Holdings Inc
3,AAL.O,American Airlines Group Inc
4,AAMRQ.OB,Unknown
5,AAN.N,Aaron's Inc
6,AAON.O,Aaon Inc
7,AAP.N,Advance Auto Parts Inc
8,AAPL.O,Apple Inc
9,AAT.N,American Assets Trust Inc


In [119]:
N[N.assetNamesCount==2].headline.pipe(ljust)

1521196    SNAPSHOT - Financial Crisis - 0045 GMT                                                                         ...
1521197    SNAPSHOT - Financial Crisis - 0045 GMT                                                                         ...
1521199    UPDATE 2-GM gets $4 bln rescue loan, Chrysler on hold                                                          ...
1521200    UPDATE 2-GM gets $4 bln rescue loan, Chrysler on hold                                                          ...
1521202    RPT-India's TCS sees back office deals                                                                         ...
1521203    RPT-India's TCS sees back office deals                                                                         ...
1521204    IBM in sales alliance with Japan's Ricoh - paper                                                               ...
1521205    IBM in sales alliance with Japan's Ricoh - paper                                                           

In [150]:
N.loc[[1527026,1527027]]

Unnamed: 0,time,sourceTimestamp,firstCreated,sourceId,headline,urgency,takeSequence,provider,subjects,audiences,bodySize,companyCount,headlineTag,marketCommentary,sentenceCount,wordCount,assetCodes,assetName,firstMentionSentence,relevance,sentimentClass,sentimentNegative,sentimentNeutral,sentimentPositive,sentimentWordCount,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D,assetNameId,assetNamesCount,firstMentionFixed,groupbyTime
1527026,2009-01-07 04:10:50+00:00,2009-01-07 04:10:50+00:00,2009-01-07 04:10:50+00:00,06a6b51052ef9c67,UPDATE 2-Bank of America sells $2.8 bln CCB stake,3,1,RTRS,"{'MRG', 'DE', 'ISU', 'FINS', 'INVS', 'SG', 'COFS', 'RTRS', 'MEVN', 'EMRG', 'BUS', 'BNK', 'FIN', 'FUND', 'EUROPE...","{'Z', 'UKI', 'T', 'DNP', 'PSC', 'U', 'D', 'M', 'EMK', 'RNP', 'NAW', 'PTD', 'E'}",3737,4,UPDATE 2,False,27,709,"{'UBS.N', 'UBSN.DE', 'UBSN.VX', 'UBSN.F'}",UBS AG,15,0.09759,1,0.11289,0.15927,0.72784,97,0,0,0,0,0,13,23,32,36,56,8948,2,15,2009-01-07 22:00:00+00:00
1527027,2009-01-07 04:10:50+00:00,2009-01-07 04:10:50+00:00,2009-01-07 04:10:50+00:00,06a6b51052ef9c67,UPDATE 2-Bank of America sells $2.8 bln CCB stake,3,1,RTRS,"{'MRG', 'DE', 'ISU', 'FINS', 'INVS', 'SG', 'COFS', 'RTRS', 'MEVN', 'EMRG', 'BUS', 'BNK', 'FIN', 'FUND', 'EUROPE...","{'Z', 'UKI', 'T', 'DNP', 'PSC', 'U', 'D', 'M', 'EMK', 'RNP', 'NAW', 'PTD', 'E'}",3737,4,UPDATE 2,False,27,709,"{'IKM.N', 'IKL.N', 'BAC.N', 'IKJ.N', 'IKR.N'}",Bank of America Corp,1,1.0,-1,0.722047,0.184978,0.092975,581,4,4,4,4,4,18,19,50,62,100,141,2,1,2009-01-07 22:00:00+00:00


##### loose stuff

In [21]:
re.findall(r'<[A-Z]+[a-z]*.?[A-Z]*>', '<AAPL.O> hi <WHAa.N>')

['<AAPL.O>', '<WHAa.N>']

In [85]:
def iter_tickers(N):
    re_ticker = re.compile(r'<[A-Z]*[a-z]*.?[A-Z]*>')
    for head in N.headline:
        yield from re_ticker.findall(head)
tickers = set(t[1:-1] for t in iter_tickers(_N_))

In [29]:
codes = {(a[:a.index('.')] if '.' in a else a): a for a in assetCodeIdAssign.series}

In [81]:
codes['USE']

KeyError: 'USE'

In [84]:
for code in assetCodeIdAssign.series:
    if any('a'<=x<='z' for x in code):
        print(code)

BFb.N
CMGb.N
HUBb.N
MWAb.N
PBRa.N
RDSa.N
RDSb.N
TRYb.N
VIAb.N
TIa.N
RGAa.N
RGAb.N
EBRb.N
JWa.N
MOGa.N
BRKb.N
PRISb.N
VALEp.N
FCEa.N
UAc.N
LGFb.N


In [None]:
corpFullWords = {
    'Incorporated': 'Inc',
    'Corporation': 'Corp',
    'Company': 'Co',
    'Limited': 'Ltd',
    'Limited Liability Company': 'LLC',
    'Public Limited Company': 'PLC',
    'Limited Partnership': ['LP', 'L P'],
    'Companies': 'Cos',
    'Societe anonyme': ['SA','S A'],
    'Naamloze vennootschap': ['NV', 'N V'],
    # AS A/S ASA full names don't show up in news
    'Aktiengesellschaft': 'AG',
}
corpWords = set('Inc ,Inc Corp Co Ltd ,Ltd LLC ,LLC PLC ,PLC LP ,LP Cos SA ,SA NV ,NV ASA A/S AS PAO AG'.split())
corpWords |= set(['S A',', S A','L P','P L C','N V','SA de CV',',SA de CV'])
corpSpecial = 'set(Co Company Cos Companies'.split())

In [366]:
pd.Series(list(filter(lambda x: x.lower()[-2:]=='nv', assetNameIdAssign.series)))

0                                 Aegon NV
1                       AerCap Holdings NV
2                  Anheuser Busch Inbev NV
3         Chicago Bridge & Iron Company NV
4                     Core Laboratories NV
5                             ING Groep NV
6                   Koninklijke Philips NV
7                          Schlumberger NV
8                    STMicroelectronics NV
9                              Unilever NV
10                           CNH Global NV
11                               Eurand NV
12                                Mylan NV
13                                 Relx NV
14                              Crucell NV
15                             Cimpress NV
16                   NXP Semiconductors NV
17            LyondellBasell Industries NV
18                               Yandex NV
19                    InterXion Holding NV
20                     AVG Technologies NV
21                         ASML Holding NV
22                 Franks International NV
23         

filtering news to see patterns; see if anything

In [65]:
re_dot = re.compile(r'[a-tv-zA-TV-Z]\.[a-rt-zA-RT-Z]')
re_ticker = re.compile(r'<[A-Z]+[a-z]*\.[A-Z]+>')
re_money = re.compile(r'\$[0-9]+(,[0-9][0-9][0-9])*\.[0-9][0-9][0-9]')
#symb = Counter()
for head in _N_.headline:
    #symb += Counter(head)
    if '¢' in (re_ticker.sub('', head).replace('.','')).lower():
        print(head)

99¢ Only Stores Reports Total Sales of $351.1 Million for the Third Quarter of Fiscal 2009 Ended December 27, 2008 <NDN.N>
99¢ Only Stores Will Sell 9 Apple iPod Nanos for Only 99¢ Each to Celebrate the Grand Opening of Its Newest 99¢ Only Stores on Thursday, January 22nd, 2009, in Norwalk, California <NDN.N>
99¢ Only Stores to Report Third Quarter Fiscal 2009 Financial Results on February 4, 2009 <NDN.N>
99¢ ONLY STORES� ANNOUNCES SUSPENSION OF EXIT FROM TEXAS MARKET TO RE-EVALUATE ITS TEXAS OPERATIONS IN LIGHT OF A SIGNIFICANT IMPROVEMENT IN JANUARY SALES RESULTS AND THE POTENTIAL BENEFICIAL OPPORTUNITIES RESULTING FROM A RECESSIONARY ECON
99¢ Only Stores Announces Suspension of Exit from Texas Market to Re-evaluate its Texas Operations in Light of a Significant Improvement in January Sales Results and the Potential Beneficial Opportunities Result <NDN.N>
99¢ ONLY STORES� REPORTS THIRD QUARTER FISCAL 2009 FINANCIAL RESULTS
99¢ Only Stores Reports Third Quarter Fiscal 2009 Financial R

99¢ Only Stores Will Sell 22\\" Sylvania Flat Screen LCD TVs for Only 99 Cents Each to Celebrate the Grand Opening of Its 99¢ Only Stores Thousand Oaks Location in San Antonio, Texas, on Friday the 13th of August <NDN.N>
99¢ Only Stores Reports $0.24 EPS for Q1 Fiscal 2011 versus $0.14 for Q1 Fiscal 2010 <NDN.N>
99¢ Only Stores Will Sell 19\\" Flat Screen LCD TVs for Only 99 Cents Each to the First 9 Customers to Celebrate the Grand Opening of Its First 99¢ Only Stores in Artesia, California, on Friday the 13th of August <NDN.N>
Common Cents Mobile Hosts 7¢ Appetizers at Black Bear Saloon August 25, 2010 <S.N>
99¢ Only Stores Will Sell 22\\" Phillips Flat Screen LCD TVs for Only 99 Cents Each to the First 9 Customers to Celebrate the Grand Opening of the New 99¢ Only Stores Location in North Hollywood, California on Th <NDN.N>
99¢ Only Stores to Sell Gallons of Milk for Only 89¢ in all of its Dallas-Fort Worth, Texas Stores Starting Saturday, September 18 <NDN.N>
99¢ Only Stores Will S

INTERCONTINENTAL HOTELS GROUP PLC <IHG.L> - TOTAL DIVIDEND PER SHARE 64.0¢, UP 16 PCT
Sturm, Ruger & Company, Inc. Reports 2012 Fully Diluted Earnings of $3.60 Per Share and Declares Dividend of 40.4¢ Per Share <RGR.N>
STURM, RUGER & COMPANY, INC. REPORTS FIRST QUARTER FULLY DILUTED EARNINGS OF $1.20 PER SHARE AND DIVIDEND OF 49¢ PER SHARE
Sturm, Ruger & Company, Inc. Reports First Quarter Fully Diluted Earnings of $1.20 Per Share and Dividend of 49¢ Per Share <RGR.N>
TARGA RESOURCES CORP <TRGP.N> SAYS BOARD OF DIRECTORS HAS DECLARED A QUARTERLY CASH DIVIDEND OF 53.25¢ PER SHARE
STURM, RUGER & COMPANY, INC. REPORTS SECOND QUARTER FULLY DILUTED EARNINGS OF $1.63 PER SHARE AND DIVIDEND OF 65¢ PER SHARE
Sturm, Ruger & Company, Inc. Reports Second Quarter Fully Diluted Earnings of $1.63 Per Share and Dividend of 65¢ Per Share <RGR.N>
INTERCONTINENTAL HOTELS GROUP PLC <IHG.L> - 10% INCREASE IN THE INTERIM DIVIDEND TO 23¢
RPT-INTERCONTINENTAL HOTELS GROUP PLC <IHG.L> - 10% INCREASE IN THE IN

Sturm, Ruger & Company, Inc. Reports 2015 Fully Diluted Earnings of $3.21 Per Share and Declares Dividend of 35¢ Per Share <RGR.N>
STURM, RUGER & COMPANY, INC. REPORTS 2015 FULLY DILUTED EARNINGS OF $3.21 PER SHARE AND DECLARES DIVIDEND OF 35¢ PER SHARE
STURM RUGER & CO INC <RGR.N> SAYS BOARD OF DIRECTORS DECLARED A DIVIDEND OF 35¢ PER SHARE FOR THE FOURTH QUARTER
STURM, RUGER & COMPANY, INC. REPORTS 2015 FULLY DILUTED EARNINGS OF $3.21 PER SHARE AND DECLARES DIVIDEND OF 35¢ PER SHARE
STURM, RUGER & COMPANY, INC. REPORTS FIRST QUARTER DILUTED EARNINGS OF $1.21 PER SHARE AND DECLARES DIVIDEND OF 48¢ PER SHARE
Sturm, Ruger & Company, Inc. Reports First Quarter Diluted Earnings of $1.21 Per Share and Declares Dividend of 48¢ Per Share <RGR.N>
ALASKA AIR GROUP INC SEES Q2 COST PER ASM EXCLUDING ITEMS 8.00¢ - 8.05¢
Carriage Services Raises Quarterly Cash Dividend To 5¢/QTR <CSV.N>
CARRIAGE SERVICES RAISES QUARTERLY CASH DIVIDEND TO 5¢/QTR
CARRIAGE SERVICES INC <CSV.N> SAYS BOARD HAS APPROVE

In [61]:
pd.DataFrame([t for t in symb.most_common() if t[0] not in ' abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'])

Unnamed: 0,0,1
0,.,9361350
1,-,7011447
2,>,4418923
3,<,4418899
4,",",4295058
5,$,2368578
6,',1110070
7,:,900175
8,/,752278
9,&,542335


In [91]:
test = pd.read_pickle(the_data/'given/test.pkl')

In [99]:
for t, (__, NN, ____) in enumerate(test):
    smapi = {}
    for i, src in enumerate(NN.sourceId):
        if src in smapi and smapi[src] != i-1:
            print(t, smapi[src], i)
        smapi[src] = i

In [86]:
pd.Series(list(tickers-set(assetCodeIdAssign.series)))

0              SILD.L
1              SSPH.L
2              PINN.O
3              MDME.L
4              LARD.L
5              KEL.TO
6               CEO.L
7               SAL.O
8          PVASTAT.AX
9               EFF.N
10              DHG.N
11            CAGR.PA
12             GEMS.O
13           NHLDD.OB
14             LII.TO
15              PEO.N
16            UVIC.PK
17            ERCS.WA
18             MMZ.TO
19              ICS.N
20           CCTYQ.PK
21             IGAS.L
22            UBSVX.N
23              PGP.N
24            BPTH.OB
25              SMO.V
26              PNS.A
27            MRPL.BO
28             ATSC.A
29            BOKF.OQ
30            CTICD.O
31              TVL.V
32             WWW.TO
33             AGIL.O
34             VMT.AX
35           ABGek.MC
36             CWBS.O
37              RAS.L
38            ADAV.AD
39             AERO.O
40               PQ.N
41             WHLM.O
42              BVM.L
43              JSN.N
44              ACG.L
45        

# Scratch

In [39]:
'FDK' in codes

False

In [45]:
pd.set_option('display.max_colwidth', 115)

In [50]:
pd.set_option('display.max_colwidth', 115)
N[N.firstMentionSentence==1][['assetName','headline']].pipe(sample(5000)).pipe(ljust_c('headline'))

Unnamed: 0,assetName,headline
1521238,Bank of America Corp,UPDATE 1-Bank of America completes Merrill Lynch purchase ...
1521247,Bank of America Corp,UPDATE 1-Bank of America completes Merrill Lynch purchase ...
1521255,Microsoft Corp,Microsoft blames leap year for Zune glitch ...
1521329,KB Financial Group Inc,"S.KOREA DEC HOUSING PRICES UP 3.1 PCT YR/YR, GROWTH SLOWS FOR 4TH MONTH IN ROW -KOOKMIN BANK ..."
1521336,BP PLC,DGAP-Adhoc: BP p.l.c.: Total Voting Rights <BP.L> ...
1521347,Eli Lilly and Co,REG-Lilly (Eli) & Co: Regulatory Application update - prasugrel <LLY.N> ...
1521361,Deutsche Bank AG,UPDATE 1-Deutsche Post CFO to step down in June ...
1521372,Tata Motors Ltd,"INDIA'S TATA MOTORS <TAMO.BO> DEC VEHICLE SALES 25,219 UNITS ..."
1521405,Lloyds Banking Group PLC,REG-Lloyds TSB Group Plc Rule 8.1 - Lloyds TSB Group PLC <LLOY.L> ...
1521425,Stantec Inc,Stantec Completes Acquisition of Environmental Consulting Firm Jacques Whitford <STN.TO> ...
