In [1]:
import gensim
import os

from tqdm import tqdm
from gensim.models.keyedvectors import KeyedVectors

# Directory whose entries are streamed line by line as the training corpus.
path_10k = './data/10k_1900_org/'
# File name the trained model is saved under and later reloaded from.
model_name = "fin_word2vec"

In [2]:
class MySentences(object):
    """Re-iterable stream of whitespace-tokenised lines from a directory.

    Every regular file directly inside ``dirname`` is read line by line and
    each line is yielded as the list produced by ``str.split()``. A new pass
    over the directory starts every time the object is iterated, so it can be
    consumed more than once (the training output in this notebook shows
    several full passes).

    Parameters
    ----------
    dirname : str
        Path of the directory holding the corpus files.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        """Yield one token list per line, with a tqdm progress bar over the files."""
        for fname in tqdm(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            # Skip sub-directories and other non-file entries instead of
            # crashing part-way through a long pass.
            if not os.path.isfile(path):
                continue
            # The context manager closes each handle as soon as the file has
            # been consumed; a bare open() in the loop leaked one descriptor
            # per file on every pass.
            with open(path) as handle:
                for line in handle:
                    yield line.split()
                
# Lazy corpus handle: nothing is read from disk until it is iterated.
sentences = MySentences(dirname=path_10k)

In [4]:
# Passed as `size` to Word2Vec in the training cell.
num_features = 300

# Passed as `min_count`: words occurring fewer times than this are left out of
# the model. Lower it for a small corpus, raise it for a large one.
min_word_count = 3

# Passed as `workers`.
# NOTE(review): hard-coded for the original training machine - adjust to the host.
num_workers = 15   # CPU cores

window_size = 6    # Passed as `window`: context window

subsampling = 1e-3     # Passed as `sample`: subsampling rate for frequent terms

In [6]:
# Train the embedding on the streamed corpus; every hyper-parameter comes from
# the configuration cell and is forwarded by keyword.
model_fin = gensim.models.Word2Vec(
    sentences,
    **{
        'size': num_features,
        'window': window_size,
        'min_count': min_word_count,
        'sample': subsampling,
        'workers': num_workers,
    }
)

100%|██████████| 33612/33612 [04:42<00:00, 118.93it/s]
100%|██████████| 33612/33612 [22:38<00:00, 24.75it/s]
100%|██████████| 33612/33612 [22:09<00:00, 25.28it/s]
100%|██████████| 33612/33612 [22:03<00:00, 25.40it/s]
100%|██████████| 33612/33612 [23:01<00:00, 24.34it/s]
100%|██████████| 33612/33612 [24:03<00:00, 23.28it/s]


In [7]:
# NOTE(review): in gensim 3.x, init_sims(replace=True) is documented to keep only
# the L2-normalised vectors, so the saved model presumably cannot be trained
# further - confirm before relying on it.
model_fin.init_sims(replace=True)
# Persist the model under the file name defined by `model_name`.
model_fin.save(model_name)

In [4]:
# Reload the model from the file written by the save cell; presumably this lets
# the comparison cells run in a fresh kernel without retraining.
model_fin = gensim.models.Word2Vec.load(model_name)

In [5]:
# Pre-trained vectors in word2vec binary format, used as the comparison model.
# The location was a machine-specific absolute path; it can now be overridden
# with the GENERAL_WORD2VEC_PATH environment variable. The default is unchanged.
general_word2vec_file_path = os.environ.get(
    'GENERAL_WORD2VEC_PATH',
    '/hdd/data/NLP_data/GoogleNews-vectors-negative300.bin.gz',
)
model_gen = KeyedVectors.load_word2vec_format(general_word2vec_file_path, binary=True)

In [6]:
# Query the word vectors through `.wv`: calling most_similar on the Word2Vec
# model itself is deprecated in gensim 3.x and emits a DeprecationWarning.
model_fin.wv.most_similar('loss')

  """Entry point for launching an IPython kernel.


[('losses', 0.5062823295593262),
 ('reduction', 0.449474036693573),
 ('write-offs', 0.4352741837501526),
 ('pretax', 0.4253714680671692),
 ('catastrophic', 0.4217339754104614),
 ('Loss', 0.41880518198013306),
 ('claims', 0.410952091217041),
 ('adjusting', 0.4031469523906708),
 ('write-downs', 0.40068864822387695),
 ('realized', 0.3899797797203064)]

In [7]:
# Ten nearest neighbours of 'loss' in the pre-trained comparison vectors.
model_gen.most_similar(positive=['loss'], topn=10)

[('losses', 0.7114114761352539),
 ('losing', 0.5708736181259155),
 ('Loss', 0.5704946517944336),
 ('lost', 0.5399519801139832),
 ('setback', 0.535179615020752),
 ('defeat', 0.5325874090194702),
 ('losss', 0.49183201789855957),
 ('Losses', 0.4891048073768616),
 ('drubbing', 0.48261427879333496),
 ('lossof', 0.4781075119972229)]

In [9]:
# Query the word vectors through `.wv`: calling most_similar on the Word2Vec
# model itself is deprecated in gensim 3.x and emits a DeprecationWarning.
model_fin.wv.most_similar('seasonality')

  """Entry point for launching an IPython kernel.


[('seasonal', 0.831345796585083),
 ('Seasonality', 0.6415965557098389),
 ('Seasonal', 0.5762810707092285),
 ('cyclical', 0.5667065382003784),
 ('seasons', 0.5256400108337402),
 ('summer', 0.521816611289978),
 ('cyclicality', 0.5193940997123718),
 ('patterns', 0.5144165754318237),
 ('season', 0.5143308639526367),
 ('somewhat', 0.5067400932312012)]

In [10]:
# Ten nearest neighbours of 'seasonality' in the pre-trained comparison vectors.
model_gen.most_similar(positive=['seasonality'], topn=10)

[('seasonally', 0.6071540713310242),
 ('seasonally_slower', 0.6037262678146362),
 ('seasonally_weak', 0.5952291488647461),
 ('seasonally_weaker', 0.5834262371063232),
 ('seasonally_slowest', 0.5686658024787903),
 ('seasonal', 0.5668664574623108),
 ('seasonally_weakest', 0.5654483437538147),
 ('seasonal_fluctuations', 0.5628937482833862),
 ('gross_margins', 0.5598294734954834),
 ('seasonal_slowness', 0.556877613067627)]

In [11]:
# Query the word vectors through `.wv`: calling most_similar on the Word2Vec
# model itself is deprecated in gensim 3.x and emits a DeprecationWarning.
model_fin.wv.most_similar('risk')

  """Entry point for launching an IPython kernel.


[('risks', 0.6855504512786865),
 ('exposures', 0.5466366410255432),
 ('volatility', 0.47754788398742676),
 ('movements', 0.4771957993507385),
 ('exposure', 0.4766755998134613),
 ('sensitivity', 0.4718496799468994),
 ('portfolios', 0.46601906418800354),
 ('portfolio', 0.4542062282562256),
 ('sensitive', 0.44436657428741455),
 ('modeling', 0.4413449764251709)]

In [12]:
# Ten nearest neighbours of 'risk' in the pre-trained comparison vectors.
model_gen.most_similar(positive=['risk'], topn=10)

[('risks', 0.7139054536819458),
 ('Risk', 0.6394845843315125),
 ('probability', 0.5938692688941956),
 ('danger', 0.5917248725891113),
 ('likelihood', 0.5871132612228394),
 ('therisk', 0.526685357093811),
 ('risky', 0.5120391845703125),
 ('Risks', 0.4975811541080475),
 ('hazard', 0.48922520875930786),
 ('peril', 0.48811468482017517)]

In [13]:
# Query the word vectors through `.wv`: calling most_similar on the Word2Vec
# model itself is deprecated in gensim 3.x and emits a DeprecationWarning.
model_fin.wv.most_similar('Apple')

  """Entry point for launching an IPython kernel.


[('Roku', 0.5481631755828857),
 ('HomeKit', 0.5420140027999878),
 ('Acer', 0.5328850746154785),
 ('Zynga', 0.5059128999710083),
 ('Muvee', 0.48369506001472473),
 ('Windows', 0.48292556405067444),
 ('iPads', 0.47787484526634216),
 ('LogMeIn', 0.47731736302375793),
 ('Fitbit', 0.47358956933021545),
 ('iOS', 0.47351932525634766)]

In [14]:
# Ten nearest neighbours of 'Apple' in the pre-trained comparison vectors.
model_gen.most_similar(positive=['Apple'], topn=10)

[('Apple_AAPL', 0.7456985712051392),
 ('Apple_Nasdaq_AAPL', 0.7300410270690918),
 ('Apple_NASDAQ_AAPL', 0.7175089716911316),
 ('Apple_Computer', 0.7145973443984985),
 ('iPhone', 0.6924266219139099),
 ('Apple_NSDQ_AAPL', 0.6868604421615601),
 ('Steve_Jobs', 0.6758422255516052),
 ('iPad', 0.6580768823623657),
 ('Apple_nasdaq_AAPL', 0.6444970965385437),
 ('AAPL_PriceWatch_Alert', 0.6439753770828247)]

### Analogy Intrinsic Task

In [15]:
def analogy(model, x1, x2, y1):
    """Answer the analogy "x1 is to x2 as y1 is to ?".

    The query is ``most_similar(positive=[y1, x2], negative=[x1])``.

    Parameters
    ----------
    model : KeyedVectors or an object exposing them as ``.wv``
        Either a set of word vectors or a full model wrapping one.
    x1, x2 : str
        The known pair (for example a company and its product).
    y1 : str
        The word whose counterpart is wanted.

    Returns
    -------
    Whatever ``most_similar`` returns for the query.
    """
    # A full Word2Vec model only offers most_similar as a deprecated shim, so
    # go to its word vectors. A KeyedVectors instance is used as it is, because
    # reading `.wv` on one is itself deprecated.
    if isinstance(model, KeyedVectors):
        vectors = model
    else:
        vectors = getattr(model, 'wv', model)
    return vectors.most_similar(positive=[y1, x2], negative=[x1])

In [16]:
# Apple : iOS :: Google : ?  (model trained in this notebook)
analogy(model=model_fin, x1='Apple', x2='iOS', y1='Google')

  


[('Android', 0.5993649959564209),
 ('apps', 0.5661303997039795),
 ('downloadable', 0.5332924127578735),
 ('Apps', 0.5264043211936951),
 ('Macs', 0.5256156921386719),
 ('App', 0.524298906326294),
 ('Kindle', 0.5140968561172485),
 ('download', 0.5076683759689331),
 ('iPad', 0.5064690709114075),
 ('browser', 0.4977918565273285)]

In [17]:
# Apple : iOS :: Google : ?  (pre-trained comparison vectors)
analogy(model=model_gen, x1='Apple', x2='iOS', y1='Google')

[('Gmail', 0.623252272605896),
 ('Firefox', 0.610449492931366),
 ('Google_Chrome_browser', 0.5981649160385132),
 ('Instant_Previews', 0.597310483455658),
 ('search_engine', 0.5957738161087036),
 ('Realtime_Search', 0.5949864983558655),
 ('Google_Maps', 0.5920776724815369),
 ('GoogleBot', 0.5913113355636597),
 ('GMail', 0.5819740891456604),
 ('GTalk', 0.581884503364563)]

In [18]:
# Microsoft : Windows :: Apple : ?  (model trained in this notebook)
analogy(model=model_fin, x1='Microsoft', x2='Windows', y1='Apple')

  


[('Multi-Touch', 0.47000741958618164),
 ('phones', 0.4420347809791565),
 ('iOS', 0.44144952297210693),
 ('Mini-Note', 0.43403080105781555),
 ('Roku', 0.432131826877594),
 ('iBooks', 0.4201308488845825),
 ('iPads', 0.4152991771697998),
 ('iPhone', 0.4151184856891632),
 ('appliances', 0.4105103313922882),
 ('HomeKit', 0.40963250398635864)]

In [19]:
# Microsoft : Windows :: Apple : ?  (pre-trained comparison vectors)
analogy(model=model_gen, x1='Microsoft', x2='Windows', y1='Apple')

[('Macs', 0.6735683679580688),
 ('iMac', 0.6463401317596436),
 ('Mac_OS', 0.6407142877578735),
 ('iPhone', 0.6405883431434631),
 ('iPad', 0.6334643363952637),
 ('OS_X', 0.632135808467865),
 ('iBook', 0.626196563243866),
 ('iMacs', 0.6192452311515808),
 ('iOS', 0.6171784400939941),
 ('Mac_mini', 0.6111396551132202)]

In [20]:
# Microsoft : Nadella :: Google : ?  (model trained in this notebook)
# NOTE(review): the original cell carried a bare '###########' marker,
# presumably flagging this query for follow-up.
analogy(model=model_fin, x1='Microsoft', x2='Nadella', y1='Google')

  


[('Ex21', 0.47667866945266724),
 ('MENEAR', 0.4631417393684387),
 ('ESRX', 0.4462171792984009),
 ('Goggins', 0.44502824544906616),
 ('Slessor', 0.44067633152008057),
 ('Peltzman', 0.4400343894958496),
 ('shutterfly', 0.4358164668083191),
 ('NakedApartments', 0.43540889024734497),
 ('$73.93', 0.4330419898033142),
 ('10533', 0.4322330355644226)]

In [21]:
# Microsoft : Nadella :: Google : ?  (pre-trained comparison vectors)
analogy(model=model_gen, x1='Microsoft', x2='Nadella', y1='Google')

[('search_engine', 0.6103251576423645),
 ('Udi_Manber', 0.5701125860214233),
 ('search_engines', 0.542169451713562),
 ('Sukhinder_Singh_Cassidy', 0.5397337675094604),
 ('Baidu', 0.52626633644104),
 ('Susan_Wojcicki', 0.5186780691146851),
 ('Costolo', 0.5149146914482117),
 ('Google_GOOG', 0.512500524520874),
 ('Shashi_Seth', 0.5101230144500732),
 ('Ask_Jeeves', 0.5079712867736816)]

In [22]:
# Footlocker : footwear :: Guess : ?  (model trained in this notebook)
analogy(model=model_fin, x1='Footlocker', x2='footwear', y1='Guess')

  


[('apparel', 0.531299352645874),
 ('watches', 0.5268425941467285),
 ('boots', 0.49671846628189087),
 ('shirts', 0.4928165078163147),
 ('Lacoste', 0.47323545813560486),
 ('sweaters', 0.47144144773483276),
 ('leather', 0.4669017195701599),
 ('Couture', 0.4624824523925781),
 ('sportswear', 0.4598991870880127),
 ('Nautica', 0.45740416646003723)]

In [23]:
# Footlocker : footwear :: Guess : ?  (pre-trained comparison vectors)
analogy(model=model_gen, x1='Footlocker', x2='footwear', y1='Guess')

[('shoe', 0.47444984316825867),
 ('sandals', 0.44797807931900024),
 ('shoes', 0.4443754255771637),
 ('footware', 0.4354614019393921),
 ('apparel', 0.42583325505256653),
 ('clothing', 0.42472803592681885),
 ('Dansko_clogs', 0.4087808132171631),
 ('denims', 0.4050804674625397),
 ('denim', 0.40449780225753784),
 ('MC_Hammer_pants', 0.4019485116004944)]

In [24]:
# Google : California :: Microsoft : ?  (model trained in this notebook)
analogy(model=model_fin, x1='Google', x2='California', y1='Microsoft')

  


[('Minnesota', 0.5028643608093262),
 ('Texas', 0.49553221464157104),
 ('Georgia', 0.49167919158935547),
 ('Indiana', 0.4916401505470276),
 ('Florida', 0.4826250672340393),
 ('Washington', 0.4682881236076355),
 ('Oregon', 0.4650631844997406),
 ('Mississippi', 0.4469985365867615),
 ('Sylmar', 0.4395696520805359),
 ('Salt', 0.43903231620788574)]

In [25]:
# Google : California :: Microsoft : ?  (pre-trained comparison vectors)
analogy(model=model_gen, x1='Google', x2='California', y1='Microsoft')

[('Southern_California', 0.6022380590438843),
 ('San_Diego', 0.5298261642456055),
 ('Calfornia', 0.528758704662323),
 ('Califonia', 0.5060601830482483),
 ('Gov._Arnold_Schwarzenegger', 0.4928062856197357),
 ('Calif.', 0.49020737409591675),
 ('Sacramento', 0.488997220993042),
 ('Califor_nia', 0.48835262656211853),
 ('Californians', 0.4864913821220398),
 ('Califronia', 0.4828268587589264)]

### Odd-one-out Intrinsic Task

In [42]:
# Go through `.wv`: doesnt_match on the Word2Vec model itself is deprecated in
# gensim 3.x and emits a DeprecationWarning.
print(model_fin.wv.doesnt_match("breakfast cereal dinner lunch".split()))

cereal


  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [66]:
# Go through `.wv`: doesnt_match on the Word2Vec model itself is deprecated in
# gensim 3.x and emits a DeprecationWarning.
print(model_fin.wv.doesnt_match("Microsoft Google Facebook Netflix Apple".split()))

Apple


  """Entry point for launching an IPython kernel.


In [65]:
# Odd-one-out among five company names, using the pre-trained comparison vectors.
print(model_gen.doesnt_match(["Microsoft", "Google", "Facebook", "Netflix", "Apple"]))

Facebook


In [49]:
# Go through `.wv`: doesnt_match on the Word2Vec model itself is deprecated in
# gensim 3.x and emits a DeprecationWarning.
print(model_fin.wv.doesnt_match("Guess McDonald's Darden Starbucks".split()))

Guess


  """Entry point for launching an IPython kernel.


In [50]:
# Odd-one-out among four company names, using the pre-trained comparison vectors.
print(model_gen.doesnt_match(["Guess", "McDonald's", "Darden", "Starbucks"]))

Darden


In [51]:
# Go through `.wv`: doesnt_match on the Word2Vec model itself is deprecated in
# gensim 3.x and emits a DeprecationWarning.
print(model_fin.wv.doesnt_match("Inventory Receivables Payable Cashflows".split()))

Cashflows


  """Entry point for launching an IPython kernel.


In [52]:
# Odd-one-out among four accounting terms, using the pre-trained comparison vectors.
print(model_gen.doesnt_match(["Inventory", "Receivables", "Payable", "Cashflows"]))

Inventory


### Vocab Size

In [71]:
# Number of entries in the vocabulary of the model trained in this notebook.
len(model_fin.wv.vocab)

493577

In [72]:
# `model_gen` is already a KeyedVectors object; reading `.wv` on it is
# deprecated in gensim 3.x and emits a DeprecationWarning, so use `vocab` directly.
len(model_gen.vocab)

  """Entry point for launching an IPython kernel.


3000000