# Idea:
Our solution: LDA + keywords from clusters of BERT based embeddings of noun phrases and verbs :
- Each noun phrase and verb in the texts is  transformed to embedding vector using Universal Sentence Encoder (transformer based on BERT)
- Embedding vectors from (a) are grouped into clusters with cosign similarity >= 70%
- Words/phrases with embedding vectors closest to the centers of resulting clusters form key word/phrase
- Each text in the training sample is converted to collection of key-phrases by replacing its noun phrases and verbs with keyword/phrases and deleting other words
- LDA is performed on the transformed texts


**Reference:**<br>
- Daniel Cer, Yinfei Yang, Sheng-yi Kong, Nan Hua, Nicole Limtiaco, Rhomni St. John, Noah Constant, Mario Guajardo-Céspedes, Steve Yuan, Chris Tar, Yun-Hsuan Sung, Brian Strope, Ray Kurzweil. **Universal Sentence Encoder.** *arXiv:1803.11175, 2018.*

# Load data and python libraries

In [1]:
# data processing libraries
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# display wider columns in pandas data frames where necessary
pd.set_option('max_colwidth',150)

import tensorflow as tf
print("TensorFlow version:", tf.__version__)

import tensorflow_hub as hub
#Load the Universal Sentence Encoder's TF Hub module
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
model = hub.load(module_url)
print ("module %s loaded" % module_url)

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import time
import pickle

TensorFlow version: 2.2.0
module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


In [2]:
df_train = pd.read_csv("./data/train_grouped.tsv", sep="\t")
print("df_train.shape:", df_train.shape)
print("df_train.shape:",df_train.columns)

df_train.shape: (33982, 16)
df_train.shape: Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3'],
      dtype='object')


# Getting text clusters through sentence embedding comparison

In [5]:
def get_embeddings(input):
    return model(input)

In [6]:
def get_word_embeddings(df_data, column = "word", N_batches=1):
    #split data into N batches
    N = N_batches

    part = int(len(df_data)/N)
    print(N, "batches with", part + 1, column + "s each")

    #get embeddings for each N words
    index = 0
    batch_num = 0
    list_dfs = []

    while index < len(df_data): 
        df_tmp = df_data.iloc[index : index + part].copy()
        df_tmp = df_tmp.reset_index(drop=True)
        print ("Batch number:", batch_num + 1, "out of ", N, "index:", index)

        df_batch_embeddings = pd.DataFrame(get_embeddings(list(df_tmp[column])).numpy())

        num_embeddings = df_batch_embeddings.shape[1]
        columns = ["emb_" + str(i) for i in range(512)]
        df_tmp[columns] = df_batch_embeddings

        list_dfs.append(df_tmp)
        batch_num = batch_num + 1
        index = index + part

    #concatinate batches into single dataset
    df_emb = pd.concat(list_dfs)

    return df_emb

In [7]:
df_train['noun_phrases'] = df_train['noun_phrases'].str[2:-2]
df_train['noun_phrases'] = df_train['noun_phrases'].str.lower().str.split("', '")
df_train['noun_phrases'].head()

0    [rise, big emerging economy, china, india, steady march, globalisation, surge, number, people, business, tourism, result, demand, visa, unpreceden...
1    [pfizer, commitment, corporate social responsibility csr, drugs giant talk, responsibility, society, world, access, product, work, ngos, global he...
2    [week, federal reserve, interest rate, time, year, world, central bank, rate, recent year, long spell, course, chart, outcome, americas rate rise,...
3    [cruise line, wave, year, nearly, holiday, sea, result, december 18th carnival, worlds largest operator, global market, fullyear earning, demand, ...
4    [investors, calendar year, buoyant mood, unexpected event, consensus, respect, view, investor, market price, column, potential surprise, definitio...
Name: noun_phrases, dtype: object

In [8]:
all_NPs = list(df_train['noun_phrases'])
all_NPs = [np for l in all_NPs for np in l if len(np)>0]
all_NPs[:5], len(all_NPs)

(['rise', 'big emerging economy', 'china', 'india', 'steady march'], 1417049)

In [9]:
df_train['list_of_verb_lemmas'].iloc[0]

'[emerging, led, wanting, travel, granted, Upgrade, travel, apply, submit, streamline, scrap]'

In [10]:
df_train['list_of_verb_lemmas'] = df_train['list_of_verb_lemmas'].str[2:-2]
df_train['list_of_verb_lemmas'] = df_train['list_of_verb_lemmas'].str.lower().str.split(", ")
df_train['list_of_verb_lemmas'].head()

0                                                               [merging, led, wanting, travel, granted, upgrade, travel, apply, submit, streamline, scra]
1    [rided, embracing, insists, gain, strengthen, improve, deterred, seeking, intends, shift, domiciled, rejoiced, saved, paid, outraged, promised, im...
2    [aised, ended, celebrate, tried, lift, forced, reverse, cut, help, understand, upgrade, strike, wish, save, spend, try, escape, slashing, encourag...
3      [race, booked, improve, announced, control, demand, peaking, piling, based, got, moving, upgrade, increase, announced, establish, aimed, based, ad]
4    [tart, caught, proved, reflected, like, suggest, judged, betting, expect, upgrade, weakens, having, pushed, tighten, buy, priced, doubt, tighten, ...
Name: list_of_verb_lemmas, dtype: object

In [11]:
all_Vs = list(df_train['list_of_verb_lemmas'])
all_Vs = [v for l in all_Vs for v in l if len(v)>0]
all_Vs[:5], len(all_Vs)

(['merging', 'led', 'wanting', 'travel', 'granted'], 675330)

In [12]:
all_words =  list(set(all_NPs + all_Vs))
len(set(all_words))

419327

In [13]:
df_words = pd.DataFrame({'word': all_words})
df_words.head()

Unnamed: 0,word
0,trusted trading partner
1,german sociologist
2,largest movie theater chain
3,emergency condition
4,right cameraa


In [12]:
%%time
#creating word2vec matrix
df_w2v = get_word_embeddings(df_words, column = "word", N_batches=100)
df_w2v.head()

100 batches with 4194 words each
Batch number: 1 out of  100 index: 0
Batch number: 2 out of  100 index: 4193
Batch number: 3 out of  100 index: 8386
Batch number: 4 out of  100 index: 12579
Batch number: 5 out of  100 index: 16772
Batch number: 6 out of  100 index: 20965
Batch number: 7 out of  100 index: 25158
Batch number: 8 out of  100 index: 29351
Batch number: 9 out of  100 index: 33544
Batch number: 10 out of  100 index: 37737
Batch number: 11 out of  100 index: 41930
Batch number: 12 out of  100 index: 46123
Batch number: 13 out of  100 index: 50316
Batch number: 14 out of  100 index: 54509
Batch number: 15 out of  100 index: 58702
Batch number: 16 out of  100 index: 62895
Batch number: 17 out of  100 index: 67088
Batch number: 18 out of  100 index: 71281
Batch number: 19 out of  100 index: 75474
Batch number: 20 out of  100 index: 79667
Batch number: 21 out of  100 index: 83860
Batch number: 22 out of  100 index: 88053
Batch number: 23 out of  100 index: 92246
Batch number: 24

Unnamed: 0,word,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_502,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511
0,understated eyecatching design,-0.019827,-0.006422,0.01457,-0.0393,-0.024002,0.054702,0.004713,-0.029246,0.004955,...,0.070869,-0.03697,-0.019612,-0.023454,0.000452,-0.076762,-0.026164,0.079127,-0.016291,0.000221
1,proinflammatory diet,0.047472,-0.036095,0.003528,-0.057485,-0.034209,0.014092,-0.045369,-0.006103,0.041479,...,0.018932,-0.031069,-0.012415,0.022317,-0.053924,0.067012,-0.02176,0.060036,-0.045503,0.067614
2,peoples pocket,-0.077422,0.006368,0.007027,0.026393,-0.011804,0.010887,-0.002622,0.091837,0.07526,...,0.017116,-0.077545,0.012053,-0.000137,0.016847,0.053931,0.004807,0.054155,-0.033633,0.000742
3,wacos bsr cable park,-0.028665,0.033037,-0.008898,0.02561,-0.021672,0.002751,-0.074617,-0.022317,-0.077283,...,-0.057017,-0.00156,0.057492,0.008258,0.024517,0.086742,0.026507,-0.024543,0.035418,0.036425
4,andrea josse,-0.030393,0.026402,0.040522,-0.055174,0.03781,-0.047927,0.042512,-0.012625,0.005063,...,-0.009257,0.040241,-0.040541,0.00701,0.017564,-0.009971,-0.039097,0.013524,0.041107,0.057602


In [13]:
df_w2v.shape

(419327, 513)

In [14]:
df_w2v.iloc[::100000]

Unnamed: 0,word,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_502,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511
0,understated eyecatching design,-0.019827,-0.006422,0.01457,-0.0393,-0.024002,0.054702,0.004713,-0.029246,0.004955,...,0.070869,-0.03697,-0.019612,-0.023454,0.000452,-0.076762,-0.026164,0.079127,-0.016291,0.000221
3561,usual gear,0.030396,0.048733,0.006757,0.03005,0.014397,-0.028559,0.00208,0.052073,-0.000604,...,-0.043511,0.063861,-0.002875,0.009074,0.020639,-0.019907,-0.046328,0.063082,0.022643,0.024093
2929,banco popular,-0.006136,0.053859,-0.027051,0.028665,0.007711,0.084763,-0.041701,-0.051199,-0.015713,...,0.01046,0.015526,0.015577,-0.044722,-0.044955,0.02115,-0.029704,-0.033842,-0.046479,0.021843
2297,trekkies,-0.027514,0.053677,0.028022,-0.000125,0.069076,-0.055233,0.026868,-0.025693,0.00756,...,0.037944,0.019393,-0.008219,-0.013544,-0.010799,0.046405,-0.01986,0.016044,-0.011341,-0.003216
1665,harvard universitys t.h. chan school,0.019255,0.03157,-0.016233,-0.085584,-0.027504,0.023182,-0.050383,-0.115471,0.0635,...,0.034805,-0.001658,0.077751,-0.030084,-0.040467,-0.052682,-0.035337,-0.062311,-0.051079,-0.017573


# Clustering

In [15]:
threshold = 0.7

In [16]:
# load data
with open('./transition_files/df_result_'+str(52000)+'.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    df_res = pickle.load(f)
df_res.head(1)

Unnamed: 0,word,cl_number,cl_size
0,monthly service charge,0,7


In [17]:
words_list = list(set(df_w2v['word']) - set(df_res['word']))
print("Total number of words:", len(words_list), "out of", len(df_w2v))
df_tmp = df_w2v[df_w2v['word'].isin(words_list) == True]
print(df_res.shape, df_tmp.shape)

columns = ["emb_" + str(i) for i in range(512)]
df_list = [df_res]
cl_num = df_res['cl_number'].max() + 1

Total number of words: 176291 out of 419327
(243036, 3) (176291, 513)


In [18]:
#this step took approximatelly 48h

words_list = list(set(df_w2v['word']) - set(df_res['word']))
print("Total number of words:", len(words_list), "out of", len(df_w2v))
df_tmp = df_w2v[df_w2v['word'].isin(words_list) == True]
print(df_res.shape, df_tmp.shape)

columns = ["emb_" + str(i) for i in range(512)]
df_list = [df_res]
cl_num = df_res['cl_number'].max() + 1

start = time.time()
while len(df_tmp)>0:
    word = df_tmp['word'].iloc[0]
    word_emb = df_tmp[df_tmp['word'] == word][columns].values
    df_tmp["sim"] = cosine_similarity(df_tmp[columns].values,word_emb)
    df_tmp["sim"] = df_tmp["sim"].apply(np.abs)

    df_cluster = df_tmp[df_tmp["sim"] >= threshold]
    df_cluster["cl_number"] = cl_num
    selected_words = list(df_cluster['word'])
    df_cluster["cl_size"] = len(selected_words)
    df_list.append(df_cluster[['word', "cl_number", "cl_size"]])

    df_tmp = df_tmp[df_tmp['word'].isin(selected_words) == False]
    cl_num = cl_num + 1

    if (cl_num % 1000) == 0:
        finish = time.time()
        print("\nTIME ELAPCED (in minutes):", round((finish-start)/60,2),cl_num)
        print("Cluster number:", cl_num, "\tCluster size:", len(df_cluster), "\t", len(df_tmp))
        
        df_result = pd.concat(df_list)
        print(df_result.shape)
        with open('./transition_files/df_result_'+str(cl_num)+'.pickle', 'wb') as f:
            # Pickle the 'data' dictionary using the highest protocol available.
            pickle.dump(df_result, f, pickle.HIGHEST_PROTOCOL)
        start = time.time()
          

Total number of words: 176291 out of 419327
(243036, 3) (176291, 513)

TIME ELAPCED (in minutes): 8.44 53000
Cluster number: 53000 	Cluster size: 2 	 174063
(245264, 3)

TIME ELAPCED (in minutes): 8.34 54000
Cluster number: 54000 	Cluster size: 2 	 171847
(247480, 3)

TIME ELAPCED (in minutes): 8.23 55000
Cluster number: 55000 	Cluster size: 1 	 169680
(249647, 3)

TIME ELAPCED (in minutes): 8.12 56000
Cluster number: 56000 	Cluster size: 2 	 167593
(251734, 3)

TIME ELAPCED (in minutes): 8.04 57000
Cluster number: 57000 	Cluster size: 3 	 165483
(253844, 3)

TIME ELAPCED (in minutes): 7.97 58000
Cluster number: 58000 	Cluster size: 1 	 163442
(255885, 3)

TIME ELAPCED (in minutes): 7.87 59000
Cluster number: 59000 	Cluster size: 1 	 161517
(257810, 3)

TIME ELAPCED (in minutes): 7.78 60000
Cluster number: 60000 	Cluster size: 1 	 159477
(259850, 3)

TIME ELAPCED (in minutes): 8.51 61000
Cluster number: 61000 	Cluster size: 1 	 157438
(261889, 3)

TIME ELAPCED (in minutes): 7.57 62000


(370290, 3)

TIME ELAPCED (in minutes): 3.57 135000
Cluster number: 135000 	Cluster size: 1 	 47843
(371484, 3)

TIME ELAPCED (in minutes): 3.54 136000
Cluster number: 136000 	Cluster size: 1 	 46663
(372664, 3)

TIME ELAPCED (in minutes): 2.94 137000
Cluster number: 137000 	Cluster size: 1 	 45513
(373814, 3)

TIME ELAPCED (in minutes): 2.58 138000
Cluster number: 138000 	Cluster size: 1 	 44353
(374974, 3)

TIME ELAPCED (in minutes): 3.03 139000
Cluster number: 139000 	Cluster size: 1 	 43166
(376161, 3)

TIME ELAPCED (in minutes): 3.2 140000
Cluster number: 140000 	Cluster size: 1 	 42018
(377309, 3)

TIME ELAPCED (in minutes): 3.06 141000
Cluster number: 141000 	Cluster size: 1 	 40865
(378462, 3)

TIME ELAPCED (in minutes): 2.99 142000
Cluster number: 142000 	Cluster size: 1 	 39729
(379598, 3)

TIME ELAPCED (in minutes): 4.02 143000
Cluster number: 143000 	Cluster size: 1 	 38596
(380731, 3)

TIME ELAPCED (in minutes): 2.06 144000
Cluster number: 144000 	Cluster size: 1 	 37468
(

In [19]:
df_result = pd.concat(df_list)
print(df_result.shape)
with open('./transition_files/df_result.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(df_result, f, pickle.HIGHEST_PROTOCOL)

(419327, 3)


# Prepare data for LDA

##### get cluster label as most frequent word/phrase of the cluster used in the training dataset

In [4]:
# load data
with open('./transition_files/df_result.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    df_res = pickle.load(f)
print(df_res.shape)
df_res.head(1)

(419327, 3)


Unnamed: 0,word,cl_number,cl_size
0,monthly service charge,0,7


In [16]:
all_words =  all_NPs + all_Vs
df_all_words = pd.DataFrame({'word': all_words})
print(len(df_words))
df_all_words.head()

2092379


Unnamed: 0,word
0,rise
1,big emerging economy
2,china
3,india
4,steady march


In [17]:
df_all_words = df_all_words.merge(df_res, on='word', how='inner')
print(df_all_words.shape)

(2092379, 3)


In [20]:
df_all_words['word_frequency'] = df_all_words.groupby(['cl_number', 'word'])['word'].transform("count")
df_all_words['word_max_frequency'] = df_all_words.groupby(['cl_number'])['word_frequency'].transform("max")
df_all_words.iloc[::222220]

Unnamed: 0,word,cl_number,cl_size,word_frequency,word_max_frequency
0,rise,1486,20,795,795
222220,detroit,1837,23,85,349
444440,president trump,1171,38,276,479
666660,hostility,31108,4,20,20
888880,ink,15204,11,29,29
1111100,school bus,481,32,23,130
1333320,private new space company,219,17,1,6
1555540,pterodactyl,102039,3,1,1
1777760,needed,2311,42,812,2756
1999980,monitors,51690,3,40,40


In [36]:
pd.DataFrame(df_all_words.describe(percentiles=[0.01,0.09, 0.1,0.20,0.3,0.4,0.5,0.75,0.95,0.99])).T

Unnamed: 0,count,mean,std,min,1%,9%,10%,20%,30%,40%,50%,75%,95%,99%,max
cl_number,2092379.0,30647.862875,40231.480295,0.0,131.0,1141.0,1242.0,3112.0,5376.0,8209.0,13151.0,40117.0,129618.0,167044.0,179256.0
cl_size,2092379.0,18.311795,27.750696,1.0,1.0,1.0,2.0,3.0,5.0,7.0,10.0,23.0,60.0,114.0,513.0
word_frequency,2092379.0,840.679864,2361.341249,1.0,1.0,1.0,1.0,3.0,13.0,46.0,122.0,694.0,3290.0,10360.0,20130.0
word_max_frequency,2092379.0,963.430633,2440.771968,1.0,1.0,2.0,2.0,9.0,34.0,92.0,206.0,927.0,3477.0,20130.0,20130.0


In [29]:
df_cl_labeled = df_all_words[df_all_words['word_max_frequency'] == df_all_words['word_frequency']]
df_cl_labeled['cluster_label'] = df_cl_labeled['word']
df_cl_labeled = df_cl_labeled.groupby('cl_number')[['cluster_label','cl_size']].last().reset_index()
print(df_cl_labeled.shape)

(179257, 3)


In [37]:
df_cl_labeled.iloc[::15500]

Unnamed: 0,cl_number,cluster_label,cl_size
0,0,monthly fee,7
15500,15500,akerman,3
31000,31000,frustrated swiss bank,2
46500,46500,company cofounder tim brown,1
62000,62000,trappist1 systema,1
77500,77500,best institution,1
93000,93000,governments book,1
108500,108500,takashi takano,1
124000,124000,fixed chronic airflow obstruction,1
139500,139500,nonstopon conference panel,1


In [39]:
df_word_clusters = df_all_words[['word', 'cl_number']]
print(df_word_clusters.shape)

df_word_clusters = df_word_clusters.drop_duplicates()
df_word_clusters = df_word_clusters.merge(df_cl_labeled, on='cl_number', how='inner')

print(df_word_clusters.shape)
df_word_clusters.iloc[::222222].T

(2092379, 2)
(419327, 4)


Unnamed: 0,0,222222
word,rise,preliminary budget proposal
cl_number,1486,12841
cluster_label,rise,initial proposal
cl_size,20,7


In [42]:
df_word_clusters[df_word_clusters['cl_number'] == 12841]

Unnamed: 0,word,cl_number,cluster_label,cl_size
222222,preliminary budget proposal,12841,initial proposal,7
222223,initial proposal,12841,initial proposal,7
222224,launch service agreement proposal,12841,initial proposal,7
222225,contract proposal,12841,initial proposal,7
222226,initial deal,12841,initial proposal,7
222227,initial contract proposal,12841,initial proposal,7
222228,initial offer,12841,initial proposal,7


In [43]:
df_word_clusters.to_csv('./transition_files/word_cluster_label.csv', index=False)

***
# Replace text words with their cluster names (KeyWords)

In [44]:
df_train.columns

Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3'],
      dtype='object')

In [45]:
df_train['all_words'] = df_train['list_of_verb_lemmas'] + df_train['noun_phrases']

In [46]:
word_cluster_label_dict =dict(zip(df_word_clusters['word'], df_word_clusters['cluster_label']))

In [47]:
df_train['all_key_words'] = df_train['all_words'].apply(lambda Wlist: 
                                                        [word_cluster_label_dict[w] for w in  Wlist
                                                                                    if w in word_cluster_label_dict
                                                        ])
df_train[['all_key_words', 'all_words']].head()

Unnamed: 0,all_key_words,all_words
0,"[merge, led, looking, travel, granted, upgrade, travel, apply, post, streamline, scra, rise, emerging economy, china, india, steady march, globali...","[merging, led, wanting, travel, granted, upgrade, travel, apply, submit, streamline, scra, rise, big emerging economy, china, india, steady march,..."
1,"[rided, embracing, denied, gain, reduce, upgrade, desire, looking, aimed, shift, domiciled, joy, save, pay, puzzled, promise, asks, engage, sugges...","[rided, embracing, insists, gain, strengthen, improve, deterred, seeking, intends, shift, domiciled, rejoiced, saved, paid, outraged, promised, im..."
2,"[aised, stop, celebrate, trying, lift, forced, reverse, cut, help, understand, upgrade, strike, wish, save, spend, try, escape, slashed, encourage...","[aised, ended, celebrate, tried, lift, forced, reverse, cut, help, understand, upgrade, strike, wish, save, spend, try, escape, slashing, encourag..."
3,"[race, reserved, upgrade, told, control, demand, peaking, piling, based, got, moving, upgrade, increase, told, created, aimed, based, buy, cruise,...","[race, booked, improve, announced, control, demand, peaking, piling, based, got, moving, upgrade, increase, announced, establish, aimed, based, ad..."
4,"[tart, caught, prove, reflected, like, suggest, judged, betting, expected, upgrade, disrupt, getting, push, tighten, buy, priced, doubt, tighten, ...","[tart, caught, proved, reflected, like, suggest, judged, betting, expect, upgrade, weakens, having, pushed, tighten, buy, priced, doubt, tighten, ..."


In [48]:
ind = 10
print(df_train['all_words'].iloc[ind])
print("="*50)
print(df_train['all_key_words'].iloc[ind])

['eld', 'featuring', 'held', 'attending', 'companieswere', 'tied', 'holding', 'minted', 'set', 'gruelling', 'likened', 'speeddating', 'review', 'seen', 'looking', 'plan', 'grill', 'inviting', 'visit', 'meet', 'upgrad', 'american economic associations annual conference', 'gigantic teachin', 'lot', 'seminar', 'famous economist', 'threeday event', 'san francisco', 'employersboth university', 'hotel room', 'marathon interview session', 'freshly minted phds', 'ballroom', 'marriott', 'candidate', 'exhausted phd', 'recruiter', 'end', 'day', 'alan green', 'christopher de bodisco', 'stetson university', 'small private college', 'florida', 'candidate', 'interest', 'health', 'development', 'dozen candidate', 'promising one', 'campus', 'rest', 'faculty', 'inbox', 'daily dispatch', 'editors']
['eld', 'involving', 'held', 'going', 'companieswere', 'tied', 'held', 'minted', 'set', 'gruelling', 'comparison', 'speeddating', 'consider', 'seen', 'looking', 'plan', 'grill', 'invited', 'visit', 'meet', 'up

In [49]:
with open('./transition_files/df_train_for_LDA.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(df_train, f, pickle.HIGHEST_PROTOCOL)