## __TREC Pretrain__

Creates a pretrain word embedding model created using the CoNLL2003 train dataset.  
It expects no changes from the raw data for it to work.

In [1]:
from gensim.models import Word2Vec
import pandas as pd

In [2]:
TREC_dir = '../Datasets/TREC_dataset/'
train = f'{TREC_dir}/train.csv'

In [4]:
df = pd.read_csv(train)
df

Unnamed: 0,label-coarse,label-fine,text
0,0,0,How did serfdom develop in and then leave Russ...
1,1,1,What films featured the character Popeye Doyle ?
2,0,0,How can I find a list of celebrities ' real na...
3,1,2,What fowl grabs the spotlight after the Chines...
4,2,3,What is the full form of .com ?
...,...,...,...
5447,1,14,What 's the shape of a camel 's spine ?
5448,1,46,What type of currency is used in China ?
5449,4,41,What is the temperature today ?
5450,4,41,What is the temperature for cooking ?


In [7]:
text = df['text'].to_list()
sentences = []

# Convert each text into separate words in list format
for t in text: sentences.append(t.split(' '))

for s in sentences: print(s)

['How', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'Russia', '?']
['What', 'films', 'featured', 'the', 'character', 'Popeye', 'Doyle', '?']
['How', 'can', 'I', 'find', 'a', 'list', 'of', 'celebrities', "'", 'real', 'names', '?']
['What', 'fowl', 'grabs', 'the', 'spotlight', 'after', 'the', 'Chinese', 'Year', 'of', 'the', 'Monkey', '?']
['What', 'is', 'the', 'full', 'form', 'of', '.com', '?']
['What', 'contemptible', 'scoundrel', 'stole', 'the', 'cork', 'from', 'my', 'lunch', '?']
['What', 'team', 'did', 'baseball', "'s", 'St.', 'Louis', 'Browns', 'become', '?']
['What', 'is', 'the', 'oldest', 'profession', '?']
['What', 'are', 'liver', 'enzymes', '?']
['Name', 'the', 'scar-faced', 'bounty', 'hunter', 'of', 'The', 'Old', 'West', '.']
['When', 'was', 'Ozzy', 'Osbourne', 'born', '?']
['Why', 'do', 'heavier', 'objects', 'travel', 'downhill', 'faster', '?']
['Who', 'was', 'The', 'Pride', 'of', 'the', 'Yankees', '?']
['Who', 'killed', 'Gandhi', '?']
['What', 'is', 'considered

In [8]:
# Initialize and train the model (this will take some time)
# vector_size: The number of dimensions in which we wish to represent our word. Common values are 100, 150, 200, 300, etc.
# window: Maximum distance between the current and predicted word within a sentence.
# min_count: Minimum number of occurrences of a word in the training data to consider it during model training.
# workers: Number of CPU cores to use during training.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# To save the trained model:
path = '../Pretrained_Models/'
model.save(f'{path}TREC_pretrain.model')

### Testing model

Things to note: If the words do not exists in the dataset it will generate an error.

In [9]:
vector = model.wv['boycott']
vector

array([-0.0199549 ,  0.02290884,  0.00685931,  0.00977112,  0.00935809,
       -0.0231225 ,  0.02177819,  0.02213169, -0.00799088, -0.01954124,
       -0.00060998, -0.01348552, -0.01162539,  0.01078635, -0.00472831,
       -0.01364246,  0.0078787 ,  0.00357668,  0.00098665, -0.0374372 ,
        0.01557832,  0.00070406,  0.02093869, -0.00343472, -0.01132509,
        0.00085618, -0.01115488,  0.00772979, -0.0123713 ,  0.00446903,
        0.01757042,  0.00078673,  0.01080291, -0.0081037 , -0.00201669,
        0.01347971,  0.00284867,  0.0006405 , -0.00168023, -0.01565425,
        0.00537435, -0.02460099, -0.01838366,  0.01051917,  0.00349019,
        0.00735244, -0.01506824, -0.0002941 ,  0.0115536 ,  0.00293694,
        0.00080889, -0.02026046, -0.00523344,  0.00816796, -0.00552027,
        0.00577852,  0.0227413 ,  0.00717831, -0.01931227,  0.00154306,
       -0.00110475,  0.0038604 , -0.00151344,  0.00905871, -0.01492647,
        0.02499817,  0.00722764,  0.01796217, -0.015652  ,  0.01

In [10]:
similar_words = model.wv.most_similar('machine', topn=5)
similar_words

[('Bette', 0.8051574230194092),
 ('Delaware', 0.7980145215988159),
 ('tip', 0.7953810691833496),
 ('skin', 0.7950931787490845),
 ('spins', 0.7943560481071472)]