In [1]:
from gensim.models.fasttext import FastText
import pandas as pd
import numpy as np

In [2]:
# Load the pre-processed, tab-separated dataset into a DataFrame.
data_path = 'data/main-processed.tsv'
df = pd.read_csv(data_path, sep='\t')

In [3]:
def _tokenize_title(title):
    """Lower-case a raw title string and split it on whitespace.

    Anything that is not a ``str`` (a missing value, or a title that was already
    tokenised by an earlier run of this cell) is returned unchanged.
    """
    if isinstance(title, str):
        return title.lower().split()
    return title


# The chained `.str.lower().str.split()` form is not re-runnable: on a second run
# the column already holds lists, which the `.str` accessor turns into NaN, wiping
# every title. Mapping with an isinstance guard keeps this cell idempotent while
# producing the same tokens for string titles.
df['title'] = df['title'].map(_tokenize_title)

# Vectorised natural log of `tc`; same values as an element-wise apply of np.log.
df['log_tc'] = np.log(df['tc'])

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

import category_encoders as ce

In [5]:
# Columns removed from the feature matrix: the target itself (`log_tc`) plus
# `tc`, `base`, `stock` and `bonus` (presumably compensation components that
# would leak the target -- confirm against the dataset description).
excluded_columns = ['tc', 'base', 'stock', 'bonus', 'log_tc']
X = df.drop(columns=excluded_columns)
y = df['log_tc']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Create an untrained FastText model with 10-dimensional vectors; every other
# hyperparameter is left at the gensim default.
embedding_dim = 10
model = FastText(vector_size=embedding_dim)

In [7]:
# Training corpus: the `title` entries of the training split, with missing values removed.
train_titles = X_train['title']
corpus = train_titles[train_titles.notna()]

In [8]:
# Inspect the tokenised training titles (pandas abbreviates the repr of a long Series).
corpus

37618     [software, development, engineer]
8602                    [technical, expert]
26912    [sr, software, engineer, level, 1]
885        [full-stack, software, engineer]
27285                  [software, engineer]
                        ...                
45891                      [sde, new, gard]
52416     [software, development, engineer]
42613         [site, reliability, engineer]
43567                  [software, engineer]
2732                   [software, engineer]
Name: title, Length: 52195, dtype: object

In [9]:
# Build the model vocabulary from the tokenised training titles.
model.build_vocab(corpus_iterable=corpus.values)

In [10]:
# Train on the same titles used to build the vocabulary, reusing the example and
# word counts recorded on the model and the model's own epoch setting.
# The call's return value is shown as the cell output.
model.train(
    corpus_iterable=corpus,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
    epochs=model.epochs,
)

(135734, 614285)

In [11]:
# Print the model's summary string (vocab size, vector_size and alpha).
print(model)

FastText<vocab=277, vector_size=10, alpha=0.025>


In [13]:
# Take the trained word vectors from the model and list the ten vocabulary
# entries most similar to "junior", each with its similarity score.
wv = model.wv
wv.most_similar(positive='junior', topn=10)

[('4', 0.991966724395752),
 ('se', 0.9916954636573792),
 ('assistant', 0.9913736581802368),
 ('intern', 0.9875046014785767),
 ('advisor', 0.9866610765457153),
 ('intermediate', 0.9863461256027222),
 ('iv', 0.9861136674880981),
 ('algorithm', 0.9857125878334045),
 ('controls', 0.9856036901473999),
 ('expert', 0.984853208065033)]

In [12]:
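# Disabled alternative: load pretrained word2vec vectors through gensim's downloader.
# NOTE: enabling this rebinds `wv` to the pretrained vectors instead of `model.wv`.
# import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')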