In [1]:
import subprocess
import numpy as np
import pandas as pd

In [2]:
word2vec = "./word2vec"
sizes = [50, 100, 150, 200]
windows = [3, 5, 8, 10]
iters = [5, 8, 10, 20]
cbow = [0, 1]

In [3]:
def train(program, input, sizes, windows, iters, cbow):
    sample = 1e-4
    threads = 16
    negative = 5
    count = len(sizes) * len(windows) * len(iters) * len(cbow)
    i = 0
    for size in sizes:
        for window in windows:
            for iter in iters:
                for c in cbow:
                    alg = "cbow" if c else "skip-gram"
                    output = f"output/size{size}_window{window}_iter{iter}_{alg}.txt"
                    params = {
                        "output": output,
                        "size": str(size),
                        "window": str(window),
                        "iter": str(iter),
                        "cbow": str(c),
                    }
                    print(f"{i+1}/{count}")
                    print("-" * 50)
                    print(f"Training with:")
                    print(f"Size:   {size}")
                    print(f"Window: {window}")
                    print(f"Iter:   {iter}")
                    print("CBOW" if c else "Skip-Gram")
                    print("-" * 50)
                    i += 1
                    w2v = subprocess.run(
                        [
                            program,
                            "-train",
                            input,
                            "-output",
                            params["output"],
                            "-size",
                            params["size"],
                            "-window",
                            params["window"],
                            "-iter",
                            params["iter"],
                            "-cbow",
                            params["cbow"],
                            "-sample",
                            str(sample),
                            "-threads",
                            str(threads),
                            "-negative",
                            str(negative),
                        ],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True,
                    )

In [4]:
download = subprocess.run(
    "./word2vec.sh",
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
    shell=True,
)

train(word2vec, "text8", sizes, windows, iters, cbow)

1/128
--------------------------------------------------
Training with:
Size:   50
Window: 3
Iter:   5
Skip-Gram
--------------------------------------------------
2/128
--------------------------------------------------
Training with:
Size:   50
Window: 3
Iter:   5
CBOW
--------------------------------------------------
3/128
--------------------------------------------------
Training with:
Size:   50
Window: 3
Iter:   8
Skip-Gram
--------------------------------------------------
4/128
--------------------------------------------------
Training with:
Size:   50
Window: 3
Iter:   8
CBOW
--------------------------------------------------
5/128
--------------------------------------------------
Training with:
Size:   50
Window: 3
Iter:   10
Skip-Gram
--------------------------------------------------
6/128
--------------------------------------------------
Training with:
Size:   50
Window: 3
Iter:   10
CBOW
--------------------------------------------------
7/128
-----------------------

In [5]:
output_path = "output"
files = subprocess.run(
    ["ls", output_path],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
files_list = files.stdout.split()
files_list

['size100_window10_iter10_cbow.txt',
 'size100_window10_iter10_skip-gram.txt',
 'size100_window10_iter20_cbow.txt',
 'size100_window10_iter20_skip-gram.txt',
 'size100_window10_iter5_cbow.txt',
 'size100_window10_iter5_skip-gram.txt',
 'size100_window10_iter8_cbow.txt',
 'size100_window10_iter8_skip-gram.txt',
 'size100_window3_iter10_cbow.txt',
 'size100_window3_iter10_skip-gram.txt',
 'size100_window3_iter20_cbow.txt',
 'size100_window3_iter20_skip-gram.txt',
 'size100_window3_iter5_cbow.txt',
 'size100_window3_iter5_skip-gram.txt',
 'size100_window3_iter8_cbow.txt',
 'size100_window3_iter8_skip-gram.txt',
 'size100_window5_iter10_cbow.txt',
 'size100_window5_iter10_skip-gram.txt',
 'size100_window5_iter20_cbow.txt',
 'size100_window5_iter20_skip-gram.txt',
 'size100_window5_iter5_cbow.txt',
 'size100_window5_iter5_skip-gram.txt',
 'size100_window5_iter8_cbow.txt',
 'size100_window5_iter8_skip-gram.txt',
 'size100_window8_iter10_cbow.txt',
 'size100_window8_iter10_skip-gram.txt',
 's

In [6]:
def cosine_distance(u, v):
    uv = np.dot(u, v)
    u_norm = np.linalg.norm(u)
    v_norm = np.linalg.norm(v)
    return 1 - (uv / (u_norm * v_norm))


def get_params(file_name):
    size, window, iter, cbow = file_name.split("_")
    size = int(size[4:])
    window = int(window[6:])
    iter = int(iter[4:])
    cbow = "CBOW" if cbow[:4] == "cbow" else "Skip-Gram"
    return size, window, iter, cbow


def get_data(file_path):
    words = {}
    with open(file_path, "r") as f:
        n_words, n_dim = f.readline().split()
        n_words, n_dim = int(n_words), int(n_dim)
        for i in range(n_words):
            line = f.readline().split()
            word = line[0]
            vec = np.array(line[1:], dtype=float)
            words[word] = vec
    return words


def rmse(lines, data):
    rmse = 0
    n_lines = 0
    for line in lines:
        if line.startswith(":"):
            continue
        line = line.lower()
        word1, word2, word3, word4 = line.split()
        if (
            word1 not in data
            or word2 not in data
            or word3 not in data
            or word4 not in data
        ):
            continue
        n_lines += 1
        X = data[word1] + data[word2] - data[word3]
        Y = data[word4]
        rmse += cosine_distance(X, Y) ** 2
    rmse = np.sqrt(rmse / n_lines)
    return rmse

In [7]:
df = pd.DataFrame(columns=["Size", "Window", "Iter", "CBOW or Skip-gram", "RMSE"])

with open("questions-words.txt", "r") as f:
    lines = f.readlines()
    for file_name in files_list:
        size, window, iter, cbow = get_params(file_name)
        print(f"Size: {size}\tWindow: {window}\tIters: {iter}\t{cbow}")
        data = get_data(output_path + "/" + file_name)
        rmse_value = rmse(lines, data)
        df.loc[len(df)] = [size, window, iter, cbow, rmse_value]
        print(f"RMSE for {file_name}: {rmse_value}")
        print("-" * 50)

df.to_csv("results.csv", index=False)
df

Size: 100	Window: 10	Iters: 10	CBOW
RMSE for size100_window10_iter10_cbow.txt: 1.0608072431572995
--------------------------------------------------
Size: 100	Window: 10	Iters: 10	Skip-Gram
RMSE for size100_window10_iter10_skip-gram.txt: 0.9337970723754218
--------------------------------------------------
Size: 100	Window: 10	Iters: 20	CBOW
RMSE for size100_window10_iter20_cbow.txt: 1.0578055783516787
--------------------------------------------------
Size: 100	Window: 10	Iters: 20	Skip-Gram
RMSE for size100_window10_iter20_skip-gram.txt: 0.9571166672319956
--------------------------------------------------
Size: 100	Window: 10	Iters: 5	CBOW
RMSE for size100_window10_iter5_cbow.txt: 1.0498213556300933
--------------------------------------------------
Size: 100	Window: 10	Iters: 5	Skip-Gram
RMSE for size100_window10_iter5_skip-gram.txt: 0.8962118623498666
--------------------------------------------------
Size: 100	Window: 10	Iters: 8	CBOW
RMSE for size100_window10_iter8_cbow.txt: 1.0

Unnamed: 0,Size,Window,Iter,CBOW or Skip-gram,RMSE
0,100,10,10,CBOW,1.060807
1,100,10,10,Skip-Gram,0.933797
2,100,10,20,CBOW,1.057806
3,100,10,20,Skip-Gram,0.957117
4,100,10,5,CBOW,1.049821
...,...,...,...,...,...
123,50,8,20,Skip-Gram,0.892788
124,50,8,5,CBOW,1.024465
125,50,8,5,Skip-Gram,0.834142
126,50,8,8,CBOW,1.031018
