In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
%cd /content/drive/MyDrive/word2vec/trunk

/content/drive/MyDrive/word2vec/trunk


In [3]:
%%shell

mkdir data
cd data
wget https://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
gzip -d train.txt.gz

--2023-03-12 21:00:07--  https://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
Resolving www.cnts.ua.ac.be (www.cnts.ua.ac.be)... 146.175.13.81
Connecting to www.cnts.ua.ac.be (www.cnts.ua.ac.be)|146.175.13.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 611540 (597K) [application/x-gzip]
Saving to: ‘train.txt.gz’


2023-03-12 21:00:07 (12.1 MB/s) - ‘train.txt.gz’ saved [611540/611540]





In [4]:
%%shell
cd /content/drive/MyDrive/word2vec/trunk/data
cut -d ' ' -f 1,2 train.txt > train_processed.txt



In [8]:
import re

# Load the dataset
with open('/content/drive/MyDrive/word2vec/trunk/data/train_processed.txt', 'r') as file:
    dataset = file.readlines()

# Remove empty lines and punctuation marks
cleaned_dataset = []
for line in dataset:
    if line.strip() and not re.match('[.,!?\-()$]', line):
        cleaned_dataset.append(line)

# Write the cleaned dataset to a file
with open('dataset.txt', 'w') as file:
    file.writelines(cleaned_dataset)

In [12]:
import random

# Open the original dataset file
with open('/content/drive/MyDrive/word2vec/trunk/data/dataset.txt', 'r') as f:
    data = f.read()

# Split the data by newline characters to get individual data points
data_points = data.strip().split('\n')

# Shuffle the data points randomly
random.shuffle(data_points)

# Calculate the index to split the data points
split_idx = int(len(data_points) * 0.8)

# Divide the data points into train and dev sets
train_data = data_points[:split_idx]
dev_data = data_points[split_idx:]

# Save the train and dev sets to separate files
with open('/content/drive/MyDrive/word2vec/trunk/data/train_set.txt', 'w') as f:
    f.write('\n'.join(train_data))

with open('/content/drive/MyDrive/word2vec/trunk/data/dev_set.txt', 'w') as f:
    f.write('\n'.join(dev_data))

In [13]:
import string

# Open input and output files
with open('/content/drive/MyDrive/word2vec/trunk/data/test_data.txt', 'r') as infile, open('/content/drive/MyDrive/word2vec/trunk/data/processed_test_data.txt', 'w') as outfile:
    # Remove special characters and empty lines from each line in input file
    for line in infile:
        # Remove special characters from line
        line = line.translate(str.maketrans('', '', string.punctuation))
        # Remove leading/trailing white space
        line = line.strip()
        # Skip empty lines
        if not line:
            continue
        # Write processed line to output file
        outfile.write(line + '\n')

In [20]:
%%shell

make

gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
gcc distance.c -o distance -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
[01m[Kdistance.c:[m[K In function ‘[01m[Kmain[m[K’:
   31 |   char [01;35m[Kch[m[K;
      |        [01;35m[K^~[m[K
gcc word-analogy.c -o word-analogy -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
[01m[Kword-analogy.c:[m[K In function ‘[01m[Kmain[m[K’:
   31 |   char [01;35m[Kch[m[K;
      |        [01;35m[K^~[m[K
gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
[01m[Kcompute-accuracy.c:[m[K In function ‘[01m[Kmain[m[K’:
   29 |   char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], [01;35m[Kch[m[K;
      |                                                                                                          



In [21]:
%%shell

./demo-train-big-model-v1.sh

Starting training using file data.txt
100K200K300K400KVocab size: 1952
Words in train file: 433655
Alpha: 0.000128  Progress: 99.99%  Words/thread/sec: 3306.66k  



In [22]:
import gensim
import nltk
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

nltk.download('popular')

# Load the word embeddings from the vectors.bin file
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('vectors.bin', binary=True)

# Load the train_set and dev_set
train_set = []
with open('/content/drive/MyDrive/data/train_set.txt', 'r') as f:
    for line in f:
        word, pos = line.strip().split()
        train_set.append((word, pos))

dev_set = []
with open('/content/drive/MyDrive/data/dev_set.txt', 'r') as f:
    for line in f:
        word, pos = line.strip().split()
        dev_set.append((word, pos))

# Define a function to extract features from a sentence
def extract_features(sentence, word_vectors):
    features = []
    for i in range(len(sentence)):
        word = sentence[i][0]
        if word in word_vectors:
            features.append(word_vectors[word])
        else:
            features.append(np.zeros(word_vectors.vector_size))
    return np.concatenate(features)

# Extract features and labels from the train_set
train_X = []
train_y = []
for i in range(len(train_set)):
    sentence = train_set[max(i-2, 0):i+3]  # Use a 5-word window
    features = extract_features(sentence, word_vectors)
    label = train_set[i][1]
    train_X.append(features)
    train_y.append(label)

# Find the maximum length of the arrays in train_X
max_len = max(len(x) for x in train_X)

# Pad the arrays in train_X with zeros to match the maximum length
for i in range(len(train_X)):
    train_X[i] = np.pad(train_X[i], (0, max_len - len(train_X[i])), mode='constant')

# Train a logistic regression model
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(train_X, train_y)

# Extract features and labels from the dev_set
dev_X = []
dev_y_true = []
for i in range(len(dev_set)):
    sentence = dev_set[max(i-2, 0):i+3]  # Use a 5-word window
    features = extract_features(sentence, word_vectors)
    label = dev_set[i][1]
    dev_X.append(features)
    dev_y_true.append(label)

# Find the maximum length of the arrays in dev_X
max_len = max(len(x) for x in dev_X)

# Pad the arrays in dev_X with zeros to match the maximum length
for i in range(len(dev_X)):
    dev_X[i] = np.pad(dev_X[i], (0, max_len - len(dev_X[i])), mode='constant')

# Use the trained model to predict labels for the dev_set
dev_y_pred = clf.predict(dev_X)

# Calculate accuracy on the dev_set
accuracy = accuracy_score(dev_y_true, dev_y_pred)
print('Accuracy:', accuracy)


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

Accuracy: 0.3982625686523025


In [25]:
# Load the test_set
test_set = []
with open('/content/drive/MyDrive/data/processed_test_data.txt', 'r') as f:
    for line in f:
        test_set.append(line.strip())

# Extract features from the test_set
test_X = []
for i in range(len(test_set)):
    sentence = test_set[max(i-2, 0):i+3]  # Use a 5-word window
    features = extract_features(sentence, word_vectors)
    test_X.append(features)

# Find the maximum length of the arrays in test_X
max_len = max(len(x) for x in test_X)

# Pad the arrays in test_X with zeros to match the maximum length
for i in range(len(test_X)):
    test_X[i] = np.pad(test_X[i], (0, max_len - len(test_X[i])), mode='constant')

# Use the trained model to predict labels for the test_set
test_y_pred = clf.predict(test_X)

# Save the predicted output to a new file
with open('/content/drive/MyDrive/data/test_output.txt', 'w') as f:
    for i in range(len(test_set)):
        f.write(test_set[i] + ' ' + test_y_pred[i] + '\n')


In [26]:
cat /content/drive/MyDrive/data/test_output.txt

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
four CC
and DT
cbs IN
had IN
one NNP
cbs IN
held IN
the WP
previous IN
record IN
for CC
consecutive IN
no WP
1 NNP
victories NNP
46 NNP
weeks NN
during IN
the ``
196263 NNP
season ``
procter IN
gamble NN
co IN
cincinnati IN
expanding WP
its NN
presence IN
in NN
the ``
food CC
service ``
market IN
said ``
it NN
acquired DT
maryland IN
club IN
foods CC
a DT
coffee IN
supplier ``
from CC
an DT
investor NN
group NN
led IN
by IN
f CC
philip IN
handy IN
of NNP
winter NN
park IN
fla CC
terms WP
were NN
nt ``
disclosed IN
houstonbased IN
maryland IN
club IN
foods CC
which NN
had IN
sales CC
of NNP
about DT
200 NNP
million IN
last IN
year IN
sells ``
coffee IN
under IN
the WP
maryland IN
club IN
and DT
butternut IN
brands IN
to ``
restaurants IN
hotels IN
offices NNP
and DT
airlines DT
the ``
acquisition DT
gives NN
us IN
additional DT
production IN
capacity IN
for CC
the WP
food CC
service WP
coffee IN
business IN
and DT
a DT
str