In [35]:
import multiprocessing
import os, json, requests
import re
import nltk
from nltk import word_tokenize,sent_tokenize
import gensim.models.word2vec as w2v
import sklearn.manifold
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [5]:
# Fetch the NLTK data packages for this notebook. "punkt" backs the sentence
# tokenizer loaded from tokenizers/punkt/english.pickle in a later cell.
nltk.download("punkt")
# NOTE(review): no stopword list is used in the cells visible here — confirm
# this download is still needed.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/vegardbs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vegardbs/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [93]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into lowercase, letters-only words.

    Every character outside ``a-z``/``A-Z`` is replaced by a space first, so
    digits, punctuation and non-ASCII letters all act as word separators.

    Args:
        raw: The sentence text to process.

    Returns:
        A list of lowercase word strings; empty when ``raw`` contains no
        ASCII letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return [token.lower() for token in letters_only.split()]

In [62]:
# Principles of Geology, from Gutenberg
filepath = "http://www.gutenberg.org/files/33224/33224-0.txt"
# Sentence splitter loaded from the "punkt" NLTK data package.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

# Download the book as one string. The timeout stops the cell from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP error
# (404, 503, ...) into an exception instead of letting the body of an error
# page silently become the training corpus.
response = requests.get(filepath, timeout=60)
response.raise_for_status()
corpus_raw = response.text

In [63]:
# The raw text data from Gutenberg is a single string; show only its first
# 200 characters rather than dumping the whole book into the cell output.
corpus_raw[:200]

'\ufeffThe Project Gutenberg EBook of Principles of Geology, by Charles Lyell\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give i'

In [67]:
# Split the whole book into sentence strings with the Punkt tokenizer.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [70]:
# Turn every non-empty raw sentence into its list of lowercase words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]

In [108]:
# Count the words in the raw data: total tokens and distinct tokens.
# The text is lowercased and every character outside a-z/A-Z is replaced by a
# space before it is handed to nltk's word tokenizer.
clean_raw = re.sub("[^a-zA-Z]"," ",corpus_raw.lower())
words = word_tokenize(clean_raw)
# Despite its name, this holds the *number* of distinct words, not the words.
unique_words = len(set(words))

In [111]:
# Report the corpus size: total word tokens and number of distinct words.
print(f"The total number of words in raw data is {len(words)}, and there are {unique_words} unique words.")

The total number of words in raw data is 425782, and there are 16977 unique words.


In [20]:
# We use nltk's Punkt tokenizer to split the raw data into sentences.
# NOTE(review): this repeats the `raw_sentences = tokenizer.tokenize(...)` call
# from an earlier cell; one of the two looks redundant.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [53]:
# The first sentence found by the tokenizer, shown before any cleaning.
raw_sentences[0]

'\ufeffThe Project Gutenberg EBook of Principles of Geology, by Charles Lyell\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.'

In [95]:
# Here we replace all irregular notation by spaces: every character that is
# not an ASCII letter (digits, punctuation, line breaks, ...) becomes " ".
# A comprehension keeps its loop variable local, so the notebook-level
# `sentences` list is no longer overwritten with a single sentence string.
clean_sentences = [
    re.sub("[^a-zA-Z]", " ", raw_sentence)
    for raw_sentence in raw_sentences
]

In [100]:
# The first sentence after cleaning: non-letter characters are now spaces.
clean_sentences[0]

' The Project Gutenberg EBook of Principles of Geology  by Charles Lyell    This eBook is for the use of anyone anywhere at no cost and with  almost no restrictions whatsoever '

In [103]:
# Word tokenize sentences: lowercase each cleaned sentence, then split it on
# whitespace into a list of words.
sentences = [cleaned.lower().split() for cleaned in clean_sentences]

In [107]:
# Report how many tokenized sentences are available for training.
print(f"There are {len(sentences)} sentences in the training data.")

There are 18225 sentences in the data.


In [112]:
# Word-vector dimensionality (passed to Word2Vec as `size`).
num_features = 300

# Minimum word count threshold (passed to Word2Vec as `min_count`).
min_word_count = 3

# One worker thread per CPU reported by the OS (passed as `workers`).
num_workers = multiprocessing.cpu_count()

# Context window size (passed to Word2Vec as `window`).
context_size = 7

# Downsampling setting for frequent words (passed to Word2Vec as `sample`).
downsampling = 1e-3

# Random seed handed to Word2Vec.
# NOTE(review): gensim documents that fully reproducible runs also require a
# single worker thread — confirm whether exact reproducibility matters here.
seed = 1

In [120]:
# Configure the Word2Vec model from the hyperparameters defined earlier.
model2vec = w2v.Word2Vec(
    # Model shape and context
    size=num_features,
    window=context_size,
    # Vocabulary pruning and frequent-word downsampling
    min_count=min_word_count,
    sample=downsampling,
    # Training algorithm selector (sg=1)
    sg=1,
    # Execution settings
    workers=num_workers,
    seed=seed,
)
# Build the vocabulary from the tokenized sentences before training.
model2vec.build_vocab(sentences)

In [121]:
# Make sure the output directory exists. The original checked for
# "trained/sample" but created "trained/sample/.w2v", so the latter was never
# created when the former already existed. exist_ok=True makes the call
# idempotent and checks the same path that is created.
# NOTE(review): this creates a *directory* literally named ".w2v" — confirm
# that is intended rather than a model file such as "trained/sample.w2v".
model_dir = os.path.join("trained", "sample", ".w2v")
os.makedirs(model_dir, exist_ok=True)

In [122]:
# Train the model on the tokenized sentences; total_examples is taken from the
# model's own corpus_count (presumably recorded by build_vocab — verify).
# NOTE(review): epochs=100 is hard-coded here instead of being listed with the
# other hyperparameters.
model2vec.train(sentences, total_examples=model2vec.corpus_count, epochs = 100)

(29989093, 42563300)

In [125]:
# Words whose vectors are closest to "granite", with their similarity scores.
# Query through the model's keyed vectors (`.wv`): calling most_similar
# directly on the Word2Vec model is deprecated in gensim and emits a warning.
model2vec.wv.most_similar("granite")

  """Entry point for launching an IPython kernel.


[('gneiss', 0.4801788926124573),
 ('schists', 0.3700888156890869),
 ('weigh', 0.36876243352890015),
 ('porphyry', 0.34800484776496887),
 ('trap', 0.34250277280807495),
 ('glen', 0.3391730487346649),
 ('hartz', 0.33399856090545654),
 ('mica', 0.3324624300003052),
 ('rock', 0.33182093501091003),
 ('schist', 0.3296322822570801)]