In [1]:
import gzip
import gensim
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
data_file="gnue_irc_chat_logs.tsv.gz"

with gzip.open (data_file, 'rb') as f:
    for i,line in enumerate (f):
        print(line)
        break

b'****************DEREK**************************\n'


In [4]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""

    logging.info("reading file {0}...this may take a while".format(input_file))

    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):

            if (i % 100000 == 0):
                logging.info("read {0} chat lines".format(i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess(line)


In [5]:
# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list(read_input(data_file))
logging.info("Done reading data file")

2019-02-11 23:18:41,349 : INFO : reading file gnue_irc_chat_logs.tsv.gz...this may take a while
2019-02-11 23:18:41,351 : INFO : read 0 chat lines
2019-02-11 23:18:42,686 : INFO : read 100000 chat lines
2019-02-11 23:18:43,916 : INFO : read 200000 chat lines
2019-02-11 23:18:45,177 : INFO : read 300000 chat lines
2019-02-11 23:18:46,469 : INFO : read 400000 chat lines
2019-02-11 23:18:47,757 : INFO : read 500000 chat lines
2019-02-11 23:18:49,126 : INFO : read 600000 chat lines
2019-02-11 23:18:49,875 : INFO : Done reading data file


In [6]:
model = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

2019-02-11 23:19:28,743 : INFO : collecting all words and their counts
2019-02-11 23:19:28,744 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-02-11 23:19:28,761 : INFO : PROGRESS: at sentence #10000, processed 64099 words, keeping 6876 word types
2019-02-11 23:19:28,778 : INFO : PROGRESS: at sentence #20000, processed 126558 words, keeping 10038 word types
2019-02-11 23:19:28,797 : INFO : PROGRESS: at sentence #30000, processed 188127 words, keeping 12591 word types
2019-02-11 23:19:28,813 : INFO : PROGRESS: at sentence #40000, processed 251056 words, keeping 15080 word types
2019-02-11 23:19:28,829 : INFO : PROGRESS: at sentence #50000, processed 313288 words, keeping 17204 word types
2019-02-11 23:19:28,845 : INFO : PROGRESS: at sentence #60000, processed 379011 words, keeping 18956 word types
2019-02-11 23:19:28,860 : INFO : PROGRESS: at sentence #70000, processed 439894 words, keeping 20794 word types
2019-02-11 23:19:28,878 : INFO : PROGRESS: at se

2019-02-11 23:19:30,167 : INFO : downsampling leaves estimated 3318224 word corpus (80.4% of prior 4128595)
2019-02-11 23:19:30,283 : INFO : estimated required memory for 37964 words and 150 dimensions: 64538800 bytes
2019-02-11 23:19:30,284 : INFO : resetting layer weights
2019-02-11 23:19:30,747 : INFO : training model with 10 workers on 37964 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2019-02-11 23:19:31,770 : INFO : EPOCH 1 - PROGRESS: at 34.56% examples, 1139029 words/s, in_qsize 20, out_qsize 0
2019-02-11 23:19:32,778 : INFO : EPOCH 1 - PROGRESS: at 70.89% examples, 1169877 words/s, in_qsize 19, out_qsize 0
2019-02-11 23:19:33,447 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-11 23:19:33,448 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-11 23:19:33,450 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-11 23:19:33,456 : INFO : worker thread finished; awaiting fini

2019-02-11 23:19:47,789 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-11 23:19:47,790 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-11 23:19:47,790 : INFO : EPOCH - 1 : training on 4167004 raw words (3319202 effective words) took 2.9s, 1145231 effective words/s
2019-02-11 23:19:48,815 : INFO : EPOCH 2 - PROGRESS: at 37.15% examples, 1219032 words/s, in_qsize 18, out_qsize 1
2019-02-11 23:19:49,822 : INFO : EPOCH 2 - PROGRESS: at 76.19% examples, 1254132 words/s, in_qsize 19, out_qsize 0
2019-02-11 23:19:50,397 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-11 23:19:50,415 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-11 23:19:50,424 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-11 23:19:50,427 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-11 23:19:50,428 : INFO : worker thread finished; awaiting finish of 5 more thr

2019-02-11 23:20:06,244 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-11 23:20:06,249 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-11 23:20:06,250 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-11 23:20:06,259 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-11 23:20:06,268 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-11 23:20:06,273 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-11 23:20:06,274 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-11 23:20:06,277 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-11 23:20:06,287 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-11 23:20:06,291 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-11 23:20:06,291 : INFO : EPOCH - 8 : training on 4167004 raw words (3319675 effect

(33183723, 41670040)

In [25]:
w1 = "could"
model.wv.most_similar (positive=w1)

[('can', 0.8209862112998962),
 ('would', 0.7378469705581665),
 ('should', 0.6870460510253906),
 ('coudl', 0.6705307960510254),
 ('will', 0.6512588262557983),
 ('couldn', 0.5844181776046753),
 ('might', 0.5841648578643799),
 ('woudl', 0.5708403587341309),
 ('shoudl', 0.5620536208152771),
 ('must', 0.54425048828125)]