In [20]:
import gzip
import gensim
import logging
import numpy as np
import pandas as pd
import string

from gensim.test.utils import get_tmpfile
from os.path import join

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
FEATURES_DIR = join("..", "feature_outputs")
DATA_DIR = join("..", "data_files")
PUNCTUATION_SET = set(string.punctuation)

In [3]:
CHAT_INPUT_FILE = join(DATA_DIR, "gnue_irc_chat_logs_preprocessed_words_only.txt.gz")

# with gzip.open (data_file, 'rb') as f:
#     for i,line in enumerate (f):
#         print(line)
#         break

In [4]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""

    logging.info("reading file {0}...this may take a while".format(input_file))

    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):

            if (i % 100000 == 0):
                logging.info("read {0} chat lines".format(i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess(line)


In [5]:
# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list(read_input(CHAT_INPUT_FILE))
logging.info("Done reading data file")

2019-04-16 11:21:52,838 : INFO : reading file ../data_files/gnue_irc_chat_logs_preprocessed_words_only.txt.gz...this may take a while
2019-04-16 11:21:52,846 : INFO : read 0 chat lines
2019-04-16 11:21:54,169 : INFO : read 100000 chat lines
2019-04-16 11:21:55,439 : INFO : read 200000 chat lines
2019-04-16 11:21:56,775 : INFO : read 300000 chat lines
2019-04-16 11:21:58,116 : INFO : read 400000 chat lines
2019-04-16 11:21:59,565 : INFO : read 500000 chat lines
2019-04-16 11:22:00,903 : INFO : read 600000 chat lines
2019-04-16 11:22:01,820 : INFO : Done reading data file


In [10]:
model = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=30)

2019-04-16 11:25:47,883 : INFO : collecting all words and their counts
2019-04-16 11:25:47,884 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-04-16 11:25:47,902 : INFO : PROGRESS: at sentence #10000, processed 64153 words, keeping 6873 word types
2019-04-16 11:25:47,921 : INFO : PROGRESS: at sentence #20000, processed 126677 words, keeping 10038 word types
2019-04-16 11:25:47,940 : INFO : PROGRESS: at sentence #30000, processed 188326 words, keeping 12592 word types
2019-04-16 11:25:47,959 : INFO : PROGRESS: at sentence #40000, processed 251292 words, keeping 15081 word types
2019-04-16 11:25:47,979 : INFO : PROGRESS: at sentence #50000, processed 313590 words, keeping 17212 word types
2019-04-16 11:25:47,999 : INFO : PROGRESS: at sentence #60000, processed 379341 words, keeping 18964 word types
2019-04-16 11:25:48,018 : INFO : PROGRESS: at sentence #70000, processed 440245 words, keeping 20799 word types
2019-04-16 11:25:48,042 : INFO : PROGRESS: at se

2019-04-16 11:25:49,445 : INFO : downsampling leaves estimated 3321113 word corpus (80.4% of prior 4131460)
2019-04-16 11:25:49,564 : INFO : estimated required memory for 37992 words and 150 dimensions: 64586400 bytes
2019-04-16 11:25:49,565 : INFO : resetting layer weights
2019-04-16 11:25:49,968 : INFO : training model with 10 workers on 37992 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2019-04-16 11:25:50,994 : INFO : EPOCH 1 - PROGRESS: at 35.01% examples, 1161060 words/s, in_qsize 19, out_qsize 0
2019-04-16 11:25:52,000 : INFO : EPOCH 1 - PROGRESS: at 70.84% examples, 1174394 words/s, in_qsize 20, out_qsize 0
2019-04-16 11:25:52,714 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-16 11:25:52,741 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-04-16 11:25:52,746 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-04-16 11:25:52,748 : INFO : worker thread finished; awaiting fini

2019-04-16 11:26:07,842 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-04-16 11:26:07,850 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-04-16 11:26:07,853 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-04-16 11:26:07,857 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-04-16 11:26:07,858 : INFO : EPOCH - 1 : training on 4169816 raw words (3322314 effective words) took 2.9s, 1132294 effective words/s
2019-04-16 11:26:08,876 : INFO : EPOCH 2 - PROGRESS: at 33.84% examples, 1117229 words/s, in_qsize 19, out_qsize 0
2019-04-16 11:26:09,884 : INFO : EPOCH 2 - PROGRESS: at 71.36% examples, 1178968 words/s, in_qsize 18, out_qsize 1
2019-04-16 11:26:10,625 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-16 11:26:10,638 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-04-16 11:26:10,643 : INFO : worker thread finished; awaiting finish of 7 more thr

2019-04-16 11:26:26,832 : INFO : EPOCH 8 - PROGRESS: at 52.13% examples, 859485 words/s, in_qsize 17, out_qsize 2
2019-04-16 11:26:27,845 : INFO : EPOCH 8 - PROGRESS: at 85.90% examples, 940141 words/s, in_qsize 19, out_qsize 0
2019-04-16 11:26:28,188 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-16 11:26:28,200 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-04-16 11:26:28,203 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-04-16 11:26:28,204 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-04-16 11:26:28,205 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-04-16 11:26:28,206 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-04-16 11:26:28,207 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-04-16 11:26:28,210 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-04-16 11:26:28,213 : INFO : worker thre

2019-04-16 11:26:45,706 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-04-16 11:26:45,715 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-04-16 11:26:45,716 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-04-16 11:26:45,718 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-04-16 11:26:45,719 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-04-16 11:26:45,723 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-04-16 11:26:45,730 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-04-16 11:26:45,731 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-04-16 11:26:45,732 : INFO : EPOCH - 14 : training on 4169816 raw words (3321389 effective words) took 2.9s, 1141116 effective words/s
2019-04-16 11:26:46,753 : INFO : EPOCH 15 - PROGRESS: at 26.91% examples, 886877 words/s, in_qsize 19, out_qsize 2
2019-04-16 11:26:4

2019-04-16 11:27:04,490 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-04-16 11:27:04,501 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-04-16 11:27:04,502 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-04-16 11:27:04,507 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-04-16 11:27:04,508 : INFO : EPOCH - 20 : training on 4169816 raw words (3320968 effective words) took 3.5s, 958364 effective words/s
2019-04-16 11:27:05,525 : INFO : EPOCH 21 - PROGRESS: at 25.29% examples, 834513 words/s, in_qsize 20, out_qsize 0
2019-04-16 11:27:06,530 : INFO : EPOCH 21 - PROGRESS: at 52.85% examples, 876697 words/s, in_qsize 20, out_qsize 0
2019-04-16 11:27:07,534 : INFO : EPOCH 21 - PROGRESS: at 76.89% examples, 848927 words/s, in_qsize 20, out_qsize 0
2019-04-16 11:27:08,184 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-16 11:27:08,192 : INFO : worker thread finished; await

2019-04-16 11:27:23,022 : INFO : EPOCH - 26 : training on 4169816 raw words (3320404 effective words) took 3.0s, 1120051 effective words/s
2019-04-16 11:27:24,038 : INFO : EPOCH 27 - PROGRESS: at 34.26% examples, 1135463 words/s, in_qsize 19, out_qsize 0
2019-04-16 11:27:25,072 : INFO : EPOCH 27 - PROGRESS: at 71.09% examples, 1160967 words/s, in_qsize 16, out_qsize 3
2019-04-16 11:27:25,807 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-16 11:27:25,823 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-04-16 11:27:25,827 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-04-16 11:27:25,834 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-04-16 11:27:25,837 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-04-16 11:27:25,839 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-04-16 11:27:25,843 : INFO : worker thread finished; awaiting finish of 3 more 

(99627591, 125094480)

In [11]:
w1 = "hi"
hi_vec = model.wv.word_vec(w1)
# print(hi_vec)
model.wv.most_similar(positive=w1)
print(model.wv.similar_by_word("yes"))
# print(svs[3])
model.wv.similar_by_vector(svs[430])
# svs[0]

2019-04-16 11:30:28,273 : INFO : precomputing L2-norms of word weight vectors


[('yep', 0.45996320247650146), ('yeah', 0.3965533375740051), ('ok', 0.3885487914085388), ('also', 0.361253023147583), ('correct', 0.34908440709114075), ('still', 0.3433060646057129), ('yip', 0.3312121331691742), ('not', 0.3236917555332184), ('mistaken', 0.30754122138023376), ('technically', 0.30000999569892883)]


NameError: name 'svs' is not defined

In [21]:
loaded_model =  gensim.models.Word2Vec.load(join(DATA_DIR, "gnue_irc_word2vec_model_30e.h5"))

2019-04-16 11:32:35,694 : INFO : loading Word2Vec object from ../data_files/gnue_irc_word2vec_model_30e.h5
2019-04-16 11:32:36,291 : INFO : loading wv recursively from ../data_files/gnue_irc_word2vec_model_30e.h5.wv.* with mmap=None
2019-04-16 11:32:36,292 : INFO : setting ignored attribute vectors_norm to None
2019-04-16 11:32:36,292 : INFO : loading vocabulary recursively from ../data_files/gnue_irc_word2vec_model_30e.h5.vocabulary.* with mmap=None
2019-04-16 11:32:36,293 : INFO : loading trainables recursively from ../data_files/gnue_irc_word2vec_model_30e.h5.trainables.* with mmap=None
2019-04-16 11:32:36,294 : INFO : setting ignored attribute cum_table to None
2019-04-16 11:32:36,295 : INFO : loaded ../data_files/gnue_irc_word2vec_model_30e.h5


In [23]:
derek_vector = loaded_model.wv.word_vec("derek")
derek_vector

array([-0.03135124,  1.2129529 , -0.79715836, -1.8308288 ,  0.4907622 ,
        1.1276971 , -0.32817286,  0.19805674,  0.02772181,  0.25818926,
        0.42525584, -1.1003402 , -0.22541124, -1.6306183 , -0.14814904,
        0.55835855,  0.46363   ,  0.38090482,  1.1816655 , -0.39435875,
        1.643432  ,  1.8381475 ,  0.16281776, -0.13909623, -0.22085871,
        1.1222386 ,  0.80230576,  0.8636888 ,  0.34996024, -0.93690115,
        1.1480107 ,  1.7027161 ,  0.13138667, -0.8414984 , -0.16195013,
       -1.0490205 ,  0.623895  ,  0.0026741 ,  1.0096171 ,  1.3378421 ,
       -0.06086442,  0.11471634, -0.17178415,  0.47939003,  0.15147865,
       -2.1511607 ,  0.7102236 , -0.20657034, -0.624123  ,  1.1227851 ,
       -0.36108878,  0.6622783 , -0.22130087, -1.4422998 , -0.7275693 ,
       -0.10384305, -0.13879171,  0.5134441 ,  0.00515444,  0.52122694,
        0.5395217 , -0.791034  , -0.5890757 , -0.82481664,  0.4689453 ,
        1.8096837 , -0.37319225, -0.8374059 , -0.28248352,  0.39

In [24]:
loaded_model.wv.word_vec("cvs").shape

(150,)

In [15]:
# np.multiply(model.wv.word_vec("derek"), 0.0015415098078320612)

In [25]:
# Save the model
# filename = join(DATA_DIR, "gnue_irc_word2vec_model_30e_2.h5")
# model.save(filename)

In [38]:
# Load Word TF-IDFS
def get_word_tf_idfs(word_tf_idfs_filename):
    word_tf_idfs = []
    with open(word_tf_idfs_filename) as word_tf_idfs_file:
        for values in word_tf_idfs_file:
            values = [float(value) for value in values.strip().split()]
            word_tf_idfs.append(values)
    return word_tf_idfs
        

In [39]:
word_tf_idfs_filename = join(DATA_DIR, "word_tf_idfs.txt")
word_tf_idfs = get_word_tf_idfs(word_tf_idfs_filename)[:4]

In [40]:
word_tf_idfs[1]

[0.0005048845017409812, 0.0006060496352296479]

Generating Sentence Vectors using the **Average of Word2Vec vectors with TF-IDF** method

"This is one of the best approach which I will recommend. Just take the word vectors and multiply it with their TF-IDF scores. Just take the average and it will represent your sentence vector"

https://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence


In [41]:
def strip_leading_and_trailing_punctuation(word):
    return word.strip(string.punctuation) 

In [42]:
def pre_process_sentence(sentence):
    if not sentence:
        print("No words here")
    if type(sentence) is not str:
        try:
            sentence = sentence.decode('utf-8')
        except Exception:
            raise ValueError("Input must be a String or ByteString")
        
    return sentence

In [43]:
def all_chars_in_word_are_punctuation(word):
    return all(char in PUNCTUATION_SET for char in word)

In [44]:
def get_words_in_sentence(sentence):
    words = []
    for word in sentence.split():
        if not all_chars_in_word_are_punctuation(word):
            word = strip_leading_and_trailing_punctuation(word)
            if ',' in word:
                comma_split_words = word.split(',')
                for word in comma_split_words:
                    if not all_chars_in_word_are_punctuation(word):
                        words.append(word)
            else:
                words.append(word)
    return words

In [45]:
def get_sentence_word_vectors(sentence, model):
    sentence = pre_process_sentence(sentence)
    vectors = []
    words = get_words_in_sentence(sentence)
    if not words:
        return [np.zeros((150,))]
    for word in words:
        try:
            vectors.append(model.wv.word_vec(word.lower()))
        except KeyError:
            vectors.append(np.zeros((150,)))
    
    return vectors
    

In [46]:
len([vec for vec in get_sentence_word_vectors("cvs diff", model)])

2

In [47]:
get_sentence_word_vectors("cvs ddghsdg", model)[1].shape

(150,)

In [48]:
def get_sentence_vector(sentence_word_tf_idfs, word_vectors):
    products = []
    for word_tf_idf, word_vector in zip(sentence_word_tf_idfs, word_vectors):
        # product = np.multiply(word_vector, word_tf_idf)
        products.append(word_vector)
    vector_sum = np.sum(products, axis=0)
    return vector_sum / len(products)
        

In [51]:
word_tf_idfs_2, word_vectors = word_tf_idfs[1], get_sentence_word_vectors("derek", loaded_model)
sentence_vector = get_sentence_vector(word_tf_idfs_2, word_vectors)
assert sentence_vector.shape == derek_vector.shape
sentence_vector.shape
len(word_vectors)
# word_tf_idfs[1]
sentence_vector

array([-0.03135124,  1.2129529 , -0.79715836, -1.8308288 ,  0.4907622 ,
        1.1276971 , -0.32817286,  0.19805674,  0.02772181,  0.25818926,
        0.42525584, -1.1003402 , -0.22541124, -1.6306183 , -0.14814904,
        0.55835855,  0.46363   ,  0.38090482,  1.1816655 , -0.39435875,
        1.643432  ,  1.8381475 ,  0.16281776, -0.13909623, -0.22085871,
        1.1222386 ,  0.80230576,  0.8636888 ,  0.34996024, -0.93690115,
        1.1480107 ,  1.7027161 ,  0.13138667, -0.8414984 , -0.16195013,
       -1.0490205 ,  0.623895  ,  0.0026741 ,  1.0096171 ,  1.3378421 ,
       -0.06086442,  0.11471634, -0.17178415,  0.47939003,  0.15147865,
       -2.1511607 ,  0.7102236 , -0.20657034, -0.624123  ,  1.1227851 ,
       -0.36108878,  0.6622783 , -0.22130087, -1.4422998 , -0.7275693 ,
       -0.10384305, -0.13879171,  0.5134441 ,  0.00515444,  0.52122694,
        0.5395217 , -0.791034  , -0.5890757 , -0.82481664,  0.4689453 ,
        1.8096837 , -0.37319225, -0.8374059 , -0.28248352,  0.39

In [102]:
def generate_sentence_vectors(
    chat_input_filename, 
    word_tf_idfs, 
    word_2_vec_model, 
    sentence_vectors_output_filename
):
    with gzip.open(chat_input_filename) as chat_input_file, open(
        sentence_vectors_output_filename, "w") as sentence_vectors_output_file:
        line_count = 0
        # Add header ro csv. Numbers from 1 to 150
        sentence_vectors_output_file.write("{}\n".format(",".join([str(num) for num in range(1,151)])))
        for sentence in chat_input_file:
            sentence_word_vectors = get_sentence_word_vectors(sentence, word_2_vec_model)
            sentence_vector = get_sentence_vector(word_tf_idfs[line_count], sentence_word_vectors)
            csv_vector = ','.join([str(num) for num in sentence_vector])
            sentence_vectors_output_file.write("{} \n".format(csv_vector))
            line_count += 1
    

In [103]:
word_tf_idfs_filename = join(DATA_DIR, "word_tf_idfs.txt")
word_tf_idfs = get_word_tf_idfs(word_tf_idfs_filename)
word_2_vec_model = model
sentence_vectors_output_filename = join(FEATURES_DIR, "sentence_vectors_30e.csv")

In [104]:
generate_sentence_vectors(
    CHAT_INPUT_FILE, 
    word_tf_idfs, 
    word_2_vec_model, 
    sentence_vectors_output_filename
)

In [105]:
x1 = np.arange(9.0).reshape((3, 3))
x2 = np.arange(9).reshape((3, 3)) * 2
# np.multiply(x1, x2)
np.sum([x2, x2], axis=0)

array([[ 0,  4,  8],
       [12, 16, 20],
       [24, 28, 32]])

In [106]:
sentence_vectors = pd.read_csv(sentence_vectors_output_filename)

In [107]:
svs = np.array(sentence_vectors.values)

In [108]:
svs[0]

array([-0.03135124,  1.2129529 , -0.79715836, -1.8308288 ,  0.4907622 ,
        1.1276971 , -0.32817286,  0.19805674,  0.02772181,  0.25818926,
        0.42525584, -1.1003402 , -0.22541124, -1.6306183 , -0.14814904,
        0.55835855,  0.46363   ,  0.38090482,  1.1816655 , -0.39435875,
        1.643432  ,  1.8381475 ,  0.16281776, -0.13909623, -0.22085871,
        1.1222386 ,  0.80230576,  0.8636888 ,  0.34996024, -0.93690115,
        1.1480107 ,  1.7027161 ,  0.13138667, -0.8414984 , -0.16195013,
       -1.0490205 ,  0.623895  ,  0.0026741 ,  1.0096171 ,  1.3378421 ,
       -0.06086441,  0.11471634, -0.17178415,  0.47939003,  0.15147865,
       -2.1511607 ,  0.7102236 , -0.20657034, -0.624123  ,  1.1227851 ,
       -0.36108878,  0.6622783 , -0.22130087, -1.4422998 , -0.7275693 ,
       -0.10384305, -0.13879171,  0.5134441 ,  0.00515444,  0.52122694,
        0.5395217 , -0.791034  , -0.5890757 , -0.82481664,  0.4689453 ,
        1.8096837 , -0.37319225, -0.8374059 , -0.28248352,  0.39