In [43]:
import gzip
import gensim
import logging
import numpy as np
import pandas as pd
import string

from gensim.test.utils import get_tmpfile
from os.path import join

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Directories are resolved one level above the current working directory.
# FEATURES_DIR receives generated feature files; DATA_DIR holds the input data
# and the saved model.
FEATURES_DIR = join("..", "feature_outputs")
DATA_DIR = join("..", "data_files")
# Set of ASCII punctuation characters, used for per-character membership tests.
PUNCTUATION_SET = set(string.punctuation)

In [3]:
# Gzip-compressed chat log that is read line by line as the training corpus.
CHAT_INPUT_FILE = join(DATA_DIR, "gnue_irc_chat_logs_preprocessed_words_only.txt.gz")

# Optional peek at the first raw line of the corpus (kept disabled):
# with gzip.open(CHAT_INPUT_FILE, 'rb') as f:
#     for i, line in enumerate(f):
#         print(line)
#         break

In [4]:
def read_input(input_file):
    """Lazily tokenize a gzip-compressed text file, one line at a time.

    This is a generator: nothing is opened or logged until iteration starts.

    Parameters
    ----------
    input_file : str
        Path to the gzip file; it is opened in binary mode.

    Yields
    ------
    list
        The tokens returned by ``gensim.utils.simple_preprocess`` for each
        line of the file, in file order.
    """
    progress_interval = 100000

    logging.info("reading file {0}...this may take a while".format(input_file))

    with gzip.open(input_file, 'rb') as compressed_lines:
        line_number = 0
        for raw_line in compressed_lines:
            # Emit a progress message for line 0 and every 100000th line after it.
            if not line_number % progress_interval:
                logging.info("read {0} chat lines".format(line_number))
            # Tokenize the raw chat line into a list of words.
            yield gensim.utils.simple_preprocess(raw_line)
            line_number += 1


In [5]:
# Read the tokenized chat lines into a list.
# Each chat line becomes a list of word tokens,
# so `documents` is a list of lists.
documents = list(read_input(CHAT_INPUT_FILE))
logging.info("Done reading data file")

2019-04-11 12:08:11,094 : INFO : reading file ../data_files/gnue_irc_chat_logs_preprocessed_words_only.txt.gz...this may take a while
2019-04-11 12:08:11,103 : INFO : read 0 chat lines
2019-04-11 12:08:12,412 : INFO : read 100000 chat lines
2019-04-11 12:08:13,706 : INFO : read 200000 chat lines
2019-04-11 12:08:15,010 : INFO : read 300000 chat lines
2019-04-11 12:08:16,266 : INFO : read 400000 chat lines
2019-04-11 12:08:17,534 : INFO : read 500000 chat lines
2019-04-11 12:08:18,821 : INFO : read 600000 chat lines
2019-04-11 12:08:19,521 : INFO : Done reading data file


In [6]:
# Build the vocabulary and train in two explicit steps. Handing `documents`
# directly to the Word2Vec constructor already runs a complete training pass
# with gensim's default number of epochs, so a subsequent train() call would
# train the model a second time on top of that instead of for exactly 30 epochs.
model = gensim.models.Word2Vec(size=150, window=10, min_count=2, workers=10)
model.build_vocab(documents)
# build_vocab() records the number of sentences in model.corpus_count.
# train() returns (effective word count, raw word count); being the last
# expression of the cell, that tuple is displayed.
model.train(documents, total_examples=model.corpus_count, epochs=30)

2019-04-11 12:08:44,112 : INFO : collecting all words and their counts
2019-04-11 12:08:44,113 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-04-11 12:08:44,129 : INFO : PROGRESS: at sentence #10000, processed 64153 words, keeping 6873 word types
2019-04-11 12:08:44,147 : INFO : PROGRESS: at sentence #20000, processed 126677 words, keeping 10038 word types
2019-04-11 12:08:44,164 : INFO : PROGRESS: at sentence #30000, processed 188326 words, keeping 12592 word types
2019-04-11 12:08:44,182 : INFO : PROGRESS: at sentence #40000, processed 251292 words, keeping 15081 word types
2019-04-11 12:08:44,201 : INFO : PROGRESS: at sentence #50000, processed 313590 words, keeping 17212 word types
2019-04-11 12:08:44,219 : INFO : PROGRESS: at sentence #60000, processed 379341 words, keeping 18964 word types
2019-04-11 12:08:44,237 : INFO : PROGRESS: at sentence #70000, processed 440245 words, keeping 20799 word types
2019-04-11 12:08:44,258 : INFO : PROGRESS: at se

2019-04-11 12:08:45,646 : INFO : downsampling leaves estimated 3321113 word corpus (80.4% of prior 4131460)
2019-04-11 12:08:45,769 : INFO : estimated required memory for 37992 words and 150 dimensions: 64586400 bytes
2019-04-11 12:08:45,770 : INFO : resetting layer weights
2019-04-11 12:08:46,213 : INFO : training model with 10 workers on 37992 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2019-04-11 12:08:47,236 : INFO : EPOCH 1 - PROGRESS: at 32.65% examples, 1071560 words/s, in_qsize 18, out_qsize 1
2019-04-11 12:08:48,256 : INFO : EPOCH 1 - PROGRESS: at 67.27% examples, 1102140 words/s, in_qsize 19, out_qsize 0
2019-04-11 12:08:49,128 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-11 12:08:49,136 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-04-11 12:08:49,142 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-04-11 12:08:49,147 : INFO : worker thread finished; awaiting fini

2019-04-11 12:09:04,428 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-04-11 12:09:04,430 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-04-11 12:09:04,431 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-04-11 12:09:04,443 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-04-11 12:09:04,444 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-04-11 12:09:04,444 : INFO : EPOCH - 1 : training on 4169816 raw words (3320951 effective words) took 3.2s, 1029696 effective words/s
2019-04-11 12:09:05,462 : INFO : EPOCH 2 - PROGRESS: at 34.06% examples, 1126816 words/s, in_qsize 19, out_qsize 0
2019-04-11 12:09:06,466 : INFO : EPOCH 2 - PROGRESS: at 68.01% examples, 1126768 words/s, in_qsize 19, out_qsize 0
2019-04-11 12:09:07,344 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-11 12:09:07,351 : INFO : worker thread finished; awaiting finish of 8 more thr

2019-04-11 12:09:21,997 : INFO : EPOCH - 7 : training on 4169816 raw words (3321753 effective words) took 2.9s, 1144298 effective words/s
2019-04-11 12:09:23,014 : INFO : EPOCH 8 - PROGRESS: at 31.21% examples, 1031905 words/s, in_qsize 19, out_qsize 0
2019-04-11 12:09:24,019 : INFO : EPOCH 8 - PROGRESS: at 67.25% examples, 1114373 words/s, in_qsize 19, out_qsize 0
2019-04-11 12:09:25,005 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-11 12:09:25,017 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-04-11 12:09:25,031 : INFO : EPOCH 8 - PROGRESS: at 98.51% examples, 1083469 words/s, in_qsize 5, out_qsize 5
2019-04-11 12:09:25,033 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-04-11 12:09:25,036 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-04-11 12:09:25,038 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-04-11 12:09:25,039 : INFO : worker thread finished; awaiti

2019-04-11 12:09:41,746 : INFO : EPOCH 14 - PROGRESS: at 34.51% examples, 1144702 words/s, in_qsize 19, out_qsize 0
2019-04-11 12:09:42,760 : INFO : EPOCH 14 - PROGRESS: at 69.00% examples, 1137576 words/s, in_qsize 20, out_qsize 0
2019-04-11 12:09:43,621 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-11 12:09:43,637 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-04-11 12:09:43,640 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-04-11 12:09:43,641 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-04-11 12:09:43,643 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-04-11 12:09:43,645 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-04-11 12:09:43,646 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-04-11 12:09:43,647 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-04-11 12:09:43,655 : INFO : worker 

2019-04-11 12:10:02,538 : INFO : EPOCH 20 - PROGRESS: at 64.77% examples, 1075060 words/s, in_qsize 19, out_qsize 0
2019-04-11 12:10:03,364 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-04-11 12:10:03,369 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-04-11 12:10:03,371 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-04-11 12:10:03,387 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-04-11 12:10:03,394 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-04-11 12:10:03,396 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-04-11 12:10:03,397 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-04-11 12:10:03,398 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-04-11 12:10:03,406 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-04-11 12:10:03,410 : INFO : worker thread finished; awaiting

2019-04-11 12:10:20,414 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-04-11 12:10:20,414 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-04-11 12:10:20,417 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-04-11 12:10:20,423 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-04-11 12:10:20,424 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-04-11 12:10:20,428 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-04-11 12:10:20,429 : INFO : EPOCH - 26 : training on 4169816 raw words (3321373 effective words) took 3.2s, 1032295 effective words/s
2019-04-11 12:10:21,445 : INFO : EPOCH 27 - PROGRESS: at 35.89% examples, 1190778 words/s, in_qsize 19, out_qsize 0
2019-04-11 12:10:22,448 : INFO : EPOCH 27 - PROGRESS: at 72.07% examples, 1194657 words/s, in_qsize 19, out_qsize 0
2019-04-11 12:10:23,195 : INFO : worker thread finished; awaiting finish of 9 more 

(99631447, 125094480)

In [133]:
# Explore nearest neighbours in the trained embedding space.
# NOTE(review): `svs` is only assigned in a later cell of this notebook, so this
# cell fails on a fresh "Restart & Run All" -- run the sentence-vector cells
# first, or move this cell below them.
w1 = "hi"
hi_vec = model.wv.word_vec(w1)
# print(hi_vec)
# Not the last expression of the cell, so this result is not displayed.
model.wv.most_similar(positive=w1)
print(model.wv.similar_by_word("yes"))
# print(svs[3])
# Words closest to the sentence vector stored at row 430 of `svs`.
model.wv.similar_by_vector(svs[430])
# svs[0]

[('yep', 0.4705391824245453), ('yeah', 0.40171346068382263), ('ok', 0.3854413628578186), ('also', 0.37402111291885376), ('still', 0.3661518096923828), ('not', 0.32625606656074524), ('correct', 0.32489296793937683), ('technically', 0.32411783933639526), ('deprecated', 0.31837207078933716), ('yah', 0.3139795958995819)]


[('what', 0.5706781148910522),
 ('anyone', 0.49943992495536804),
 ('anybody', 0.48487603664398193),
 ('reinhard', 0.4716574251651764),
 ('someone', 0.46277058124542236),
 ('somebody', 0.4620576500892639),
 ('anything', 0.43369153141975403),
 ('who', 0.42737510800361633),
 ('know', 0.4136282503604889),
 ('reportlabs', 0.4118925631046295)]

In [109]:
# Embedding for the word "derek"; a later cell compares its shape with that of
# a sentence vector.
derek_vector = model.wv.word_vec("derek")
derek_vector

array([-0.03135124,  1.2129529 , -0.79715836, -1.8308288 ,  0.4907622 ,
        1.1276971 , -0.32817286,  0.19805674,  0.02772181,  0.25818926,
        0.42525584, -1.1003402 , -0.22541124, -1.6306183 , -0.14814904,
        0.55835855,  0.46363   ,  0.38090482,  1.1816655 , -0.39435875,
        1.643432  ,  1.8381475 ,  0.16281776, -0.13909623, -0.22085871,
        1.1222386 ,  0.80230576,  0.8636888 ,  0.34996024, -0.93690115,
        1.1480107 ,  1.7027161 ,  0.13138667, -0.8414984 , -0.16195013,
       -1.0490205 ,  0.623895  ,  0.0026741 ,  1.0096171 ,  1.3378421 ,
       -0.06086442,  0.11471634, -0.17178415,  0.47939003,  0.15147865,
       -2.1511607 ,  0.7102236 , -0.20657034, -0.624123  ,  1.1227851 ,
       -0.36108878,  0.6622783 , -0.22130087, -1.4422998 , -0.7275693 ,
       -0.10384305, -0.13879171,  0.5134441 ,  0.00515444,  0.52122694,
        0.5395217 , -0.791034  , -0.5890757 , -0.82481664,  0.4689453 ,
        1.8096837 , -0.37319225, -0.8374059 , -0.28248352,  0.39

In [15]:
# Shape of a single word embedding.
model.wv.word_vec("cvs").shape

(150,)

In [18]:
# np.multiply(model.wv.word_vec("derek"), 0.0015415098078320612)

In [19]:
# Save the trained model under DATA_DIR.
# NOTE(review): the "30e" suffix presumably refers to the 30 training epochs -- confirm.
filename = join(DATA_DIR, "gnue_irc_word2vec_model_30e")
model.save(filename)

2019-04-11 12:18:15,103 : INFO : saving Word2Vec object under ../data_files/gnue_irc_word2vec_model_30e, separately None
2019-04-11 12:18:15,105 : INFO : not storing attribute vectors_norm
2019-04-11 12:18:15,106 : INFO : not storing attribute cum_table
2019-04-11 12:18:15,543 : INFO : saved ../data_files/gnue_irc_word2vec_model_30e


In [20]:
# Load Word TF-IDFS
def get_word_tf_idfs(word_tf_idfs_filename):
    """Load rows of TF-IDF scores from a whitespace-delimited text file.

    Parameters
    ----------
    word_tf_idfs_filename : str
        Path to a text file in which every line holds zero or more numbers
        separated by whitespace.

    Returns
    -------
    list of list of float
        One inner list per line of the file, in file order. A blank line
        yields an empty inner list.

    Raises
    ------
    ValueError
        If a token on any line cannot be converted with ``float``.
    """
    with open(word_tf_idfs_filename) as word_tf_idfs_file:
        return [
            [float(token) for token in row.split()]
            for row in word_tf_idfs_file
        ]
        

In [21]:
word_tf_idfs_filename = join(DATA_DIR, "word_tf_idfs.txt")
# Only the first four rows are kept here (presumably for quick experimentation);
# the complete file is loaded again in a later cell.
word_tf_idfs = get_word_tf_idfs(word_tf_idfs_filename)[:4]

In [22]:
# TF-IDF scores loaded from the second line of the file.
word_tf_idfs[1]

[0.0005048845017409812, 0.0006060496352296479]

Generating Sentence Vectors using the **Average of Word2Vec vectors with TF-IDF** method

"This is one of the best approach which I will recommend. Just take the word vectors and multiply it with their TF-IDF scores. Just take the average and it will represent your sentence vector"

https://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence


In [23]:
def strip_leading_and_trailing_punctuation(word):
    """Remove ASCII punctuation from both ends of ``word``.

    Punctuation inside the word is left untouched. A word made up solely of
    punctuation becomes the empty string.
    """
    punctuation_chars = string.punctuation
    return word.lstrip(punctuation_chars).rstrip(punctuation_chars)

In [24]:
def pre_process_sentence(sentence):
    """Return ``sentence`` as text, decoding byte strings as UTF-8.

    Parameters
    ----------
    sentence : str or bytes-like
        A text string (returned unchanged) or an object with a
        ``decode('utf-8')`` method, such as ``bytes``.

    Returns
    -------
    str
        The sentence as text.

    Raises
    ------
    ValueError
        If ``sentence`` is not a string and cannot be decoded as UTF-8. The
        underlying error is attached as ``__cause__``.
    """
    if not sentence:
        # Empty input is only reported; it is still returned/decoded below.
        print("No words here")
    # isinstance() instead of an exact type comparison, so that str subclasses
    # (for example numpy.str_) are accepted rather than being rejected for
    # lacking a .decode() method.
    if isinstance(sentence, str):
        return sentence
    try:
        return sentence.decode('utf-8')
    except Exception as error:
        # Chain the original error so the real reason (missing .decode, invalid
        # UTF-8 bytes, ...) stays visible in the traceback.
        raise ValueError("Input must be a String or ByteString") from error

In [25]:
def all_chars_in_word_are_punctuation(word):
    """Tell whether every character of ``word`` is in ``PUNCTUATION_SET``.

    The empty string has no offending character, so it also yields ``True``.
    """
    return PUNCTUATION_SET.issuperset(word)

In [26]:
def get_words_in_sentence(sentence):
    """Split ``sentence`` into words, discarding punctuation-only tokens.

    The sentence is split on whitespace. Tokens consisting only of punctuation
    are dropped; every other token has punctuation stripped from both ends.
    If a stripped token still contains commas it is split on them, and each
    piece that is not punctuation-only (or empty) is kept as it is, without
    further stripping.

    Returns
    -------
    list of str
        The extracted words, in sentence order.
    """
    collected = []
    for token in sentence.split():
        if all_chars_in_word_are_punctuation(token):
            continue
        trimmed = strip_leading_and_trailing_punctuation(token)
        if ',' not in trimmed:
            collected.append(trimmed)
            continue
        # Comma-joined words such as "a,b" contribute each non-trivial piece.
        collected.extend(
            piece
            for piece in trimmed.split(',')
            if not all_chars_in_word_are_punctuation(piece)
        )
    return collected

In [27]:
def get_sentence_word_vectors(sentence, model):
    """Look up one embedding per word of ``sentence``.

    Parameters
    ----------
    sentence : str or bytes-like
        The sentence; it is normalised with ``pre_process_sentence`` and split
        with ``get_words_in_sentence``.
    model
        Object exposing ``model.wv.word_vec(word)``, which raises ``KeyError``
        for unknown words.

    Returns
    -------
    list of numpy.ndarray
        One vector per word, looked up by the lower-cased word. Words missing
        from the model are represented by a zero vector. A sentence without
        any words yields a list holding a single zero vector.
    """
    sentence = pre_process_sentence(sentence)
    words = get_words_in_sentence(sentence)
    # Size the zero placeholders from the model itself instead of a hard-coded
    # 150, so they always match the real embeddings; 150 remains the fallback
    # when the attribute is not available.
    vector_size = getattr(model.wv, "vector_size", 150)
    if not words:
        return [np.zeros((vector_size,))]
    vectors = []
    for word in words:
        try:
            vectors.append(model.wv.word_vec(word.lower()))
        except KeyError:
            # Out-of-vocabulary word: keep its position with a zero vector.
            vectors.append(np.zeros((vector_size,)))

    return vectors
    

In [28]:
# A two-word sentence should produce two word vectors.
len([vec for vec in get_sentence_word_vectors("cvs diff", model)])

2

In [31]:
# The second token is gibberish (presumably not in the vocabulary, so it gets
# the zero-vector fallback); its vector must still have the embedding shape.
get_sentence_word_vectors("cvs ddghsdg", model)[1].shape

(150,)

In [99]:
def get_sentence_vector(sentence_word_tf_idfs, word_vectors, weight_by_tf_idf=False):
    """Average the word vectors of one sentence into a single sentence vector.

    TF-IDF scores and word vectors are paired positionally; if the two
    sequences differ in length, the surplus items of the longer one are
    ignored.

    Parameters
    ----------
    sentence_word_tf_idfs : sequence of float
        TF-IDF score of each word of the sentence.
    word_vectors : sequence of numpy.ndarray
        Embedding of each word of the sentence.
    weight_by_tf_idf : bool, optional
        When true, each word vector is multiplied by its TF-IDF score before
        averaging. Defaults to ``False``, which is the plain average this
        function computed before the parameter existed.

    Returns
    -------
    numpy.ndarray
        The (optionally weighted) sum of the paired word vectors divided by
        the number of pairs. If no pair can be formed (for instance an empty
        TF-IDF row), a zero vector shaped like the first word vector is
        returned instead of a NaN scalar.

    Raises
    ------
    ValueError
        If ``word_vectors`` is empty, so no output shape can be determined.
    """
    word_vectors = list(word_vectors)
    pairs = list(zip(sentence_word_tf_idfs, word_vectors))
    if not pairs:
        if not word_vectors:
            raise ValueError("Cannot build a sentence vector without any word vectors")
        # Nothing to average: avoid the 0/0 division, which would yield a NaN
        # scalar that callers cannot iterate over.
        return np.zeros_like(np.asarray(word_vectors[0], dtype=float))
    if weight_by_tf_idf:
        products = [
            np.multiply(word_vector, word_tf_idf)
            for word_tf_idf, word_vector in pairs
        ]
    else:
        products = [word_vector for _, word_vector in pairs]
    vector_sum = np.sum(products, axis=0)
    return vector_sum / len(products)
        

In [101]:
# Smoke test: build the sentence vector for the one-word sentence "yes", paired
# with the TF-IDF row at index 1.
word_tf_idfs_2, word_vectors = word_tf_idfs[1], get_sentence_word_vectors("yes", model)
sentence_vector = get_sentence_vector(word_tf_idfs_2, word_vectors)
# A sentence vector must have the same shape as a single word embedding.
assert sentence_vector.shape == derek_vector.shape
# Only the final expression of a cell is displayed; the three bare expressions
# below have no visible effect.
sentence_vector.shape
len(word_vectors)
word_tf_idfs[1]
sentence_vector

array([-0.17306045, -0.4119689 , -0.68852144,  0.3575111 , -1.0125728 ,
       -1.2197611 ,  1.3024852 , -0.08643285, -0.08627433, -0.45718646,
       -0.03006266,  0.1772719 ,  0.7941217 ,  1.2254379 , -0.90057796,
       -1.1216336 ,  0.4185411 , -0.8488634 ,  0.06914191, -2.252991  ,
       -0.66548985,  1.5550226 ,  0.15210392,  1.558742  , -0.04461487,
        1.2995632 , -0.18457621,  0.24615063, -0.22307488,  0.0930325 ,
       -0.3554208 , -0.51180583, -0.6259641 ,  0.9453741 ,  0.63189083,
       -0.31038192,  0.7324851 ,  0.76287144,  0.4901549 ,  1.3060192 ,
        1.7331214 , -1.2645956 ,  1.1456177 , -0.37746853, -0.3546912 ,
        0.55272067, -0.35021815, -0.7842747 , -0.02991787, -0.26625466,
       -0.84261954,  0.19607289,  0.16211705,  0.3306152 ,  0.15203331,
       -0.25734717, -0.9158747 ,  1.152726  , -0.19793746, -0.07784983,
        0.77156156,  0.37280014, -0.39148784, -0.52247673,  2.0089169 ,
       -0.05270575,  0.55640674, -0.5582477 , -0.31680283,  2.14

In [102]:
def generate_sentence_vectors(
    chat_input_filename,
    word_tf_idfs,
    word_2_vec_model,
    sentence_vectors_output_filename
):
    """Write one sentence vector per chat line to a CSV file.

    Parameters
    ----------
    chat_input_filename : str
        Path to the gzip-compressed chat log; each line is one sentence.
    word_tf_idfs : sequence
        TF-IDF rows, indexed by the zero-based line number of the chat log.
    word_2_vec_model
        Model handed to ``get_sentence_word_vectors`` for the word lookups.
    sentence_vectors_output_filename : str
        Destination CSV path; an existing file is overwritten.

    Raises
    ------
    IndexError
        If ``word_tf_idfs`` has fewer rows than the chat log has lines.

    Notes
    -----
    The first output line is a header of column numbers starting at 1. Each
    following line holds the comma-separated components of one sentence
    vector, followed by a space and a newline.
    """
    # Number the header columns after the model's own dimensionality instead
    # of a hard-coded 150, so the header keeps matching the data rows if the
    # model size changes; 150 remains the fallback when it cannot be read.
    vector_size = getattr(getattr(word_2_vec_model, "wv", None), "vector_size", 150)
    header = ",".join(str(column) for column in range(1, vector_size + 1))

    with gzip.open(chat_input_filename) as chat_input_file, open(
        sentence_vectors_output_filename, "w"
    ) as sentence_vectors_output_file:
        sentence_vectors_output_file.write("{}\n".format(header))
        for line_count, sentence in enumerate(chat_input_file):
            sentence_word_vectors = get_sentence_word_vectors(sentence, word_2_vec_model)
            sentence_vector = get_sentence_vector(word_tf_idfs[line_count], sentence_word_vectors)
            csv_vector = ','.join(str(num) for num in sentence_vector)
            # The space before the newline is kept so the output stays
            # byte-compatible with previously generated files.
            sentence_vectors_output_file.write("{} \n".format(csv_vector))
    

In [103]:
# Inputs for the full run. This reloads the complete TF-IDF file, replacing the
# four-row sample that an earlier cell stored under the same name.
word_tf_idfs_filename = join(DATA_DIR, "word_tf_idfs.txt")
word_tf_idfs = get_word_tf_idfs(word_tf_idfs_filename)
word_2_vec_model = model
sentence_vectors_output_filename = join(FEATURES_DIR, "sentence_vectors_30e.csv")

In [104]:
# Write one sentence vector per chat line to the CSV file.
generate_sentence_vectors(
    CHAT_INPUT_FILE, 
    word_tf_idfs, 
    word_2_vec_model, 
    sentence_vectors_output_filename
)

In [105]:
# Scratch check of np.sum(..., axis=0): summing a list of equally shaped arrays
# adds them element-wise, which is how get_sentence_vector combines vectors.
# x1 is only referenced by the commented-out multiply below.
x1 = np.arange(9.0).reshape((3, 3))
x2 = np.arange(9).reshape((3, 3)) * 2
# np.multiply(x1, x2)
np.sum([x2, x2], axis=0)

array([[ 0,  4,  8],
       [12, 16, 20],
       [24, 28, 32]])

In [106]:
# Load the generated CSV back; its first line is the header of column numbers.
sentence_vectors = pd.read_csv(sentence_vectors_output_filename)

In [107]:
# Sentence vectors as a plain NumPy array, one row per CSV data row.
svs = np.array(sentence_vectors.values)

In [108]:
# Sentence vector of the first chat line.
svs[0]

array([-0.03135124,  1.2129529 , -0.79715836, -1.8308288 ,  0.4907622 ,
        1.1276971 , -0.32817286,  0.19805674,  0.02772181,  0.25818926,
        0.42525584, -1.1003402 , -0.22541124, -1.6306183 , -0.14814904,
        0.55835855,  0.46363   ,  0.38090482,  1.1816655 , -0.39435875,
        1.643432  ,  1.8381475 ,  0.16281776, -0.13909623, -0.22085871,
        1.1222386 ,  0.80230576,  0.8636888 ,  0.34996024, -0.93690115,
        1.1480107 ,  1.7027161 ,  0.13138667, -0.8414984 , -0.16195013,
       -1.0490205 ,  0.623895  ,  0.0026741 ,  1.0096171 ,  1.3378421 ,
       -0.06086441,  0.11471634, -0.17178415,  0.47939003,  0.15147865,
       -2.1511607 ,  0.7102236 , -0.20657034, -0.624123  ,  1.1227851 ,
       -0.36108878,  0.6622783 , -0.22130087, -1.4422998 , -0.7275693 ,
       -0.10384305, -0.13879171,  0.5134441 ,  0.00515444,  0.52122694,
        0.5395217 , -0.791034  , -0.5890757 , -0.82481664,  0.4689453 ,
        1.8096837 , -0.37319225, -0.8374059 , -0.28248352,  0.39