In [1]:
import gzip
import gensim
import logging
import numpy as np
import string

from gensim.test.utils import get_tmpfile
from os.path import join

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Directory that generated feature files (the sentence-vector CSV) are written to.
FEATURES_DIR = join("..", "feature_outputs")
# Directory holding the input data files and the saved Word2Vec model.
DATA_DIR = join("..", "data_files")
# All ASCII punctuation characters as a set, for per-character membership tests.
PUNCTUATION_SET = set(string.punctuation)

In [3]:
# Gzip-compressed chat log that is read line by line further down.
CHAT_INPUT_FILE = join(DATA_DIR, "gnue_irc_chat_logs_preprocessed_words_only.txt.gz")

# Uncomment to peek at the first raw line of the input file:
# with gzip.open(CHAT_INPUT_FILE, 'rb') as f:
#     for i, line in enumerate(f):
#         print(line)
#         break

In [4]:
def read_input(input_file):
    """Lazily tokenize every line of a gzip-compressed text file.

    This is a generator: it yields one list of tokens per line of
    ``input_file``, produced by ``gensim.utils.simple_preprocess``.
    A progress message is logged every 100000 lines.
    """
    logging.info("reading file {0}...this may take a while".format(input_file))

    with gzip.open(input_file, 'rb') as chat_file:
        line_number = 0
        for raw_line in chat_file:
            if line_number % 100000 == 0:
                logging.info("read {0} chat lines".format(line_number))
            # Tokenize the raw (bytes) line into a list of words.
            yield gensim.utils.simple_preprocess(raw_line)
            line_number += 1


In [5]:
# Read the tokenized chat lines into a list.
# Each chat line becomes a list of words,
# so `documents` is a list of lists.
documents = list(read_input(CHAT_INPUT_FILE))
logging.info("Done reading data file")

2019-03-14 08:45:55,934 : INFO : reading file ../data_files/gnue_irc_chat_logs_preprocessed_words_only.txt.gz...this may take a while
2019-03-14 08:45:55,937 : INFO : read 0 chat lines
2019-03-14 08:45:57,245 : INFO : read 100000 chat lines
2019-03-14 08:45:58,532 : INFO : read 200000 chat lines
2019-03-14 08:45:59,895 : INFO : read 300000 chat lines
2019-03-14 08:46:01,285 : INFO : read 400000 chat lines
2019-03-14 08:46:02,603 : INFO : read 500000 chat lines
2019-03-14 08:46:03,924 : INFO : read 600000 chat lines
2019-03-14 08:46:04,616 : INFO : Done reading data file


In [6]:
# Create the model WITHOUT a corpus first. Passing `documents` to the
# constructor would already build the vocabulary and train the model (with the
# default number of epochs), and the explicit train() call below would then
# train the same model a second time.
model = gensim.models.Word2Vec(size=150, window=10, min_count=2, workers=10)
model.build_vocab(documents)
# Single 10-epoch training pass over the corpus; the returned tuple of word
# counts is displayed as the cell output.
model.train(documents, total_examples=model.corpus_count, epochs=10)

2019-03-14 08:46:06,502 : INFO : collecting all words and their counts
2019-03-14 08:46:06,503 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-03-14 08:46:06,522 : INFO : PROGRESS: at sentence #10000, processed 64153 words, keeping 6873 word types
2019-03-14 08:46:06,540 : INFO : PROGRESS: at sentence #20000, processed 126677 words, keeping 10038 word types
2019-03-14 08:46:06,558 : INFO : PROGRESS: at sentence #30000, processed 188326 words, keeping 12592 word types
2019-03-14 08:46:06,578 : INFO : PROGRESS: at sentence #40000, processed 251292 words, keeping 15081 word types
2019-03-14 08:46:06,596 : INFO : PROGRESS: at sentence #50000, processed 313590 words, keeping 17212 word types
2019-03-14 08:46:06,614 : INFO : PROGRESS: at sentence #60000, processed 379341 words, keeping 18964 word types
2019-03-14 08:46:06,637 : INFO : PROGRESS: at sentence #70000, processed 440245 words, keeping 20799 word types
2019-03-14 08:46:06,658 : INFO : PROGRESS: at se

2019-03-14 08:46:08,094 : INFO : downsampling leaves estimated 3321113 word corpus (80.4% of prior 4131460)
2019-03-14 08:46:08,231 : INFO : estimated required memory for 37992 words and 150 dimensions: 64586400 bytes
2019-03-14 08:46:08,232 : INFO : resetting layer weights
2019-03-14 08:46:08,653 : INFO : training model with 10 workers on 37992 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2019-03-14 08:46:09,680 : INFO : EPOCH 1 - PROGRESS: at 35.01% examples, 1158912 words/s, in_qsize 19, out_qsize 0
2019-03-14 08:46:10,685 : INFO : EPOCH 1 - PROGRESS: at 69.92% examples, 1158041 words/s, in_qsize 17, out_qsize 2
2019-03-14 08:46:11,565 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-03-14 08:46:11,582 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-03-14 08:46:11,588 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-03-14 08:46:11,590 : INFO : worker thread finished; awaiting fini

2019-03-14 08:46:26,025 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-14 08:46:26,032 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-03-14 08:46:26,033 : INFO : EPOCH - 1 : training on 4169816 raw words (3321118 effective words) took 2.6s, 1260442 effective words/s
2019-03-14 08:46:27,061 : INFO : EPOCH 2 - PROGRESS: at 38.07% examples, 1259058 words/s, in_qsize 20, out_qsize 0
2019-03-14 08:46:28,072 : INFO : EPOCH 2 - PROGRESS: at 76.14% examples, 1255866 words/s, in_qsize 19, out_qsize 0
2019-03-14 08:46:28,614 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-03-14 08:46:28,629 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-03-14 08:46:28,633 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-03-14 08:46:28,637 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-03-14 08:46:28,639 : INFO : worker thread finished; awaiting finish of 5 more thr

2019-03-14 08:46:44,712 : INFO : EPOCH 8 - PROGRESS: at 63.85% examples, 1049849 words/s, in_qsize 20, out_qsize 0
2019-03-14 08:46:45,649 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-03-14 08:46:45,655 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-03-14 08:46:45,668 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-03-14 08:46:45,677 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-03-14 08:46:45,679 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-03-14 08:46:45,680 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-03-14 08:46:45,681 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-03-14 08:46:45,683 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-03-14 08:46:45,694 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-14 08:46:45,697 : INFO : worker thread finished; awaiting 

(33209822, 41698160)

In [12]:
# Sanity check of the trained embeddings: list the words closest to a common greeting.
w1 = "hi"
model.wv.most_similar(positive=w1)

[('hello', 0.7157601714134216),
 ('howdy', 0.6666513085365295),
 ('mornin', 0.6130080223083496),
 ('wb', 0.607377290725708),
 ('hallo', 0.6039972305297852),
 ('hiya', 0.5790766477584839),
 ('greetings', 0.5756561160087585),
 ('heya', 0.5622656941413879),
 ('ello', 0.5578193664550781),
 ('heyas', 0.5369154810905457)]

In [189]:
# Vector of a single word; kept in a variable because a later cell compares
# the sentence-vector shape against it.
derek_vector = model.wv.word_vec("derek")
derek_vector

array([ 0.5279345 , -0.13940096,  0.14848003, -0.20386428,  0.30318138,
       -0.7518825 ,  0.32862315, -0.0710255 , -0.30316716,  0.42825776,
        0.41443545,  0.8094691 ,  1.0279076 ,  0.09358946,  0.5970718 ,
       -0.51281583,  0.1515062 , -0.800897  , -0.4606536 , -0.28351092,
       -0.2884971 , -0.8592874 , -0.17111757,  1.5243422 ,  0.13311459,
       -1.017981  ,  0.308114  , -0.6696086 , -0.14961235, -0.5324405 ,
        0.39775124,  0.48026583, -0.7153367 ,  0.3510015 ,  0.06037151,
        0.39693207, -0.25557142,  0.14970575, -0.05517276, -0.16058485,
       -0.54178226,  0.41233024, -0.31504527, -1.1565965 , -0.88247764,
        0.45110017, -0.336391  , -0.9309303 , -0.14017169,  0.94689447,
       -0.30711138,  0.8684238 , -0.40074196, -0.12314773,  0.09314933,
        0.67560005,  0.46069032, -0.28590992, -0.508878  , -0.08849116,
        0.02737017,  0.09804839,  0.54818404, -0.2637814 , -0.6895593 ,
       -1.0969024 , -0.08252746, -0.8242667 ,  1.0529389 ,  0.32

In [17]:
model.wv.word_vec("cvs").shape

(150,)

In [192]:
# Scratch check: scale a word vector by a scalar with np.multiply.
# NOTE(review): the constant is presumably a sample TF-IDF weight - confirm.
np.multiply(model.wv.word_vec("derek"), 0.0015415098078320612)

array([ 1.75430268e-05,  3.20636493e-04,  1.42087520e-03,  1.68726873e-03,
        1.34648243e-03, -6.37582038e-04, -6.66913227e-04, -5.27164491e-04,
        3.07432172e-04,  6.05612528e-04, -2.35400908e-03,  1.04977039e-03,
       -6.52331917e-04,  1.87362335e-03, -8.56540806e-04,  1.97623111e-03,
       -2.30358331e-03,  1.54692063e-03, -2.52765487e-03, -1.96043798e-03,
       -2.12545809e-03, -8.89024464e-04, -1.21306453e-03, -7.72741972e-04,
       -1.16806792e-03, -1.31328934e-05, -6.17633850e-05, -5.72148303e-04,
       -2.74863269e-04, -3.50224669e-03,  5.04993717e-04, -9.83446487e-04,
        8.95250356e-04, -9.80200246e-04, -1.67520915e-03,  1.39368454e-03,
       -1.92016573e-03, -1.82855176e-03, -9.05466950e-05,  3.05284688e-04,
        7.33390800e-04,  3.51534982e-04, -1.20753444e-04, -9.90741610e-05,
       -6.91084075e-04, -2.10401035e-04,  9.06368135e-04,  6.84935600e-04,
       -1.28763320e-03, -2.87177623e-04, -1.25641900e-03, -1.66943087e-03,
       -9.07944632e-04,  

In [193]:
# Persist the trained Word2Vec model under DATA_DIR.
filename = join(DATA_DIR, "gnue_irc_word2vec_model")
model.save(filename)

2019-02-19 14:54:34,437 : INFO : saving Word2Vec object under ../data_files/gnue_irc_word2vec_model, separately None
2019-02-19 14:54:34,438 : INFO : not storing attribute vectors_norm
2019-02-19 14:54:34,439 : INFO : not storing attribute cum_table
2019-02-19 14:54:34,984 : INFO : saved ../data_files/gnue_irc_word2vec_model


In [194]:
# Load Word TF-IDFS
def get_word_tf_idfs(word_tf_idfs_filename):
    """Read per-line TF-IDF scores from a whitespace-separated text file.

    Returns a list with one entry per line of the file; each entry is the
    list of floats parsed from that line (an empty list for a blank line).
    """
    with open(word_tf_idfs_filename) as word_tf_idfs_file:
        return [
            [float(token) for token in line.strip().split()]
            for line in word_tf_idfs_file
        ]
        

In [195]:
word_tf_idfs_filename = join(DATA_DIR, "word_tf_idfs.txt")
# Keep only the scores of the first four lines: a small sample for interactive inspection.
word_tf_idfs = get_word_tf_idfs(word_tf_idfs_filename)[:4]

In [196]:
word_tf_idfs[1]

[0.0005048845017409812, 0.0006060496352296479]

Generating Sentence Vectors using the **Average of Word2Vec vectors with TF-IDF** method

"This is one of the best approach which I will recommend. Just take the word vectors and multiply it with their TF-IDF scores. Just take the average and it will represent your sentence vector"

https://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence


In [197]:
def strip_leading_and_trailing_punctuation(word):
    """Return `word` with punctuation removed from both ends; interior characters are kept."""
    punctuation_chars = string.punctuation
    return word.lstrip(punctuation_chars).rstrip(punctuation_chars)

In [198]:
def pre_process_sentence(sentence):
    """Return `sentence` as a `str`, decoding it as UTF-8 when it is not one already.

    A falsy input (e.g. an empty string) only triggers a printed notice; it is
    still processed. Raises ValueError when a non-`str` input cannot be decoded.
    """
    if not sentence:
        print("No words here")

    # Already text: nothing to decode.
    if type(sentence) is str:
        return sentence

    try:
        return sentence.decode('utf-8')
    except Exception:
        raise ValueError("Input must be a String or ByteString")

In [199]:
def all_chars_in_word_are_punctuation(word):
    """Return True when every character of `word` is in PUNCTUATION_SET (also True for an empty string)."""
    return PUNCTUATION_SET.issuperset(word)

In [200]:
def get_words_in_sentence(sentence):
    """Split a sentence into words, dropping punctuation-only tokens.

    Each whitespace-separated token is stripped of leading and trailing
    punctuation. A token that still contains commas is split on them, and the
    resulting pieces are kept unless they consist solely of punctuation
    (which includes empty pieces).
    """
    words = []
    for token in sentence.split():
        if all_chars_in_word_are_punctuation(token):
            continue
        token = strip_leading_and_trailing_punctuation(token)
        if ',' not in token:
            words.append(token)
            continue
        words.extend(
            piece
            for piece in token.split(',')
            if not all_chars_in_word_are_punctuation(piece)
        )
    return words

In [301]:
def get_sentence_word_vectors(sentence, model):
    """Look up the Word2Vec vector of every word in a sentence.

    Parameters
    ----------
    sentence : str or bytes
        The sentence; bytes are decoded as UTF-8 by `pre_process_sentence`.
    model : trained Word2Vec model
        Must expose `model.wv.word_vec` and `model.wv.vector_size`.

    Returns
    -------
    list of 1-D arrays
        One vector per word, in sentence order. A word missing from the
        model's vocabulary contributes a zero vector. A sentence without any
        words yields a list holding a single zero vector.
    """
    # Take the dimensionality from the model instead of hard-coding 150, so the
    # zero vectors always match the real word vectors.
    vector_size = model.wv.vector_size

    words = get_words_in_sentence(pre_process_sentence(sentence))
    if not words:
        return [np.zeros((vector_size,))]

    vectors = []
    for word in words:
        try:
            # Words are looked up in lower case.
            vectors.append(model.wv.word_vec(word.lower()))
        except KeyError:
            # Out-of-vocabulary word: keep its position with a zero vector.
            vectors.append(np.zeros((vector_size,)))

    return vectors
    

In [302]:
# Smoke check: a two-word sentence should yield two word vectors.
len(get_sentence_word_vectors("cvs diff", model))

2

In [303]:
get_sentence_word_vectors("cvs ddghsdg", model)[1].shape

(150,)

In [304]:
def get_sentence_vector(sentence_word_tf_idfs, word_vectors):
    """Combine word vectors into one sentence vector (TF-IDF weighted average).

    Each word vector is multiplied by the TF-IDF score at the same position,
    the products are summed element-wise and the sum is divided by the number
    of (score, vector) pairs.

    Parameters
    ----------
    sentence_word_tf_idfs : sequence of float
        TF-IDF scores of the words of ONE sentence.
    word_vectors : sequence of 1-D arrays
        Word vectors of the same sentence, in the same order.

    Returns
    -------
    numpy.ndarray
        The sentence vector. When there is nothing to combine, a zero vector
        shaped like the first word vector (or an empty array when no word
        vectors were given either).
    """
    # zip() pairs scores and vectors positionally and stops at the shorter input.
    products = [
        np.multiply(word_vector, word_tf_idf)
        for word_tf_idf, word_vector in zip(sentence_word_tf_idfs, word_vectors)
    ]

    if not products:
        # No pairs to average: return an array (never a bare scalar) so that
        # callers can still iterate over the result.
        if len(word_vectors) > 0:
            return np.zeros(np.shape(word_vectors[0]))
        return np.zeros((0,))

    # Average over the words of THIS sentence. The divisor used to be
    # len(word_tf_idfs), a notebook-level global with one entry per chat line,
    # so the result depended on unrelated kernel state.
    return np.sum(products, axis=0) / len(products)
        

In [305]:
# Spot check: build the sentence vector for "cVS diff" using the scores of the second TF-IDF line.
word_tf_idfs_2, word_vectors = word_tf_idfs[1], get_sentence_word_vectors("cVS diff", model)
sentence_vector = get_sentence_vector(word_tf_idfs_2, word_vectors)
# A sentence vector must have the same shape as a single word vector.
assert sentence_vector.shape == derek_vector.shape
# Only the last expression of a cell is displayed, so the next two lines show nothing.
sentence_vector.shape
len(word_vectors)
word_tf_idfs[1]

[0.0005048845017409812, 0.0006060496352296479]

In [14]:
def generate_sentence_vectors(
    chat_input_filename,
    word_tf_idfs,
    word_2_vec_model,
    sentence_vectors_output_filename
):
    """Write one sentence vector per chat line to a CSV file.

    Parameters
    ----------
    chat_input_filename : str
        Path of the gzip-compressed chat log, one sentence per line.
    word_tf_idfs : sequence of sequences of float
        TF-IDF scores per chat line; entry `i` belongs to line `i` of the
        chat log. An IndexError is raised if it has fewer entries than the
        chat log has lines.
    word_2_vec_model : trained Word2Vec model
        Source of the word vectors and of the vector dimensionality.
    sentence_vectors_output_filename : str
        Path of the CSV file to (over)write.

    The first row of the output is a header numbering the columns from 1 to
    the vector size; every following row holds the comma-separated components
    of one sentence vector.
    """
    # Derive the column count from the model rather than hard-coding 150.
    vector_size = word_2_vec_model.wv.vector_size
    header = ",".join(str(num) for num in range(1, vector_size + 1))

    with gzip.open(chat_input_filename) as chat_input_file, open(
        sentence_vectors_output_filename, "w") as sentence_vectors_output_file:
        sentence_vectors_output_file.write("{}\n".format(header))
        for line_count, sentence in enumerate(chat_input_file):
            sentence_word_vectors = get_sentence_word_vectors(sentence, word_2_vec_model)
            sentence_vector = get_sentence_vector(word_tf_idfs[line_count], sentence_word_vectors)
            csv_vector = ','.join(str(num) for num in sentence_vector)
            # Rows end with a bare newline, like the header; the stray space
            # that used to follow the last value has been removed.
            sentence_vectors_output_file.write("{}\n".format(csv_vector))
    

In [15]:
# NOTE: this cell needs `model` and the helper functions defined in earlier
# cells; running it in a kernel where those cells have not been executed
# raises a NameError.
word_tf_idfs_filename = join(DATA_DIR, "word_tf_idfs.txt")
# Load the TF-IDF scores of every line (an earlier cell kept only a four-line sample).
word_tf_idfs = get_word_tf_idfs(word_tf_idfs_filename)
word_2_vec_model = model
sentence_vectors_output_filename = join(FEATURES_DIR, "sentence_vectors.csv")

generate_sentence_vectors(
    CHAT_INPUT_FILE, 
    word_tf_idfs, 
    word_2_vec_model, 
    sentence_vectors_output_filename
)

NameError: name 'get_word_tf_idfs' is not defined

In [126]:
# Scratch check of np.sum with axis=0 on a list of equally shaped arrays:
# the arrays are added element-wise. `x1` is not used by the active code.
x1 = np.arange(9.0).reshape((3, 3))
x2 = np.arange(9).reshape((3, 3)) * 2
# np.multiply(x1, x2)
np.sum([x2, x2], axis=0)

array([[ 0,  4,  8],
       [12, 16, 20],
       [24, 28, 32]])

In [None]:
with open(sentence_vectors_output_filename) as sentence_vectors:
    sentence_vector = 