# Notebook A : Doc vectors with word vectors

Began 21 May 2020 by Amaury de Barbuat from ECL

Updated by William Riou from ENSTA PARIS

## 0- Intro

The goal is to compute a document embedding for each pilot report by averaging the vectors of its words.

## 1- Create word embeddings

We first train word embeddings on a given corpus of reports.

In [8]:
# input : CSV file with 'time', 'content' and 'title' columns
# output : cleaned text file with one sentence per line, plus a CSV file with one cleaned report per row

def process_csv(input_filename, output_filename, reports_filename, date='2008-01-01'):
    """Clean the reports of a CSV file and write them out sentence by sentence.

    Each selected row is turned into a report made of its 'title' followed by
    its 'content'. The report is split into sentences; every sentence is
    tokenized, lower-cased, stripped of punctuation and has its abbreviations
    expanded using the local file 'abbr.txt' (one "abbreviation replacement"
    pair per line, whitespace separated). Stop-word removal and stemming are
    deliberately not applied.

    Args:
        input_filename: CSV file with 'time', 'title' and 'content' columns.
        output_filename: text file receiving one cleaned sentence per line.
        reports_filename: CSV file receiving one cleaned report per row in a
            single 'reports' column.
        date: only rows whose 'time' value equals this string are processed.
            Pass None to process every row.

    Side effects:
        Silences all warnings, downloads the nltk 'punkt' resource and creates
        an empty 'abbr.txt' in the working directory when none exists.
    """
    import pandas as pd
    import os.path
    import string
    import warnings
    warnings.filterwarnings('ignore')

    import nltk
    from nltk import sent_tokenize
    from nltk.tokenize import word_tokenize
    nltk.download('punkt')

    data = pd.read_csv(input_filename, sep=',')

    # Create an empty abbreviation file only when it is missing. The check
    # must use `not`: `~` applied to a bool is always truthy (~True == -2),
    # which would wipe an existing abbreviation file on every run.
    if not os.path.isfile('abbr.txt'):
        with open('abbr.txt', 'w'):
            pass

    # abbreviation -> replacement; the first definition of an abbreviation wins
    abbreviations = {}
    with open('abbr.txt', 'r') as abbr_file:
        for abbr_line in abbr_file:
            fields = abbr_line.split()
            if len(fields) >= 2:  # ignore blank or incomplete lines
                abbreviations.setdefault(fields[0], fields[1])

    # Translation table deleting every punctuation character, built once.
    punctuation_table = str.maketrans('', '', string.punctuation)

    cleaned_reports = []
    with open(output_filename, 'w') as sentences_file:
        for time, title, content in zip(data['time'], data['title'], data['content']):
            if date is not None and time != date:
                continue

            report = str(title) + ' ' + str(content)
            new_report = ''

            for sentence in sent_tokenize(report):
                # convert to lower case
                tokens = [w.lower() for w in word_tokenize(sentence)]
                # remove punctuation from each word
                stripped = [w.translate(punctuation_table) for w in tokens]
                # expand abbreviations
                words = [abbreviations.get(w, w) for w in stripped]

                # every word is followed by a single space, including the last
                line = ''.join(word + ' ' for word in words)

                sentences_file.write(line + '\n')
                new_report += line + '. '

            cleaned_reports.append(new_report)

    final = pd.DataFrame({'reports': cleaned_reports}, columns=['reports'])
    final.to_csv(reports_filename, sep=',', header=True, index=False)

In [9]:
# Clean the reports of ASRS1.csv: one cleaned sentence per line goes to
# LineSentences_57.txt and one cleaned report per row goes to reports_57.csv.
process_csv('ASRS1.csv', 'LineSentences_57.txt', 'reports_57.csv')

[nltk_data] Downloading package punkt to /home/urendil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
def shuffle_lines(input_filename, output_filename):
    """Write the lines of a text file to another file in random order.

    Args:
        input_filename: text file to read.
        output_filename: text file receiving the same lines, shuffled.
    """
    import random

    with open(input_filename, 'r') as source:
        lines = source.readlines()

    # A final line without a newline would be glued to whichever line ends up
    # after it once the order changes.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    random.shuffle(lines)

    with open(output_filename, 'w') as target:
        target.writelines(lines)

In [11]:
# Shuffle the cleaned sentences line by line into a new file.
shuffle_lines('LineSentences_57.txt', 'LineSentences_shuffled_57.txt')

In [15]:
# input : cleaned text file with one sentence per line
# output : word vectors saved in word2vec text format in the current working directory

def word_embedding(input_filename, vectors_filename):
    """Train a Word2Vec model on a sentence file and save its word vectors.

    Args:
        input_filename: text file with one whitespace-tokenized sentence per line.
        vectors_filename: name of the output file, created in the current
            working directory in word2vec text (non-binary) format.
    """
    import os
    from gensim.models.word2vec import Word2Vec
    from gensim.models.word2vec import LineSentence
    from gensim.models.keyedvectors import KeyedVectors

    # Stream the corpus from disk, one sentence per line.
    corpus = LineSentence(input_filename, max_sentence_length=10000, limit=None)

    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names; later
    # releases renamed them to `vector_size` and `epochs` - confirm against
    # the installed gensim version.
    model = Word2Vec(corpus, min_count=5, size=100, iter=5)

    # Persist only the word vectors, as plain text.
    target_path = os.path.join(os.getcwd(), vectors_filename)
    model.wv.save_word2vec_format(target_path, binary=False)

In [16]:
# Train Word2Vec on the shuffled sentences and save the vectors as text.
word_embedding('LineSentences_shuffled_57.txt', 'word_vectors_57_100d_5e.txt')

## 2- Document embeddings with word vectors averaging

Here we want to embed each report by averaging its word vectors

In [17]:
def w2vtxt_to_lists_of_words_and_vectors(filename):
    """Load a word2vec text file into parallel lists of vectors and words.

    The first line of the file holds the number of words and the dimension of
    the vectors; every following line holds a word followed by its
    coordinates, all separated by whitespace. Blank lines are ignored.

    Args:
        filename: path of the word2vec text file.

    Returns:
        A tuple (vectors, words) where vectors[k] is the list of float
        coordinates of words[k].
    """
    vectors = []  # every vector as a sublist e.g. [[0.13, ..., -0.87], [-0.45, ..., 0.02], ...]
    words = []  # every word e.g. ['cat', 'dog', ...]

    with open(filename, "r") as file:
        # in txt w2v format, the first line contains the number of words and
        # the dimension of the vectors
        headers = next(file)
        dim_vect = int(headers.split()[1])

        for line in file:
            fields = line.split()  # the word followed by its coordinates
            if not fields:
                continue  # e.g. a trailing empty line
            words.append(fields[0])
            # coordinates are at index 1 to dim_vect inclusive
            vectors.append([float(value) for value in fields[1:dim_vect + 1]])

    return (vectors, words)

In [18]:
def build_doc_vectors(reports_filename, word_vectors, doc_vectors):
    """Embed each report as the average of the vectors of its known words.

    Args:
        reports_filename: CSV file with a 'reports' column; each report is
            split on single spaces to obtain its words.
        word_vectors: word2vec text file giving the vector of each word.
        doc_vectors: output text file; line k holds the space-separated
            coordinates of the vector of report k.

    A report containing no known word (including an empty report) gets a
    vector of 'nan' coordinates, so the output stays aligned with the input
    rows.
    """
    import numpy as np
    import pandas as pd

    data = pd.read_csv(reports_filename, sep=',')

    vectors, words = w2vtxt_to_lists_of_words_and_vectors(word_vectors)
    dim = len(vectors[0]) if vectors else 0

    # word -> position of its vector; the first occurrence wins, which is
    # what list.index would return, without a linear scan per lookup.
    position_of = {}
    for position, word in enumerate(words):
        position_of.setdefault(word, position)

    # The context manager guarantees the rows are flushed to disk.
    with open(doc_vectors, 'w') as output:
        for report in data['reports']:
            # An empty report is read back from the CSV as NaN, not as ''.
            tokens = [] if pd.isna(report) else str(report).split(' ')

            total = np.zeros(dim)
            n_words = 0
            for token in tokens:
                position = position_of.get(token)
                if position is not None:
                    total += np.array(vectors[position])
                    n_words += 1

            if n_words:
                report_vector = total / n_words
            else:
                # Explicit NaN row instead of a 0/0 division warning.
                report_vector = np.full(dim, np.nan)

            output.write(' '.join(str(value) for value in report_vector.tolist()) + '\n')

In [21]:
# Average the word vectors of each cleaned report and write one document
# vector per line.
build_doc_vectors('reports_57.csv', 'word_vectors_57_100d_5e.txt', 'w2v_doc_vectors_57_100d_5e.txt')