In [1]:
# Import modules and set up logging.
from typing import Callable, Dict, List, Set, Tuple, Generator
import gensim.downloader as api
from gensim.models import Word2Vec
import gensim
import logging
import numpy as np
import os
import nltk
import re
nltk.download('averaged_perceptron_tagger')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\junec\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
#to download the pretrained model of 'word2vec-google-news-300'
#make sure to use a 64 bit python
import struct
struct.calcsize("P") * 8
#!which python
#!which pip

64

In [3]:
# Reduce logging level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)

In [4]:
model_loaded = api.load('word2vec-google-news-300')

2021-11-10 01:31:59,491 : INFO : loading projection weights from C:\Users\junec/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2021-11-10 01:32:47,777 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from C:\\Users\\junec/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2021-11-10T01:32:47.773867', 'gensim': '4.1.2', 'python': '3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'load_word2vec_format'}


In [6]:
#https://stackoverflow.com/questions/57507832/unable-to-allocate-array-with-shape-and-data-type
#model_loaded.save('googleNews.d2v')
#model_loaded = gensim.models.keyedvectors.KeyedVectors.load('googleNews.d2v')

In [7]:
CONTENT_WORD_TYPE=["noun","adj","verb","adv"]
POS_TAGS=["NN","JJ","VB","RB"]

In [8]:
def preprocessDBOtype(dp_type:str)->str:
    dp_type=dp_type[len("dbo:"):]
    splitted_type=re.findall('[A-Z][a-z]*', dp_type)
    return " ".join(splitted_type).lower()
    #return " ".join(splitted_type)

In [9]:
def parse_word_POStag(sentence:str)->Dict:
    """
    parse content words of a sentence with their POS tag
    argument:a sentence string
    return:dictionary,key is POS tag and value is the corresponding word
           a list of content words
    """
    tag_dict={}
    content_words=[]
    tokenized_sentence = nltk.word_tokenize(sentence)
    tagged_words=nltk.pos_tag(tokenized_sentence)
    for POS_tag in POS_TAGS:
        for word,tag in tagged_words:
            if tag[:2]==POS_tag:
                content_words.append(word)
                temp=tag_dict.get(POS_tag,[])
                temp.append(word)
                tag_dict[POS_tag]=temp
                #print(tag_dict[POS_tag])
    return tag_dict,content_words


In [21]:
def calc_pairwise_similarity(model_loaded,question_tagged:Dict,type_tagged:Dict)->List:
    """
    calculate pairwise similarity between 
    content words in the query and the type label
    """
    similarities=[]
    for POS_tag,words in type_tagged.items():
        if POS_tag in question_tagged.keys():
            for word1 in words:
                for word2 in question_tagged[POS_tag]:
                    try:
                        similarities.append(model_loaded.similarity(word1,word2))
                    except KeyError:
                        pass
    return similarities


       

In [14]:
def extract_features_23to25(model_loaded,dp_type:str, question:str)->Tuple[float,float,float]:
    #get content words and parse dictonary
    question_tagged,question_content=parse_word_POStag(question)
    processed_type=preprocessDBOtype(dp_type)
    type_tagged,type_content=parse_word_POStag(processed_type)
    #get centroid
    question_centrality=model_loaded.rank_by_centrality(question_content, use_norm=True)
    type_centrality=model_loaded.rank_by_centrality(type_content, use_norm=True)
    question_centroid=question_centrality[0][1]
    type_centroid=type_centrality[0][1]
    #feature 23
    sim_aggr=round(model_loaded.similarity(question_centroid, type_centroid),4)
    
    pairwise_similarity=calc_pairwise_similarity(model_loaded,question_tagged,type_tagged)
    #feature 24,25
    sim_max=max(pairwise_similarity)
    sim_avg=round(sum(pairwise_similarity)/len(pairwise_similarity),4)
    
    return sim_aggr,sim_max,sim_avg

In [12]:
dp_type="dbo:GreatMusicFestival"
question="When was Bibi Andersson married to Per Ahlmark very green?"
extract_features_23to25(model_loaded,dp_type, question)


(0.0196, 0.13620232, 0.0312)

In [15]:
#try another model
print(api.info('text8'))
text8_corpus = api.load('text8')
model = Word2Vec(text8_corpus) 

2021-11-10 01:47:33,161 : INFO : collecting all words and their counts
2021-11-10 01:47:33,207 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


{'num_records': 1701, 'record_format': 'list of str (tokens)', 'file_size': 33182058, 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/text8/__init__.py', 'license': 'not found', 'description': 'First 100,000,000 bytes of plain text from Wikipedia. Used for testing purposes; see wiki-english-* for proper full Wikipedia datasets.', 'checksum': '68799af40b6bda07dfa47a32612e5364', 'file_name': 'text8.gz', 'read_more': ['http://mattmahoney.net/dc/textdata.html'], 'parts': 1}


2021-11-10 01:47:38,357 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2021-11-10 01:47:38,358 : INFO : Creating a fresh vocabulary
2021-11-10 01:47:38,677 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 71290 unique words (28.083071371733357%% of original 253854, drops 182564)', 'datetime': '2021-11-10T01:47:38.676741', 'gensim': '4.1.2', 'python': '3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
2021-11-10 01:47:38,679 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 16718844 word corpus (98.3160275555599%% of original 17005207, drops 286363)', 'datetime': '2021-11-10T01:47:38.679745', 'gensim': '4.1.2', 'python': '3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
2021-11-10 01:47:39,097 : INFO : deleting the raw counts d

2021-11-10 01:48:23,214 : INFO : EPOCH 4 - PROGRESS: at 8.88% examples, 543838 words/s, in_qsize 5, out_qsize 1
2021-11-10 01:48:24,225 : INFO : EPOCH 4 - PROGRESS: at 13.05% examples, 533922 words/s, in_qsize 4, out_qsize 1
2021-11-10 01:48:25,236 : INFO : EPOCH 4 - PROGRESS: at 21.99% examples, 676585 words/s, in_qsize 5, out_qsize 0
2021-11-10 01:48:26,242 : INFO : EPOCH 4 - PROGRESS: at 29.45% examples, 728754 words/s, in_qsize 5, out_qsize 0
2021-11-10 01:48:27,244 : INFO : EPOCH 4 - PROGRESS: at 34.63% examples, 716479 words/s, in_qsize 5, out_qsize 0
2021-11-10 01:48:28,258 : INFO : EPOCH 4 - PROGRESS: at 42.45% examples, 752459 words/s, in_qsize 5, out_qsize 0
2021-11-10 01:48:29,268 : INFO : EPOCH 4 - PROGRESS: at 47.74% examples, 740745 words/s, in_qsize 5, out_qsize 0
2021-11-10 01:48:30,285 : INFO : EPOCH 4 - PROGRESS: at 52.73% examples, 726857 words/s, in_qsize 5, out_qsize 1
2021-11-10 01:48:31,303 : INFO : EPOCH 4 - PROGRESS: at 57.79% examples, 716513 words/s, in_qsize

In [22]:
model=model.wv
dp_type="dbo:GreatMusicFestival"
question="When was Bibi Andersson married to Per Ahlmark very green?"
extract_features_23to25(model,dp_type, question)



(-0.0572, -0.062192805, -0.0622)