In [9]:
import os
import json

#import spacy

from gensim.test.utils import datapath
from gensim.models import Word2Vec, KeyedVectors

## reading and writing data    
def get_file_content(path):
    """Read the text file at ``path`` and return its lines as a list.

    The whole file is read at once and split with ``str.splitlines``, so the
    returned strings carry no line terminators.
    """
    with open(path, 'r') as handle:
        text = handle.read()
    return text.splitlines()

def get_json_content(path):
    """Load and return the JSON document stored in the file at ``path``.

    Both plain JSON files and double-encoded files (a JSON document that was
    serialised a second time and is therefore stored as one JSON string) are
    supported.

    Raises:
        json.JSONDecodeError: if the file, or the string it contains, is not
            valid JSON.
    """
    with open(path, 'r') as f:
        data = json.load(f)

    # Only decode a second time when the first pass produced a string;
    # calling json.loads on an already-decoded dict/list raises TypeError.
    if isinstance(data, str):
        data = json.loads(data)
    return data

#loading models
def load_spacy_nlp(model_name):
    """Load and return the spaCy pipeline called ``model_name``.

    The result is whatever ``spacy.load(model_name)`` returns.
    """
    # Imported locally: the module-level ``import spacy`` is commented out,
    # so without this the call would fail with NameError.
    import spacy

    return spacy.load(model_name)

def load_word2vec_model(model_path):
    """Load word2vec-format vectors in binary mode from ``model_path``.

    The path is first passed through gensim's ``datapath`` helper and the
    vectors are then read with ``KeyedVectors.load_word2vec_format`` using
    ``binary=True`` and ``unicode_errors='ignore'``.
    """
    # NOTE(review): ``datapath`` is imported from gensim.test.utils; presumably
    # it only leaves absolute paths untouched -- confirm before passing a
    # relative path here.
    resolved_path = datapath(model_path)
    return KeyedVectors.load_word2vec_format(
        resolved_path,
        binary=True,
        unicode_errors='ignore',
    )

def get_lexicon(lexicon_contents):
    """Return ``lexicon_contents`` without its first element.

    An empty input yields an empty result.
    """
    entries_after_first = lexicon_contents[1:]
    return entries_after_first
    
## processing

def word2vec_similar_topk(model, word, topk):
    """Return ``model.most_similar(word, topn=topk)``.

    ``model`` only needs to expose a ``most_similar`` method accepting the
    query word positionally and the result size as ``topn``.
    """
    neighbours = model.most_similar(word, topn=topk)
    return neighbours


def get_pos_tags_word2vec_terms(lexicon_list, topk=100, min_similarity=0.6):
    """Write POS tags and similar word2vec terms for every lexicon token.

    The lexicon entries are joined with spaces and passed to the module-level
    ``nlp`` callable. For each resulting token one comma-separated row is
    written to ``results/lexicons_postag_word2vecterms.txt`` under
    ``data_dir``, with the columns ``Keyphrase,POS,Tag,Word2VecTerms``. The
    first three columns are also printed.

    The last column holds space-separated ``term_score`` pairs taken from the
    ``topk`` nearest neighbours in ``word2vec_model`` whose score is above
    ``min_similarity``, or ``Not found`` when the token is not in the model.

    Args:
        lexicon_list: iterable of strings forming the lexicon.
        topk: number of neighbours requested from the model (default 100).
        min_similarity: neighbours with a score not strictly greater than
            this value are dropped (default 0.6).

    Relies on the module-level names ``nlp``, ``data_dir`` and
    ``word2vec_model``.
    """
    doc = nlp(' '.join(lexicon_list))

    with open(data_dir+'results/lexicons_postag_word2vecterms.txt', 'w') as fr:
        fr.write('Keyphrase,POS,Tag,Word2VecTerms\n')
        for token in doc:
            prefix = token.text + ',' + token.pos_ + ',' + token.tag_
            fr.write(prefix)
            print (prefix)
            if token.text in word2vec_model:
                neighbours = word2vec_similar_topk(word2vec_model, token.text, topk)
                # Keep only sufficiently similar terms, rendered as "term_score".
                line = ' '.join(
                    term + '_' + str(score)
                    for term, score in neighbours
                    if score > min_similarity
                )
                fr.write(',' + line + '\n')
                print (line)
            else:
                # Use the same comma separator as every other row so the
                # "Not found" marker lands in the Word2VecTerms column.
                fr.write(',Not found' + '\n')

def get_pos_tags(lexicon_list):
    """Write one tab-separated row per lexicon token with its POS data.

    The lexicon entries are joined with spaces and passed to the module-level
    ``nlp`` callable. Each resulting token produces a row
    ``text<TAB>pos_<TAB>tag_<TAB>status`` in
    ``results/lexicons_postag_isword2vecvocab.txt`` under ``data_dir``, where
    ``status`` is ``Found`` when the token text is in ``word2vec_model`` and
    ``Not found`` otherwise.
    """
    doc = nlp(' '.join(lexicon_list))

    with open(data_dir+'results/lexicons_postag_isword2vecvocab.txt', 'w') as fr:
        for token in doc:
            fr.write('\t'.join((token.text, token.pos_, token.tag_)))
            status = 'Found' if token.text in word2vec_model else 'Not found'
            fr.write('\t' + status + '\n')
    
def get_google_word2vec_space(lexicon_list):
    """Return the fraction of lexicon entries present in ``word2vec_model``.

    Each entry is looked up exactly as given (no case folding) in the
    module-level ``word2vec_model`` and a line reporting the outcome is
    printed for it.

    Args:
        lexicon_list: sized iterable of lexicon entries.

    Returns:
        ``hits / len(lexicon_list)``, or ``0.0`` for an empty lexicon.
    """
    total = len(lexicon_list)
    if total == 0:
        # Nothing to look up; avoid dividing by zero below.
        return 0.0

    count = 0
    for token in lexicon_list:
        if token in word2vec_model:
            print ("{} is in the google word2vec space".format(token))
            count += 1
        else:
            print ("{} is not found".format(token))
    return count / total


def get_twitter_word2vec_space(lexicon_list):
    """Return the fraction of lexicon entries present in ``word2vec_twitter``.

    An entry counts as present when either its original form or its
    lower-cased form is in the module-level ``word2vec_twitter``. A line
    reporting the outcome is printed for every entry.

    Args:
        lexicon_list: sized iterable of lexicon entries (strings).

    Returns:
        ``hits / len(lexicon_list)``, or ``0.0`` for an empty lexicon.
    """
    total = len(lexicon_list)
    if total == 0:
        # Nothing to look up; avoid dividing by zero below.
        return 0.0

    count = 0
    for token in lexicon_list:
        if token in word2vec_twitter or token.lower() in word2vec_twitter:
            print ("{} is in the twitter word2vec space".format(token))
            count += 1
        else:
            print ("{} is not found".format(token))
    return count / total


# ---------------------------------------------------------------------------
# Driver script: configure paths, load both word2vec models, read the chosen
# lexicon and print the share of its entries found in each model.
# ---------------------------------------------------------------------------
print ("setting up the data path ...")
# Base directory; the get_pos_tags* functions append 'results/...' to it
# directly, so the trailing slash matters.
data_dir = "/projets/sig/mullah/nlp/fgpi/"
# figures_path and graph_dir are not used by any code visible in this cell.
figures_path ='../figures/'
graph_dir = "/projets/sig/mullah/nlp/fgpi/graph"
# Name of the lexicon file under <data_dir>/lexicons/ to analyse.
lexicon_name='Radical'

#newretweetuserid_neworiginaluserid_path = os.path.join(data_dir, 'data/processed/20000_UserRetweetID_UserOriginalID_NewID.txt')
#tweetID_msg_path = os.path.join(data_dir, 'data/processed/20000_id_OriginalTweet.json')
#retweetUserID_tweetUserID_tweetID_path = os.path.join(data_dir, 'data/processed/20000_UserRetweet_UserOriginal_idOriginalTweet.txt')
#retweetUserID_tweetUserID_tweetID_path = os.path.join(data_dir, 'data/processed/20000_RetweetUserNewID_TweetUserNewID_TweetID.txt')

lexicon_path = os.path.join(data_dir, 'lexicons/'+lexicon_name)
#word2vec_model_path = ""
# Paths of the two binary word2vec files loaded below.
word2vec_google_model_path = '/projets/sig/mullah/ir/data/word_embedding/pretrain/word2vec/googlenews/GoogleNews-vectors-negative300.bin'
word2vec_twitter_model_path = '/projets/sig/mullah/ir/data/word_embedding/pretrain/twitter/word2vec_twitter_tokens.bin'

#loading models
# NOTE: `nlp` is left undefined while the next line stays commented out, so
# get_pos_tags / get_pos_tags_word2vec_terms cannot be called as-is.
#nlp = load_spacy_nlp('en_core_web_sm')
# These two globals are read by the get_*_word2vec_space and get_pos_tags*
# functions.
word2vec_model = load_word2vec_model(word2vec_google_model_path)
word2vec_twitter = load_word2vec_model(word2vec_twitter_model_path)

#word2vec_twitter = KeyedVectors.load_word2vec_format(word2vec_twitter_model_path, binary=True, unicode_errors='ignore')
#lexicon_path = os.path.join(data_dir, 'lexicons/Islamism')

# Read the lexicon file; get_lexicon drops its first line.
lexicon_contents = get_file_content(lexicon_path)
lexicon_list = get_lexicon(lexicon_contents)
print ("Total jargons in lexicon: {}".format(len(lexicon_list)))
print (lexicon_list)

#get_pos_tags(lexicon_list)
# Fraction of lexicon entries found in each model (each call also prints a
# per-entry report).
rate_of_exist_google = get_google_word2vec_space(lexicon_list)
print (rate_of_exist_google)
rate_of_exist_twitter = get_twitter_word2vec_space(lexicon_list)
print (rate_of_exist_twitter)

#get_pos_tags_word2vec_terms(lexicon_list)

setting up the data path ...
Total jargons in lexicon: 188
#CountrysideCleanup is not found
#Islam is not found
#MyJihad is not found
#SpringJihad is not found
airstrike is in the google word2vec space
alal-Jihad is not found
aleppo is not found
aleppo is not found
allah is in the google word2vec space
army is in the google word2vec space
Ash-Sham is not found
assad is not found
attack is in the google word2vec space
battlefield is in the google word2vec space
break is in the google word2vec space
claim is in the google word2vec space
Victory is in the google word2vec space
cynthiastruth is not found
destroy is in the google word2vec space
extreme is in the google word2vec space
Extremists is in the google word2vec space
Fight is in the google word2vec space
fight is in the google word2vec space
Haya is in the google word2vec space
fight is in the google word2vec space
Haya ala-Jihad is not found
Iraq is in the google word2vec space
IS is in the google word2vec space
isil is not found
