In [1]:
import codecs
import re
import math
import nltk
import collections
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
from scipy.stats.stats import pearsonr
import pandas as pd

In [3]:
# Make sure the NLTK 'stopwords' corpus is available locally before it is loaded below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shafr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.porter import PorterStemmer
# English stop words extended with stand-alone punctuation tokens
# (drop the union() argument if punctuation should be kept).
stop_words = set(stopwords.words('english')).union(
    ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']
)
# Module-level Porter stemmer instance used by the cells below.
porter = PorterStemmer()


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Text tokenisation
def tokenization(text):
    """Split ``text`` into lower-cased tokens.

    The text is cut at every delimiter matched by the pattern below
    (punctuation, quotes, brackets, digits, whitespace, optionally preceded
    by spaces); empty fragments produced by adjacent delimiters are dropped.
    """
    fragments = re.split(r' *[\;\:\—\<\>\\\"\.\,\!\?\&\=\+\(\)\{\}\[\]\r\n\t\« \»\“\”\„\/\d ]', text)
    return [fragment.lower() for fragment in fragments if fragment != '']

In [7]:
# Tokenise every text in the corpus
def tokenization_corpus(corpus):
    """Return one token list per text in ``corpus``.

    Each text is tokenised with ``tokenization``; tokens found in
    ``stop_words`` are dropped and the remaining ones are Porter-stemmed.
    """
    def stemmed_tokens(text):
        # The stop-word test is applied to the raw (lower-cased) token, before stemming.
        return [porter.stem(token.lower())
                for token in tokenization(text)
                if token.lower() not in stop_words]

    return [stemmed_tokens(text) for text in corpus]

In [8]:
# Collect all tokens of the corpus
def all_corpus_tokens(corpus_tokens):
    """Flatten a list of per-text token lists into one list, preserving order."""
    return [word for text in corpus_tokens for word in text]

In [9]:
# Stemming and lemmatisation for a single text
def stem_lem(tokens):
    """Lemmatise each token with WordNet, then stem the lemma with Porter."""
    lemmatize = WordNetLemmatizer().lemmatize
    stem = PorterStemmer().stem
    processed = []
    for word in tokens:
        processed.append(stem(lemmatize(word)))
    return processed

In [10]:
# Stemming and lemmatisation for a corpus of texts
def stem_lem_corpus(corpus):
    """Apply ``stem_lem`` to every token list in ``corpus`` and return the results as a list."""
    return [stem_lem(tokens) for tokens in corpus]

In [39]:
# Compute P(w_i)
def Pw(tokens, alpha):
    """Return unigram probabilities of ``tokens``, plain and ``alpha``-smoothed.

    Returns a pair of ``collections.Counter`` objects keyed by token:

    * the first maps each token to ``count / len(tokens)``;
    * the second maps each token to ``count**alpha / sum(count**alpha)``,
      the sum running over all distinct tokens.
    """
    counts = collections.Counter(tokens)
    total = len(tokens)
    denominator = sum(count ** alpha for count in counts.values())
    tf_counter = collections.Counter(
        {word: count / total for word, count in counts.items()}
    )
    counter_alpha = collections.Counter(
        {word: count ** alpha / denominator for word, count in counts.items()}
    )
    return tf_counter, counter_alpha

In [12]:
def p_wi_wj(corpus, tokens, w_size):
    """Build a symmetric co-occurrence weight matrix over the vocabulary.

    A window ``[i - w_size, i + w_size)`` (clipped to the corpus bounds) is
    placed at every position ``i`` of ``corpus``.  Every unordered pair of
    positions inside a window adds ``1 / len(corpus)`` to both symmetric
    cells of the matrix, so a pair covered by several overlapping windows is
    counted once per window.

    NOTE(review): the right edge of the window is exclusive, i.e. the window
    spans ``w_size`` positions to the left of ``i`` but only ``w_size - 1``
    to the right.  This is kept as-is so that results do not change.

    Parameters
    ----------
    corpus : sequence of str
        Flat token sequence.
    tokens : sequence of str
        Vocabulary; a word's row/column index is its first position here.
    w_size : int
        Window half-width.  Non-positive values yield empty windows and
        therefore an all-zero matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tokens), len(tokens))``.

    Raises
    ------
    ValueError
        If a word of ``corpus`` is missing from ``tokens``.
    """
    vocab_size = len(tokens)
    matrix_ww = np.zeros((vocab_size, vocab_size))
    size_corpus = len(corpus)
    if size_corpus == 0 or w_size <= 0:
        # No window contains any pair: nothing to count.
        return matrix_ww

    # Resolve each corpus word to its vocabulary index once, through a dict,
    # instead of running list.index (a linear scan) inside the inner loops.
    index_of = {}
    for position, word in enumerate(tokens):
        index_of.setdefault(word, position)  # first occurrence, like list.index
    try:
        corpus_ids = [index_of[word] for word in corpus]
    except KeyError as err:
        raise ValueError('%r is not in tokens' % (err.args[0],)) from None

    weight = 1 / size_corpus
    for i in range(size_corpus):
        left_border = max(0, i - w_size)
        right_border = min(size_corpus, i + w_size)
        for j in range(left_border, right_border):
            index_j_matrix = corpus_ids[j]
            for k in range(j + 1, right_border):
                index_k_matrix = corpus_ids[k]
                matrix_ww[index_k_matrix, index_j_matrix] += weight
                matrix_ww[index_j_matrix, index_k_matrix] += weight

    return matrix_ww

In [34]:
def ppmi(matrix_wi_wj, counter_wi, unique, counter_alpha):
    """Compute the positive pointwise mutual information (PPMI) matrix.

    For every pair of vocabulary words ``(w_i, w_j)``::

        ratio = P(w_i, w_j) / (P(w_i) * P_alpha(w_j))
        PPMI  = log2(ratio) if ratio > 1 else 0

    Cells whose joint weight is zero are 0.  Cells whose denominator is zero
    (a word missing from, or with zero probability in, one of the counters)
    are also set to 0 rather than producing ``inf``.

    Parameters
    ----------
    matrix_wi_wj : array-like, shape (len(unique), len(unique))
        Joint co-occurrence weights, indexed in the order of ``unique``.
    counter_wi : mapping
        Word -> probability used for the row word.
    unique : sequence of str
        Vocabulary defining the row/column order.
    counter_alpha : mapping
        Word -> probability used for the column word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(unique), len(unique))`` with non-negative entries.
    """
    size = len(unique)
    joint = np.asarray(matrix_wi_wj, dtype=float)[:size, :size]
    p_row = np.array([counter_wi[word] for word in unique], dtype=float)
    p_col = np.array([counter_alpha[word] for word in unique], dtype=float)
    # expected[i, j] = P(w_i) * P_alpha(w_j)
    expected = np.outer(p_row, p_col)

    # Only divide where both the joint weight and the denominator are non-zero.
    defined = (joint != 0) & (expected != 0)
    ratio = np.zeros((size, size))
    ratio[defined] = joint[defined] / expected[defined]

    matrix_ppmi = np.zeros((size, size))
    # log2(ratio) is positive exactly when ratio > 1; everything else stays 0.
    positive = ratio > 1
    matrix_ppmi[positive] = np.log2(ratio[positive])
    return matrix_ppmi
            

In [14]:
# Load the word-similarity gold standard: a tab-separated file read with explicit column names word1, word2, sim.
data = pd.read_csv('./wordsim354/wordsim_similarity_goldstandard.txt',sep='\t', names=['word1','word2','sim'])

In [15]:
# Stem both words of every gold-standard pair with the shared Porter stemmer.
word1 = list(map(porter.stem, data['word1'].values))
word2 = list(map(porter.stem, data['word2'].values))
# Values of the 'sim' column as a plain list, aligned with word1 / word2.
scores = list(data['sim'].values)

In [42]:
pathCV = './../example2/' #Computer vision articles
texts = []
# Read every document (0.txt .. 9.txt) and build the corpus
for i in range(10):
    # The context manager closes the file even if read() raises.
    with codecs.open(pathCV + str(i) + '.txt', "r", "utf_8_sig") as fileObj:
        text = fileObj.read()  # whole file at once; could also be read line by line
        texts.append(text)
    


In [35]:
def model(texts, N=7, alpha=1):
    """Build the PPMI matrix for a list of raw texts.

    Pipeline: tokenise and stem every text, flatten the corpus, collect the
    vocabulary, estimate joint weights with ``p_wi_wj`` (window size ``N``),
    estimate unigram probabilities with ``Pw`` (exponent ``alpha``) and
    combine them with ``ppmi``.

    The rows/columns of the result follow the order of
    ``list(set(all tokens))`` as computed inside this function.
    """
    flat_tokens = all_corpus_tokens(tokenization_corpus(texts))
    vocabulary = list(set(flat_tokens))

    joint = p_wi_wj(flat_tokens, vocabulary, N)
    unigram, unigram_alpha = Pw(flat_tokens, alpha)

    return ppmi(joint, unigram, vocabulary, unigram_alpha)



In [20]:
def cos(A, B):
    """Cosine similarity between two equal-length numeric vectors.

    Returns ``dot(A, B) / (||A|| * ||B||)``.  When either vector has zero
    norm (including empty input) the similarity is undefined; ``0.0`` is
    returned instead of NaN so that downstream statistics are not poisoned.

    Raises
    ------
    ValueError
        If ``A`` and ``B`` do not have the same shape.
    """
    a = np.asarray(A, dtype=float)
    b = np.asarray(B, dtype=float)
    if a.shape != b.shape:
        raise ValueError('cos: vectors must have the same shape, got %s and %s'
                         % (a.shape, b.shape))
    norm = np.sqrt(np.dot(a, a)) * np.sqrt(np.dot(b, b))
    if norm == 0:
        return 0.0
    return np.dot(a, b) / norm

In [21]:
def jac(A, B):
    """Weighted Jaccard similarity between two equal-length numeric vectors.

    Returns ``sum(min(A_i, B_i)) / sum(max(A_i, B_i))``.  When the
    denominator is zero (e.g. both vectors are all zeros, or the input is
    empty) the ratio is undefined; ``0.0`` is returned instead of raising
    ``ZeroDivisionError`` or yielding NaN.

    Raises
    ------
    ValueError
        If ``A`` and ``B`` do not have the same shape.
    """
    a = np.asarray(A, dtype=float)
    b = np.asarray(B, dtype=float)
    if a.shape != b.shape:
        raise ValueError('jac: vectors must have the same shape, got %s and %s'
                         % (a.shape, b.shape))
    union = np.maximum(a, b).sum()
    if union == 0:
        return 0.0
    return np.minimum(a, b).sum() / union

In [22]:
def KL(A, B):
    """Sum of ``A[i] * log(A[i] / B[i])`` over positions where both entries are non-zero.

    Positions where either ``A[i]`` or ``B[i]`` equals zero contribute
    nothing.  The inputs are used as given (no normalisation is applied).
    """
    return sum(
        A[idx] * np.log(A[idx] / B[idx])
        for idx in range(len(A))
        if A[idx] != 0 and B[idx] != 0
    )

In [23]:
def JS(A, B):
    """Average of ``KL(A, M)`` and ``KL(B, M)`` where ``M`` is the element-wise mean of A and B.

    ``M`` is built over ``zip(A, B)``, so it is as long as the shorter input.
    """
    midpoint = [(a_i + b_i) / 2 for a_i, b_i in zip(A, B)]
    divergence_a = KL(A, midpoint)
    divergence_b = KL(B, midpoint)
    return (divergence_a + divergence_b) / 2

In [25]:
# Accumulator for the result tables (C) of successive experiment runs; filled by D.append(C) further down.
D =[]

In [52]:
%%time
n_windows = 10
alpha =0.9
tokens_corp = tokenization_corpus(texts)
all_tokens = all_corpus_tokens(tokens_corp)
unique = list(set(all_tokens))
C = []
C.append(('alpha','n_windows','cos_dist','jac_dist       ','KL           ','JS         '))
print(C[0])
for i in np.arange(1,n_windows):
    matrix_ppmi = model(texts,i,alpha)
    
    indexs = []
    for k,w in enumerate(zip(word1,word2)):
        if (w[0] in unique) and (w[1] in unique):
            indexs.append(k)
    
    cos_dist = []
    jac_dist = []
    KL_dist = []
    JS_dist = []
    SC = []
    for j in indexs:
        index_w1 = unique.index(word1[j])
        index_w2 = unique.index(word2[j])
        A = matrix_ppmi[index_w1]
        B = matrix_ppmi[index_w2]
        
        cos_dist.append(cos(A,B))
        jac_dist.append(jac(A,B))
        KL_dist.append(KL(A,B))
        JS_dist.append(JS(A,B))
        SC.append(scores[j])
    C.append((alpha,i,pearsonr(cos_dist,SC)[0],pearsonr(jac_dist,SC)[0],pearsonr(KL_dist,SC)[0],pearsonr(JS_dist,SC)[0]))
    print(C[i])

('alpha', 'n_windows', 'cos_dist', 'jac_dist       ', 'KL           ', 'JS         ')
(0.9, 1, 0.3819111437394743, 0.35045503475728174, -0.2003193477639999, 0.1307753293058749)
(0.9, 2, 0.3637682197274416, 0.3314420340453714, -0.12174567357133263, 0.13474805511127474)
(0.9, 3, 0.34099034503221703, 0.3180968492324886, -0.13830916293390905, 0.12536345050375833)
(0.9, 4, 0.35153231558352444, 0.32775112475341445, -0.1078422575703692, 0.11446170306548958)
(0.9, 5, 0.368607169662774, 0.3380647875726011, -0.1398967305892251, 0.10341027550054654)
(0.9, 6, 0.3806460250246268, 0.3437233980146923, -0.11185283313300251, 0.09474786254915028)
(0.9, 7, 0.3841803453951056, 0.3475076273841437, -0.10728782589481863, 0.08499324394374193)
(0.9, 8, 0.3779767739407905, 0.3433918464112759, -0.0918895264404458, 0.0791488526125923)
(0.9, 9, 0.36956467471341614, 0.33981858357568534, -0.10587335215879189, 0.07337939127674689)
Wall time: 10min 41s


In [53]:
# Store the table of the latest run. NOTE(review): re-running this cell appends the same table again.
D.append(C)

In [54]:
# Print every stored result table, one row per line.
for line in (row for c in D for row in c):
    print(line)

('alpha', 'n_windows', 'cos_dist', 'jac_dist       ', 'KL           ', 'JS         ')
(1, 1, 0.37868471062298004, 0.34909917996254153, -0.21731833412147847, 0.1321824221955711)
(1, 2, 0.3641812865630324, 0.3313709634197902, -0.11967449057959169, 0.1367339754900067)
(1, 3, 0.34467260103195035, 0.31926932888935794, -0.14016024538333757, 0.1270900646544173)
(1, 4, 0.3546967173718193, 0.32887231223819513, -0.10910837790769026, 0.11619002001903143)
(1, 5, 0.3714164813142092, 0.33892035992782943, -0.14060669364142028, 0.10514137494072942)
(1, 6, 0.38320229753754625, 0.3444457633212567, -0.11247879023072106, 0.09643187761234528)
(1, 7, 0.38677656156473106, 0.3482961437293354, -0.10785154294763862, 0.08665864352705586)
(1, 8, 0.38041308587250894, 0.3441143244947273, -0.09226994116055887, 0.08080124660032356)
(1, 9, 0.371601944062431, 0.34041587224485037, -0.10642783702413036, 0.07502619838406517)
('alpha', 'n_windows', 'cos_dist', 'jac_dist       ', 'KL           ', 'JS         ')
(0.75, 1, 0.