In [55]:
import pandas as pd
import numpy as np

## Чтение файлов

In [2]:
# Both inputs are semicolon-separated files without a header row,
# so pandas assigns the integer column labels 0, 1, ...
articles = pd.read_csv('articles.csv', header=None, sep=';')
stopwords = pd.read_csv('stopwords.csv', header=None, sep=';')

In [4]:
# Preview the first five rows of the articles table.
articles.head(5)

Unnamed: 0,0,1
0,1,Bradley Charles Cooper born January 5 1975 is ...
1,2,Cooper enrolled in the MFA program at the Acto...
2,3,Cooper found greater success with the romantic...
3,4,Labeled a sex symbol by the media Cooper was n...
4,5,Cooper was born on January 5 1975 in Abingto...


In [5]:
# Preview the first five rows of the stop-word table.
stopwords.head(5)

Unnamed: 0,0
0,x
1,y
2,your
3,yours
4,yourself


## Обработка данных

- привести все слова к нижнему регистру

In [10]:
# Lower-case the article text (column 1) with the vectorised string accessor
# instead of a per-row Python lambda; missing values are propagated as NaN
# rather than raising AttributeError.
articles['lower'] = articles[1].str.lower()

- отбросить все символы, которые не являются латинскими буквами _(и, вероятно, пробелами)_

In [15]:
# Keep only the lowercase Latin letters a-z and whitespace; every other
# character (digits, punctuation, non-Latin letters) is dropped.
articles['latin'] = articles['lower'].apply(
    lambda text: ''.join(ch for ch in text if ch.isspace() or 'a' <= ch <= 'z')
)

- удалить все стоп-слова из articles с помощью таблицы stopwords

In [30]:
# Build the stop-word lookup once: set membership is O(1) per token, whereas
# comparing every token against the whole stop-word Series is
# O(len(stopwords)) per token and dominates the run time on larger corpora.
stopword_set = set(stopwords[0])

# Tokenise on whitespace and keep only the tokens that are not stop words.
articles['cleared'] = articles['latin'].apply(
    lambda text: [word for word in text.split() if word not in stopword_set]
)

# Шаг 2. Расчёт NPMI для пар соседних слов

In [97]:
def NPMI(text):
    """Accumulate word and adjacent-pair counts for one tokenised text.

    The function returns nothing; it updates four module-level accumulators:

    * ``words_freq``  - dict mapping a word to its number of occurrences
    * ``words_total`` - total number of words seen so far
    * ``pairs_freq``  - dict mapping ``"first second"`` to its number of occurrences
    * ``pairs_total`` - total number of adjacent pairs seen so far

    Parameters
    ----------
    text : list of str
        Tokens of a single document, in reading order.
    """
    global words_freq, words_total, pairs_freq, pairs_total

    # Absolute word frequencies.
    words_total += len(text)
    for token in text:
        words_freq[token] = words_freq.get(token, 0) + 1

    # Adjacent word pairs, keyed as "first second".
    bigrams = [left + " " + right for left, right in zip(text, text[1:])]

    pairs_total += len(bigrams)
    for bigram in bigrams:
        pairs_freq[bigram] = pairs_freq.get(bigram, 0) + 1

In [98]:
# Start from empty accumulators so that re-running this cell does not add
# the same corpus to the counts a second time.
words_freq, pairs_freq = {}, {}
words_total = pairs_total = 0

# NPMI() is called for its side effects on the accumulators above; it has no
# return statement, so the resulting 'NPMI' column holds only None.
articles['NPMI'] = articles['cleared'].apply(NPMI)

In [101]:
# Convert the raw counts into relative frequencies, in place.
# NOTE(review): this step is not idempotent - running the cell a second time
# divides the already-normalised values again. Re-collect the counts first.
words_freq.update({word: count / words_total for word, count in words_freq.items()})  # P(a), P(b)
pairs_freq.update({pair: count / pairs_total for pair, count in pairs_freq.items()})  # P(a, b)

# PMI(a, b)  = log( P(a, b) / (P(a) * P(b)) )
# NPMI(a, b) = -PMI(a, b) / log P(a, b)
# NOTE(review): P(a, b) is normalised by pairs_total while P(a) and P(b) are
# normalised by words_total; the differing denominators are presumably why
# some scores come out slightly above 1 - confirm whether that is intended.
# NOTE(review): assigning to NPMI rebinds a name that also refers to the
# counting function defined earlier in this notebook, so that function is no
# longer callable after this cell runs.
PMI = {}
NPMI = {}
for bigram, p_ab in pairs_freq.items():
    first, second = bigram.split()
    pmi = np.log(p_ab / (words_freq[first] * words_freq[second]))
    PMI[bigram] = pmi
    NPMI[bigram] = -(pmi / np.log(p_ab))

In [102]:
# The 50 word pairs with the highest NPMI score, best first.
sorted(NPMI.items(), key=lambda item: item[1], reverse=True)[:50]

[('fish fry', 1.004695345930954),
 ('linings playbook', 1.0046474918383168),
 ('nightmare alley', 1.0046474918383168),
 ('los angeles', 1.0046474918383168),
 ('guardians galaxy', 1.0045902343009177),
 ('licorice pizza', 1.0045184658630515),
 ('barack obama', 1.0045184658630515),
 ('willy wanker', 1.0045184658630515),
 ('iberian peninsula', 1.0045184658630515),
 ('irina shayk', 1.004421042302328),
 ('phil wenneck', 1.004421042302328),
 ('bipolar disorder', 1.004421042302328),
 ('severely deformed', 1.004421042302328),
 ('clint eastwood', 1.004421042302328),
 ('pounds kg', 1.004421042302328),
 ('parke custis', 1.004421042302328),
 ('amos bronson', 1.004421042302328),
 ('allison rader', 1.004421042302328),
 ('steven moll', 1.004421042302328),
 ('bing crosby', 1.004421042302328),
 ('ella fitzgerald', 1.004421042302328),
 ('joel whitburns', 1.004421042302328),
 ('rabbit foot', 1.004421042302328),
 ('foot minstrels', 1.004421042302328),
 ('elks rendezvous', 1.004421042302328),
 ('here chicke