In [1]:
from gensim import corpora

In [2]:
documents = [u"Football club Arsenal defeat local rivals this weekend.", u"Weekend football frenzy takes over London.", u"Bank open for take over bids after losing millions.", u"London football clubs bid to move to Wembley stadium.", u"Arsenal bid 50 million pounds for striker Kane.", u"Financial troubles result in loss of millions for bank.", u"Western bank files for bankruptcy after financial losses.", u"London football club is taken over by oil millionaire from Russia.", u"Banking on finances not working for Russia."]

In [3]:
import spacy

In [4]:
nlp = spacy.load('en_core_web_sm')

In [6]:
texts = []
for document in documents:
    text = []
    doc = nlp(document)
    for w in doc:
        if not w.is_stop and not w.is_punct and not w.like_num:
            text.append(w.lemma_)
    texts.append(text)

In [7]:
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

{'Arsenal': 0, 'club': 1, 'defeat': 2, 'football': 3, 'local': 4, 'rival': 5, 'weekend': 6, 'London': 7, 'frenzy': 8, 'take': 9, 'Bank': 10, 'bid': 11, 'lose': 12, 'million': 13, 'open': 14, 'Wembley': 15, 'stadium': 16, 'Kane': 17, 'arsenal': 18, 'pound': 19, 'striker': 20, 'bank': 21, 'financial': 22, 'loss': 23, 'result': 24, 'trouble': 25, 'bankruptcy': 26, 'file': 27, 'western': 28, 'Russia': 29, 'millionaire': 30, 'oil': 31, 'finance': 32, 'work': 33}


In [8]:
print(dictionary)

Dictionary(34 unique tokens: ['Arsenal', 'club', 'defeat', 'football', 'local']...)


In [9]:
type(dictionary)

gensim.corpora.dictionary.Dictionary

In [10]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [11]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(3, 1), (6, 1), (7, 1), (8, 1), (9, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1)],
 [(1, 1), (3, 1), (7, 1), (11, 1), (15, 1), (16, 1)],
 [(11, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(13, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(21, 1), (22, 1), (23, 1), (26, 1), (27, 1), (28, 1)],
 [(1, 1), (3, 1), (7, 1), (9, 1), (29, 1), (30, 1), (31, 1)],
 [(21, 1), (29, 1), (32, 1), (33, 1)]]

In [12]:
# TF-IDF represntation
from gensim import models

In [13]:
tfidf = models.TfidfModel(corpus)

In [14]:
tfidf

<gensim.models.tfidfmodel.TfidfModel at 0x143e785d8c8>

In [15]:
for document in tfidf[corpus]:
    print(document)

[(0, 0.4538520228951382), (1, 0.2269260114475691), (2, 0.4538520228951382), (3, 0.16750327793200118), (4, 0.4538520228951382), (5, 0.4538520228951382), (6, 0.3106776504135697)]
[(3, 0.24212967666975266), (6, 0.4490913847888623), (7, 0.32802654645398593), (8, 0.6560530929079719), (9, 0.4490913847888623)]
[(10, 0.5185742509441921), (11, 0.25928712547209604), (12, 0.5185742509441921), (13, 0.354982288765831), (14, 0.5185742509441921)]
[(1, 0.29431054749542984), (3, 0.21724253258131512), (7, 0.29431054749542984), (11, 0.29431054749542984), (15, 0.5886210949908597), (16, 0.5886210949908597)]
[(11, 0.24253562503633297), (17, 0.48507125007266594), (18, 0.48507125007266594), (19, 0.48507125007266594), (20, 0.48507125007266594)]
[(13, 0.35801943340074827), (21, 0.2615055248879333), (22, 0.35801943340074827), (23, 0.35801943340074827), (24, 0.5230110497758667), (25, 0.5230110497758667)]
[(21, 0.24434832234965204), (22, 0.33453001789363906), (23, 0.33453001789363906), (26, 0.4886966446993041), (2

In [17]:
# creating n-grams
bigram = models.Phrases(texts)
texts = [bigram[line] for line in texts]

In [18]:
texts

[['football', 'club', 'Arsenal', 'defeat', 'local', 'rival', 'weekend'],
 ['weekend', 'football', 'frenzy', 'take', 'London'],
 ['Bank', 'open', 'bid', 'lose', 'million'],
 ['London', 'football', 'club', 'bid', 'Wembley', 'stadium'],
 ['arsenal', 'bid', 'pound', 'striker', 'Kane'],
 ['financial', 'trouble', 'result', 'loss', 'million', 'bank'],
 ['western', 'bank', 'file', 'bankruptcy', 'financial', 'loss'],
 ['London', 'football', 'club', 'take', 'oil', 'millionaire', 'Russia'],
 ['bank', 'finance', 'work', 'Russia']]

In [20]:
dictionary = corpora.Dictionary(texts)

In [21]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x143e7870548>

In [23]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [24]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(3, 1), (6, 1), (7, 1), (8, 1), (9, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1)],
 [(1, 1), (3, 1), (7, 1), (11, 1), (15, 1), (16, 1)],
 [(11, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(13, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(21, 1), (22, 1), (23, 1), (26, 1), (27, 1), (28, 1)],
 [(1, 1), (3, 1), (7, 1), (9, 1), (29, 1), (30, 1), (31, 1)],
 [(21, 1), (29, 1), (32, 1), (33, 1)]]

In [25]:
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [26]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x143e7870548>

In [27]:
print(dictionary)

Dictionary(0 unique tokens: [])


In [34]:
dictionary.filter_extremes()

In [35]:
print(dictionary)

Dictionary(0 unique tokens: [])


In [36]:
corpus1 = [dictionary.doc2bow(text) for text in texts]

In [37]:
corpus1

[[], [], [], [], [], [], [], [], []]