In [1]:
# Switch the kernel's working directory to the flair checkout; later cells use
# relative paths such as 'resources/taggers/ag_news'.
%cd /workspace/flair
# One-time setup: uncomment to download the spaCy model that a later cell
# loads with spacy.load("en_core_web_lg").
# !python -m spacy download en_core_web_lg

/workspace/flair


In [10]:
from collections import defaultdict
from pprint import pprint  # pretty-printer

# Toy corpus: nine short titles, one document per string.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Step 1: lower-case, split on whitespace, and drop the stop words.
stoplist = {'for', 'a', 'of', 'the', 'and', 'to', 'in'}
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)

# Step 2: count how often each remaining token occurs across the whole corpus.
frequency = defaultdict(int)
for token in (tok for text in texts for tok in text):
    frequency[token] += 1

# Step 3: keep only tokens that occur more than once.
texts = [[token for token in text if frequency[token] > 1] for text in texts]

pprint(texts)

from gensim import corpora

# Build the gensim Dictionary from the filtered token lists; it is later
# passed as `id2word` when the topic model is created.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
# Prints a one-line summary (unique-token count plus a sample of tokens).
print(dictionary)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [46]:
from gensim import models

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
# No earlier cell defines `corpus`, so on a fresh kernel this cell used to
# fail with NameError; build it here from `dictionary` and `texts`.
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)  # step 1 -- fit the TF-IDF weights on the bag-of-words corpus
corpus_tfidf = tfidf[corpus]       # step 2 -- lazy wrapper: documents are re-weighted when iterated

# Alternative kept for reference (LSI on the TF-IDF corpus):
# lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=3)

# LDA is stochastic; a fixed random_state makes the topics reproducible on re-run.
model = models.LdaModel(corpus, id2word=dictionary, num_topics=3, random_state=0)

# NOTE(review): the model is trained on raw counts (`corpus`) but applied to the
# TF-IDF-weighted corpus here -- confirm this mix is intended. The name
# `corpus_lsi` is historical (it now holds LDA output) and is kept so that any
# other cell referring to it still works.
corpus_lsi = model[corpus_tfidf]  # lazy wrapper: bow -> tfidf -> LDA topic distribution

# The transformations above are only executed when the wrapper is iterated, e.g.:
# for doc, as_text in zip(corpus_lsi, documents):
#     print(doc, as_text)

model.print_topics(3)


[(0,
  '0.215*"graph" + 0.151*"trees" + 0.150*"minors" + 0.120*"system" + 0.089*"survey" + 0.075*"human" + 0.071*"eps" + 0.027*"user" + 0.026*"interface" + 0.025*"computer"'),
 (1,
  '0.173*"user" + 0.172*"time" + 0.170*"response" + 0.101*"computer" + 0.100*"system" + 0.100*"survey" + 0.036*"trees" + 0.032*"graph" + 0.030*"minors" + 0.028*"human"'),
 (2,
  '0.172*"interface" + 0.146*"system" + 0.121*"eps" + 0.117*"human" + 0.100*"user" + 0.099*"trees" + 0.099*"computer" + 0.031*"graph" + 0.030*"minors" + 0.029*"survey"')]

In [18]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_lg")  # make sure to use larger model!

# Pairwise token similarity. The output recorded for this cell is a 3x3
# dog/cat/banana similarity table, but the code that printed it had been
# removed; it is restored here so the cell reproduces its own output.
tokens = nlp("dog cat banana")
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.80168545
dog banana 0.24327646
cat dog 0.80168545
cat cat 1.0
cat banana 0.2815437
banana dog 0.24327646
banana cat 0.2815437
banana banana 1.0


In [27]:
# Example rows kept for reference:
# F,Ford Motor Company,NYQ,Auto Manufacturers - Major,USA,,,
# MSFT,Microsoft Corporation,NMS,Business Software & Services,USA,,,
tks = nlp("AXP American Express Company")

# Optional per-token diagnostics:
# for token in tks:
#     print(token.text, token.has_vector, token.vector_norm, token.is_oov)

# Compare the first token against every token that follows it.
token1, *following_tokens = tks
for token2 in following_tokens:
    print(token1.text, token2.text, token1.similarity(token2))

AXP American 0.041551307
AXP Express 0.011548235
AXP Company -0.17779963


In [40]:
# Two near-identical strings, each parsed into its own spaCy Doc.
t1, t2 = "The Walt Disney Company", "The Walt Disney"
d1, d2 = nlp(t1), nlp(t2)

# Scratch experiments, not executed (they refer to a `doc` variable that
# this cell does not define):
# type(doc.ents[0])
# displacy.render(doc, style="ent")
# s = doc.ents[0]
# s.vector
# d1.similarity(d2)


The Walt Disney Company


In [2]:
import spacy


# Sample news paragraph used as input for the entity recogniser below.
# Note that the text still contains literal "</br>" markup.
t = 'NEW YORK -- Yields on most certificates of deposit offered by major banks dropped more than a tenth of a percentage point in the latest week, reflecting the overall decline in short-term interest rates. On small-denomination, or "consumer," CDs sold directly by banks, the average yield on six-month deposits fell to 5.49% from 5.62% in the week ended yesterday, according to an 18-bank survey by Banxquote Money Markets, a Wilmington, Del., information service.</br></br>On three-month "consumer" deposits, the average yield sank to 5.29% from 5.42% the week before, according to Banxquote. Two banks in the Banxquote survey, Citibank in New York and CoreStates in Pennsylvania, are paying less than 5% on threemonth small-denomination CDs.</br></br>Declines were somewhat smaller on five-year consumer CDs, which eased to 7.37% from 7.45%, Banxquote said.</br></br>Yields on three-month and six-month Treasury bills sold at Monday\'s auction plummeted more than a fifth of a percentage point from the previous week, to 5.46% and 5.63%, respectively.'

# Rebinds `nlp` to the small English model (an earlier cell bound the same
# name to "en_core_web_lg"), then parses the paragraph into `doc`.
nlp = spacy.load("en_core_web_sm")
doc = nlp(t)

In [6]:
# Print every recognised entity with its character offsets and label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

# Entity visualisation of the parsed paragraph.
displacy.render(doc, style="ent")

# Inspect the first token. Only the last expression of a cell is displayed,
# so the former bare `doc[0]` statement had no visible effect and was dropped.
first_token = doc[0]
first_token.text, first_token.ent_type_, first_token.ent_kb_id_

In [8]:
from flair.data import Corpus
from flair.datasets import TREC_6
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# Single place for the output folder used by both training and plotting.
# NOTE(review): the folder is called "ag_news" although the corpus loaded
# below is TREC_6 -- confirm the name is intended.
output_dir = 'resources/taggers/ag_news'

# Data and label inventory.
corpus = TREC_6()
label_dict = corpus.make_label_dictionary()

# Word-level embeddings fed into the document encoder.
word_embeddings = [
    WordEmbeddings('glove'),
    # comment in flair embeddings for state-of-the-art results
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# Document encoder and the classifier on top of it.
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# Train, writing results into `output_dir`.
trainer = ModelTrainer(classifier, corpus)
trainer.train(
    output_dir,
    learning_rate=0.1,
    mini_batch_size=32,
    anneal_factor=0.5,
    patience=5,
    max_epochs=150,
)

# Plot the weights file found in the same folder.
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_weights(output_dir + '/weights.txt')

2020-03-11 13:55:31,270 Reading data from /root/.flair/datasets/trec_6
2020-03-11 13:55:31,271 Train: /root/.flair/datasets/trec_6/train.txt
2020-03-11 13:55:31,272 Dev: None
2020-03-11 13:55:31,273 Test: /root/.flair/datasets/trec_6/test.txt
2020-03-11 13:55:32,692 Computing label dictionary. Progress:



  0%|          | 0/4907 [00:00<?, ?it/s][A
  9%|▉         | 438/4907 [00:00<00:01, 4378.73it/s][A
 30%|██▉       | 1454/4907 [00:00<00:00, 5279.09it/s][A
100%|██████████| 4907/4907 [00:00<00:00, 13164.31it/s][A

2020-03-11 13:55:33,107 [b'LOC', b'DESC', b'ENTY', b'NUM', b'HUM', b'ABBR']





2020-03-11 13:55:34,439 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmp4ky3oa1v



  0%|          | 0/21494764 [00:00<?, ?B/s][A
  0%|          | 1024/21494764 [00:00<1:28:32, 4046.10B/s][A
  0%|          | 34816/21494764 [00:00<1:02:43, 5702.65B/s][A
  0%|          | 87040/21494764 [00:00<44:10, 8076.04B/s]  [A
  1%|          | 139264/21494764 [00:00<31:03, 11458.89B/s][A
  1%|          | 208896/21494764 [00:01<22:01, 16102.60B/s][A
  2%|▏         | 417792/21494764 [00:01<15:19, 22918.87B/s][A
  2%|▏         | 491520/21494764 [00:01<10:56, 31996.21B/s][A
  3%|▎         | 696320/21494764 [00:01<07:38, 45402.28B/s][A
  4%|▍         | 922624/21494764 [00:01<05:21, 64082.77B/s][A
  6%|▌         | 1270784/21494764 [00:01<03:42, 90824.20B/s][A
  8%|▊         | 1758208/21494764 [00:01<02:33, 128685.86B/s][A
  9%|▉         | 2032640/21494764 [00:01<01:48, 179940.14B/s][A
 12%|█▏        | 2497536/21494764 [00:01<01:15, 252862.10B/s][A
 13%|█▎        | 2825216/21494764 [00:02<00:53, 347951.15B/s][A
 15%|█▌        | 3326976/21494764 [00:02<00:37, 481955.26B/s]

2020-03-11 13:55:48,008 copying /tmp/tmp4ky3oa1v to cache at /root/.flair/embeddings/glove.gensim
2020-03-11 13:55:48,194 removing temp file /tmp/tmp4ky3oa1v





ValueError: cannot reshape array of size 417659 into shape (400000,100)

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

# Text to classify.
sentence = Sentence('Flair is pretty neat!')

# Load the 'en-sentiment' classifier and annotate the sentence in place.
classifier = TextClassifier.load('en-sentiment')
classifier.predict(sentence)

# Show the labels attached to the sentence by the prediction step.
print('Sentence above is: ', sentence.labels)

In [4]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Raw string literal: the text contains literal "\/" sequences, which are
# invalid escape sequences in a normal string (DeprecationWarning, and a
# SyntaxWarning on newer Python versions). The resulting value is unchanged.
t = r"iTV Will Boost Apple http:\/\/t.co\/8dup4cQc08 $AAPL #APPLE"
# Remove the ticker symbol; the "$" that preceded it stays in the text and
# shows up as its own token in the tagged output.
t = t.replace("AAPL", "")

sentence = Sentence(t)

tagger = SequenceTagger.load("chunk-fast")
# tagger = SequenceTagger.load("ner-ontonotes-fast")
tagger.predict(sentence)

# Print the text with the predicted tags inline.
print(sentence.to_tagged_string())

2020-03-11 13:28:36,083 loading file /root/.flair/models/en-chunk-conll2000-fast-v0.4.pt
iTV <S-NP> Will <B-VP> Boost <E-VP> Apple <B-NP> http:\/\/t.co\/8dup4cQc08 <I-NP> $ <I-NP> #APPLE <E-NP>


In [5]:
# Spans stored on `sentence` under the 'np' tag type; per the recorded output
# this returns VP as well as NP chunks from the tagger run in the previous cell.
sentence.get_spans('np')

[<NP-span (1): "iTV">,
 <VP-span (2,3): "Will Boost">,
 <NP-span (4,5,6,7): "Apple http:\/\/t.co\/8dup4cQc08 $ #APPLE">]

In [4]:
from flair.data import Sentence
from flair.models import SequenceTagger

# News paragraph to be chunked. It contains literal "</br>" markup; in the
# recorded output that markup ends up fused into tokens such as
# "rates.</br></br>On".
t = 'NEW YORK -- Yields on most certificates of deposit offered by major banks dropped more than a tenth of a percentage point in the latest week, reflecting the overall decline in short-term interest rates.</br></br>On small-denomination, or "consumer," CDs sold directly by banks, the average yield on six-month deposits fell to 5.49% from 5.62% in the week ended yesterday, according to an 18-bank survey by Banxquote Money Markets, a Wilmington, Del., information service.</br></br>On three-month "consumer" deposits, the average yield sank to 5.29% from 5.42% the week before, according to Banxquote. Two banks in the Banxquote survey, Citibank in New York and CoreStates in Pennsylvania, are paying less than 5% on threemonth small-denomination CDs.</br></br>Declines were somewhat smaller on five-year consumer CDs, which eased to 7.37% from 7.45%, Banxquote said.</br></br>Yields on three-month and six-month Treasury bills sold at Monday\'s auction plummeted more than a fifth of a percentage point from the previous week, to 5.46% and 5.63%, respectively.'

# Load the "chunk-fast" tagger (an earlier cell loads the same model),
# wrap the text in a Sentence and annotate it in place.
tagger = SequenceTagger.load("chunk-fast")
sentence = Sentence(t)
tagger.predict(sentence)
# Cell result: the text with the predicted chunk tags inline.
sentence.to_tagged_string()

2020-02-28 06:10:34,080 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/release-chunk-fast-0/en-chunk-conll2000-fast-v0.4.pt not found in cache, downloading to /tmp/tmpdhmzfc8v


100%|██████████| 75233247/75233247 [00:37<00:00, 2018218.86B/s]

2020-02-28 06:11:13,724 copying /tmp/tmpdhmzfc8v to cache at /root/.flair/models/en-chunk-conll2000-fast-v0.4.pt





2020-02-28 06:11:13,899 removing temp file /tmp/tmpdhmzfc8v
2020-02-28 06:11:13,930 loading file /root/.flair/models/en-chunk-conll2000-fast-v0.4.pt


'NEW <B-NP> YORK <E-NP> -- Yields <S-NP> on <S-PP> most <B-NP> certificates <E-NP> of <S-PP> deposit <S-NP> offered <S-VP> by <S-PP> major <B-NP> banks <E-NP> dropped <S-VP> more <B-NP> than <I-NP> a <I-NP> tenth <E-NP> of <S-PP> a <B-NP> percentage <I-NP> point <E-NP> in <S-PP> the <B-NP> latest <I-NP> week, <E-NP> reflecting <S-VP> the <B-NP> overall <I-NP> decline <E-NP> in <S-PP> short-term <B-NP> interest <I-NP> rates.</br></br>On <E-NP> small-denomination, or "consumer," <B-NP> CDs <E-NP> sold <S-VP> directly <S-ADVP> by <S-PP> banks, <S-NP> the <B-NP> average <I-NP> yield <E-NP> on <S-PP> six-month <B-NP> deposits <E-NP> fell <S-VP> to <S-PP> 5.49% <S-NP> from <S-PP> 5.62% <S-NP> in <S-PP> the <B-NP> week <E-NP> ended <S-VP> yesterday, <S-NP> according <S-PP> to <S-PP> an <B-NP> 18-bank <I-NP> survey <E-NP> by <S-PP> Banxquote <B-NP> Money <I-NP> Markets, <E-NP> a <B-NP> Wilmington, <I-NP> Del., <I-NP> information <I-NP> service.</br></br>On <I-NP> three-month <I-NP> "consumer" 

In [5]:
# Fetch the spans stored under the 'ner' tag type, then print one per line.
# NOTE(review): the cell directly above only runs the "chunk-fast" tagger and
# no output is recorded for this cell -- confirm an NER tagger is meant to be
# applied to `sentence` first.
ner_spans = sentence.get_spans('ner')
for entity in ner_spans:
    print(entity)

In [13]:
import pandas as pd

# Economic-news dataset; the file is read as ISO-8859-1.
df = pd.read_csv("/workspace/flair/data/Full-Economic-News-DFE-839861.csv", encoding='ISO-8859-1')
# Columns:
# _unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,positivity,positivity:confidence,relevance,relevance:confidence,articleid,date,headline,positivity_gold,relevance_gold,text

# Headline of row 10, selected by label. The previous form,
# df.loc[10, ["date", "headline", "text"]][1], relied on integer keys being
# treated as positions on a string-labelled Series, which pandas has deprecated.
df.loc[10, "headline"]

'Dow Falls 45.95, Late GM Surge Stanches Losses'

In [2]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Input text (already tokenised: the full stop is space-separated).
sentence: Sentence = Sentence("George Washington went to Washington .")

# Load the "ner" tagger and annotate the sentence in place.
tagger: SequenceTagger = SequenceTagger.load("ner")
tagger.predict(sentence)

# Report: the sentence summary, a header, then the inline-tagged text.
report_lines = (
    "Analysing %s" % sentence,
    "\nThe following NER tags are found: \n",
    sentence.to_tagged_string(),
)
for report_line in report_lines:
    print(report_line)

2020-02-26 13:30:40,873 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/NER-conll03-english/en-ner-conll03-v0.4.pt not found in cache, downloading to /tmp/tmp4xhqochp


100%|██████████| 432197603/432197603 [47:36<00:00, 151296.68B/s] 

2020-02-26 14:18:18,476 copying /tmp/tmp4xhqochp to cache at /root/.flair/models/en-ner-conll03-v0.4.pt





2020-02-26 14:18:19,110 removing temp file /tmp/tmp4xhqochp
2020-02-26 14:18:19,240 loading file /root/.flair/models/en-ner-conll03-v0.4.pt
Analysing Sentence: "George Washington went to Washington ." - 6 Tokens

The following NER tags are found: 

George <B-PER> Washington <E-PER> went to Washington <S-LOC> .
