In [13]:
# !pip install word2number

In [28]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
# Download NLTK data packages at import time; presumably these are the models
# behind nltk.word_tokenize and nltk.pos_tag as used in extract_cat.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from word2number import w2n

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taohidshadat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/taohidshadat/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [29]:
# Shared Porter stemmer instance; extract_cat calls ps.stem() on each token.
ps = PorterStemmer()

The goal of this NLTK portion is to extract three constraints from a sentence:
- number constraint (e.g. 3)
- mathematical constraint (e.g. <=)
- categorical constraint (e.g. tech)


Example Inputs:
*  “I want to invest in at most 3 companies”
*  “I want to invest in at least 3 companies”
* “I want to invest in European companies”
* “I want to invest in 3 tech companies”

This portion of the code extracts a number from a sentence, whether it is written as digits or as words. The function *extract_number* returns a list containing the first number found, or an empty list if the sentence contains no number.

* This function also handles number ranges such as "I want 3 to 5 stocks" by returning only the first number of the range.

Example:
* "I want one hundred stocks" ---> [100]
* "I want 3 to 5 stocks" ----> [3]
* "I want three to five stocks" ---> [3]


In [30]:
def extract_number(inp):
  """Extract the first number mentioned in a sentence.

  Digits take precedence: if the sentence contains any run of digits, the
  first run is returned as an int. Otherwise the sentence is scanned for the
  first run of adjacent number words (e.g. "one hundred"), and that run is
  converted as a single phrase with word2number, so multi-word numbers are
  not truncated to their first word.

  Args:
    inp: the sentence to search.

  Returns:
    A list holding the single extracted number, or an empty list when the
    sentence contains neither digits nor number words.
  """
  digits = re.search(r'\d+', inp)
  if digits:
    return [int(digits.group())]

  def is_number_word(word):
    #word2number signals unparseable text by raising; the exact exception
    #type is not relied on, but KeyboardInterrupt/SystemExit still propagate
    try:
      w2n.word_to_num(word)
    except Exception:
      return False
    return True

  #if it comes here that means the input doesn't have a numeric form and probably word format
  #collect the first run of adjacent number words; any other word ends the
  #run, so "three to five" yields "three" rather than mixing both numbers
  phrase = []
  for word in inp.split():
    if is_number_word(word):
      phrase.append(word)
    elif phrase:
      break
  if not phrase:
    return []
  try:
    return [w2n.word_to_num(' '.join(phrase))]
  except Exception:
    #best effort: a run that cannot be combined counts as "no number"
    return []

This function extracts the mathematical constraint from a sentence and returns the matching comparison operators as a list, by checking the sentence against a dictionary of inequality keywords.

Example:
* "I want at least 3 stocks" ---> ['>=']

In [31]:
def extract_mathematical(inp):
  """Find the comparison operators implied by keywords in a sentence.

  Keywords are matched case-insensitively on word boundaries, so "over" does
  not fire inside "discover". A keyword occurrence that lies inside a longer
  matched keyword (e.g. "more than" inside "no more than") is ignored, which
  prevents contradictory operators from being reported for one phrase.

  Args:
    inp: the sentence to search.

  Returns:
    A list of operator strings ("=", ">=", "<=", ">", "<"), one entry per
    matched keyword, in dictionary order. Empty when nothing matches.
  """
  #dictionary of key words for inequalities
  d = {
      "=": ["only", "exactly"],
      ">=": ["at least", "minimum", "no less than", "no fewer than", "greater than or equal to"],
      "<=": ["at most", "maximum", "no more than", "not above", "does not exceed", "less than or equal to"],
      ">": ["more than", "exceeds", "over", "above", "greater than"],
      "<": ["under", "below", "fewer than", "beneath", "less than"]
    }
  text = inp.lower()

  #record where every keyword occurs, keeping dictionary order
  spans_by_phrase = []
  for op, phrases in d.items():
    for phrase in phrases:
      pattern = r'\b' + re.escape(phrase) + r'\b'
      spans = [m.span() for m in re.finditer(pattern, text)]
      if spans:
        spans_by_phrase.append((op, spans))
  all_spans = [span for _, spans in spans_by_phrase for span in spans]

  def standalone(span):
    #True unless a strictly longer keyword match covers this occurrence
    start, end = span
    return not any(
        s <= start and end <= e and (e - s) > (end - start)
        for s, e in all_spans)

  return [op for op, spans in spans_by_phrase
          if any(standalone(span) for span in spans)]

In [32]:
def extract_cat(inp):
  """Map sector keywords found in a sentence to sector names.

  The sentence is tokenized and POS-tagged with NLTK; only tokens carrying
  one of the wanted noun/adjective tags are considered. Tokens and keywords
  are compared by their Porter stem, so inflected forms such as "banks"
  match the keyword "bank". Each sector is reported at most once, in the
  order it is first seen.

  Args:
    inp: the sentence to search.

  Returns:
    A list of sector names (keys of the sector table); empty when no
    keyword is found.
  """
  sectors = {
      'Industrials': [],
      'Health Care': [],
      'Information Technology': ['technology', 'tech', 'game'],
      'Communication Services': [],
      'Consumer Staples': [],
      'Consumer Discretionary': [],
      'Utilities': [],
      'Financials': ['bank'],
      'Materials': [],
      'Real Estate': [],
      'Energy': ['green', 'green-energy', 'energy']}
  #the parts of speech we want to extract
  pos_wanted = ["NN", "JJ", "JJS", "JJR", "NNS"]
  #stems of generic words that never identify a sector
  ignored_stems = {"stock", "compani"}
  #index keywords by stem; the first sector listing a stem wins
  stem_to_sector = {}
  for sector, keywords in sectors.items():
    for keyword in keywords:
      stem_to_sector.setdefault(ps.stem(keyword.lower()), sector)
  res = []
  words = nltk.word_tokenize(inp)
  #tag words with their part of speech
  tagged = nltk.pos_tag(words)
  for word, pos in tagged:
    if pos not in pos_wanted:
      continue
    stem = ps.stem(word.lower())
    if stem in ignored_stems:
      continue
    sector = stem_to_sector.get(stem)
    #skip sectors already reported so two keywords don't duplicate one sector
    if sector is not None and sector not in res:
      res.append(sector)
  return res

In [33]:
def extraction(inp):
  """Run all three extractors on a sentence and bundle their results.

  Args:
    inp: the sentence to analyse.

  Returns:
    A three-element list of (value, tag) tuples, always in the order
    'mth', 'num', 'cat'. The 'mth' and 'num' values are the first item
    returned by the respective extractor; the 'cat' value is the whole
    list of categories. A missing value is the string "None".
  """
  operators = extract_mathematical(inp)
  numbers = extract_number(inp)
  categories = extract_cat(inp)
  return [
      (operators[0] if len(operators) > 0 else "None", 'mth'),
      (numbers[0] if len(numbers) > 0 else "None", 'num'),
      (categories if categories else "None", "cat"),
  ]

In [34]:
# Demo: print the extracted constraints for a few sample sentences.
demo_sentences = (
    "I want at least 3 - 5 tech stocks",
    "I want tech stocks",
    "I want only 4 stocks",
    "I want 4 - 6 european stocks",
)
for sentence in demo_sentences:
  print(extraction(sentence))

[('>=', 'mth'), (3, 'num'), (['Information Technology'], 'cat')]
[('None', 'mth'), ('None', 'num'), (['Information Technology'], 'cat')]
[('=', 'mth'), (4, 'num'), ('None', 'cat')]
[('None', 'mth'), (4, 'num'), ('None', 'cat')]
