In [1]:
import re
from collections import defaultdict
import spacy

nlp = spacy.load("en_core_web_sm")

words = defaultdict(int)

with open("../chapter06/train.txt", "r") as file:
  for line in file:
    items = line.split("\t")
    category = items[0]
    title = items[1]

    code_regex = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
    text = code_regex.sub('', title.rstrip().lower())

    doc = nlp(text)

    for token in doc:
      words[token.text] += 1
words

defaultdict(int,
            {'update': 1028,
             '1guinea': 3,
             'haemorrhagic': 1,
             'fever': 5,
             'may': 200,
             'have': 121,
             'crossed': 3,
             'into': 105,
             'sierra': 9,
             'leone': 10,
             'frances': 11,
             'orange': 13,
             'wo': 44,
             'nt': 186,
             'carry': 5,
             'netflix': 48,
             'on': 1324,
             'settop': 1,
             'boxes': 1,
             'at': 663,
             'first': 212,
             'ceo': 130,
             '3valeant': 1,
             'shares': 201,
             'fall': 103,
             'lowered': 1,
             '2014': 102,
             '2015': 39,
             'forecasts': 24,
             'dylan': 2,
             'fan': 16,
             'shells': 1,
             'out': 162,
             '2million': 1,
             'for': 1345,
             'oneofakind': 1,
             'draft': 6,
        

In [2]:
from collections import OrderedDict

ordered_word = OrderedDict(sorted(words.items(), key=lambda x:x[1], reverse=True))

ordered_word

OrderedDict([(' ', 3163),
             ('to', 2860),
             ('in', 1911),
             ('the', 1594),
             ('of', 1429),
             ('for', 1345),
             ('on', 1324),
             ('as', 1141),
             ('update', 1028),
             ('and', 943),
             ('us', 888),
             ('a', 810),
             ('with', 767),
             ('at', 663),
             ('is', 620),
             ('after', 603),
             ('new', 539),
             ('says', 406),
             ('from', 385),
             ('up', 366),
             ('by', 328),
             ('over', 277),
             ('kardashian', 272),
             ('her', 260),
             ('kim', 245),
             ('be', 237),
             ('1', 232),
             ('data', 213),
             ('first', 212),
             ('stocks', 212),
             ('are', 212),
             ('will', 211),
             ('china', 204),
             ('about', 204),
             ('shares', 201),
             ('may', 200),
      

In [3]:
ordered_word.pop(' ')
ordered_word

OrderedDict([('to', 2860),
             ('in', 1911),
             ('the', 1594),
             ('of', 1429),
             ('for', 1345),
             ('on', 1324),
             ('as', 1141),
             ('update', 1028),
             ('and', 943),
             ('us', 888),
             ('a', 810),
             ('with', 767),
             ('at', 663),
             ('is', 620),
             ('after', 603),
             ('new', 539),
             ('says', 406),
             ('from', 385),
             ('up', 366),
             ('by', 328),
             ('over', 277),
             ('kardashian', 272),
             ('her', 260),
             ('kim', 245),
             ('be', 237),
             ('1', 232),
             ('data', 213),
             ('first', 212),
             ('stocks', 212),
             ('are', 212),
             ('will', 211),
             ('china', 204),
             ('about', 204),
             ('shares', 201),
             ('may', 200),
             ('more', 195),
    

In [4]:
import pandas as pd

df = pd.DataFrame.from_dict(ordered_word, orient="index", columns=["COUNT"])
df

Unnamed: 0,COUNT
to,2860
in,1911
the,1594
of,1429
for,1345
...,...
humankind,1
sole,1
curator,1
healthkit,1


In [5]:
i = 0

def setId(count, axis=None):
  global i
  i += 1
  if count > 1:
    return i
  else:
    return 0

In [6]:
df["ID"] = df["COUNT"].apply(setId, axis=1)
df

Unnamed: 0,COUNT,ID
to,2860,1
in,1911,2
the,1594,3
of,1429,4
for,1345,5
...,...,...
humankind,1,0
sole,1,0
curator,1,0
healthkit,1,0


In [7]:
df.to_csv("./id.csv")

In [8]:
def search_id(word: str) -> int:
  try:
    return df.loc[[word]]["ID"].values[0]
  except:
    return 0

print(search_id("to"))

1


In [9]:
def tokenizer(title: str):
  words = []

  code_regex = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
  text = code_regex.sub('', title.rstrip().lower())

  doc = nlp(text)

  for token in doc:
    id = search_id(token.text)
    words.append(id)

  return words

In [10]:
print(tokenizer("Justin Bieber Under Investigation For Attempted Robbery At Dave  Busters"))

[83, 111, 335, 1071, 5, 4125, 2851, 13, 1831, 0, 0]
