In [1]:
import sys
import warnings

# Put the parent directory on the import path so the `src` package can be
# imported when the notebook is run from its own folder. The membership check
# keeps the cell idempotent: re-running it no longer appends duplicate
# entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

from src.webapp_utility import Loader

# Silence every warning so the demo output stays readable.
# NOTE(review): this is a blanket filter and also hides deprecation/version
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Build the shared Loader used by every cell in this notebook. According to the
# recorded log, construction loads the count vectorizers, the full and small
# LDA models, and the word embeddings.
l = Loader()

Loading full count vectorizers... Done
Loading full lda model... Done
Loading small lda model... Done
Loading word embeddings... Done


### Word frequency

In [3]:
# Frequency distribution of "gun" with interval=20; the recorded output is a
# list of (year, value) pairs spaced 20 years apart.
# NOTE(review): the 1760 entry reports 1.0, far above every other entry, and
# there is a gap between 1760 and 1840 — presumably an artifact of very sparse
# early data; confirm.
l.get_freq_distribution("gun", interval=20)

[(1760, 1.0),
 (1840, 0.0026345502446368085),
 (1860, 0.002485143165856294),
 (1880, 0.003918973941368078),
 (1900, 0.008018802709802295),
 (1920, 0.020330934184165615),
 (1940, 0.021543496045814018),
 (1960, 0.08811566665677048),
 (1980, 0.10936842400606044),
 (2000, 0.11250221985437756)]

### Embeddings

In [4]:
# Three most similar words to "mr" using model_type="full"; the recorded output
# is a list of (word, score) pairs.
# NOTE(review): the recorded scores are all ~1.0 for seemingly unrelated words
# ('death', 'wend', 'lame'), which suggests the "full" embeddings may be
# degenerate — verify before relying on this model.
l.get_n_similar(word="mr", n=3, model_type="full")

[('death', 1.0), ('wend', 0.9999999403953552), ('lame', 0.9999998211860657)]

In [5]:
# Three most similar words to "gun" with model_type="one" for year=2000.
# NOTE(review): "one" presumably selects the one-year embedding model for the
# given year — confirm against Loader.
l.get_n_similar(word="gun", n=3, model_type="one", year=2000)

[('knife', 0.8641740083694458),
 ('handgun', 0.8565400242805481),
 ('shotgun', 0.8561012148857117)]

In [6]:
# Same query as the previous cell but with model_type="ten" for year=2000.
# NOTE(review): "ten" presumably selects the ten-year embedding model covering
# the given year — confirm against Loader.
l.get_n_similar(word="gun", n=3, model_type="ten", year=2000)

[('handgun', 0.8175308704376221),
 ('pistol', 0.7670312523841858),
 ('shotgun', 0.7456485033035278)]

### Topics

In [7]:
# Topic distribution for the single term "gun" under model="small". The
# recorded output maps topic ids 0-13 to values that sum to ~100, so they
# appear to be percentages.
l.get_topic_dist(["gun"], model="small")

{0: 0.44,
 1: 0.338,
 2: 21.027,
 3: 48.96,
 4: 0.002,
 5: 0.015,
 6: 0.146,
 7: 0.238,
 8: 2.236,
 9: 0.0,
 10: 11.288,
 11: 0.058,
 12: 0.001,
 13: 15.251}

In [8]:
# Topic distribution for the terms "gun" and "cocaine" together under
# model="big". The recorded values sum to ~100, so they appear to be
# percentages across topic ids 0-13.
# NOTE(review): the model is called "big" here, while the loader log and
# get_n_similar use "full" — presumably the same model; confirm the naming.
l.get_topic_dist(["gun", "cocaine"], model="big")

{0: 43.027,
 1: 0.0,
 2: 2.257,
 3: 0.0,
 4: 0.004,
 5: 0.0,
 6: 0.0,
 7: 0.062,
 8: 0.009,
 9: 0.0,
 10: 0.0,
 11: 0.001,
 12: 0.0,
 13: 54.641}

In [9]:
# Top 5 words per topic for model="big"; the recorded output maps each topic id
# to a list of (word, weight) pairs.
l.get_topics_words(n=5, model="big")
# To inspect the small model instead, run: l.get_topics_words(n=5, model="small")

{0: [('testify', 0.25),
  ('people', 0.201),
  ('testimony', 0.195),
  ('jury', 0.179),
  ('witness', 0.174)],
 1: [('agreement', 0.231),
  ('fee', 0.229),
  ('award', 0.183),
  ('petitioner', 0.18),
  ('attorney', 0.177)],
 2: [('sentence', 0.26),
  ('people', 0.251),
  ('counsel', 0.169),
  ('offense', 0.162),
  ('charge', 0.158)],
 3: [('board', 0.298),
  ('commission', 0.237),
  ('employee', 0.166),
  ('claimant', 0.153),
  ('decision', 0.146)],
 4: [('respondent', 0.417),
  ('statement', 0.149),
  ('information', 0.148),
  ('testify', 0.145),
  ('mental', 0.14)],
 5: [('plaintiff', 0.582),
  ('motion', 0.158),
  ('complaint', 0.123),
  ('rule', 0.07),
  ('dismiss', 0.066)],
 6: [('property', 0.279),
  ('city', 0.222),
  ('plaintiff', 0.207),
  ('use', 0.162),
  ('lease', 0.13)],
 7: [('child', 0.319),
  ('section', 0.211),
  ('school', 0.193),
  ('petition', 0.141),
  ('board', 0.137)],
 8: [('plaintiff', 0.325),
  ('car', 0.207),
  ('jury', 0.164),
  ('injury', 0.163),
  ('neglig

### Semantic similarity over time

In [10]:
# Semantic data for "cocaine" relative to base_year=2010.
c = l.get_semantic_data("cocaine", base_year=2010)
# Show the available series; the recorded output lists 'one_year' and 'ten_year'.
print(c.keys())
# Display the 'one_year' series: (year, score) pairs counting back from the
# base year, which scores ~1.0 against itself.
# NOTE(review): years before 1974 show -1 in the recorded output — presumably a
# sentinel for years where the word is missing from the model; confirm.
c["one_year"]

dict_keys(['one_year', 'ten_year'])


[(2010, 1.0000001),
 (2009, 0.8796335),
 (2008, 0.8978368),
 (2007, 0.853726),
 (2006, 0.8547532),
 (2005, 0.8768975),
 (2004, 0.8583519),
 (2003, 0.85325295),
 (2002, 0.84031045),
 (2001, 0.8679122),
 (2000, 0.83747),
 (1999, 0.8714155),
 (1998, 0.8220672),
 (1997, 0.8828484),
 (1996, 0.8623095),
 (1995, 0.85866886),
 (1994, 0.7618509),
 (1993, 0.74331117),
 (1992, 0.700742),
 (1991, 0.73083466),
 (1990, 0.7397601),
 (1989, 0.7238395),
 (1988, 0.7545918),
 (1987, 0.79444826),
 (1986, 0.7920489),
 (1985, 0.76328886),
 (1984, 0.7869801),
 (1983, 0.77199197),
 (1982, 0.81901085),
 (1981, 0.66725475),
 (1980, 0.6760842),
 (1979, 0.649776),
 (1978, 0.73963344),
 (1977, 0.62569845),
 (1976, 0.6935027),
 (1975, 0.7067495),
 (1974, 0.5828003),
 (1973, -1),
 (1972, -1),
 (1971, -1),
 (1970, -1),
 (1969, -1),
 (1968, -1),
 (1967, -1),
 (1966, -1),
 (1965, -1),
 (1964, -1),
 (1963, -1),
 (1962, -1),
 (1961, -1),
 (1960, -1),
 (1959, -1),
 (1958, -1),
 (1957, -1),
 (1956, -1),
 (1955, -1),
 (1954