In [31]:
import pickle as pkl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from parse_util import is_cjk
from graph_util import *
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
import matplotlib.pyplot as plt
import matplotlib.font_manager as mfm

import os  # imported here because it is only needed to expand '~' on the next line

# Source Han Sans SC font file, looked up in the current user's ~/.fonts directory rather than
# under a hard-coded /home/<user> prefix, so the cell does not depend on one account name.
font_path = os.path.expanduser('~/.fonts/SourceHanSansSC-Regular.otf')
# FontProperties handle built from that file (presumably for rendering CJK text in plots;
# it is not used in the cells visible here).
prop = mfm.FontProperties(fname=font_path)

In [32]:
# NOTE: unpickling can execute arbitrary code; only load CHAR_TO_IDS.pkl from a trusted source.
with open('CHAR_TO_IDS.pkl', 'rb') as f:
    char_to_ids = pkl.load(f)

# Hand-patched entries: each of these characters is (re)bound to a bare Char(...) object,
# replacing whatever the pickle held for it. `Char` is not defined in this cell; presumably it
# arrives through `from graph_util import *` - confirm.
# NOTE(review): the original cell assigned '枭' twice in a row; the duplicate is dropped here.
# If a different character was meant on the second line, add it to this tuple.
for manual_char in ('书', '已', '亊', '枭', '袅'):
    char_to_ids[manual_char] = Char(manual_char)

In [33]:
N_YEARS_PER_BLOCK = 5
START_YEAR = 1950
END_YEAR = 2020  # the final block is the first one whose start year is >= END_YEAR

# Number of block-sized steps needed to get from START_YEAR to END_YEAR, rounded up
# (ceiling division written with floor division so no extra import is needed).
n_blocks_after_start = max(0, -(-(END_YEAR - START_YEAR) // N_YEARS_PER_BLOCK))

# Start year of every block: START_YEAR, START_YEAR + N_YEARS_PER_BLOCK, ... up to and
# including the first start year that reaches END_YEAR (1950, 1955, ..., 2020 with these values).
time_periods = [START_YEAR + i * N_YEARS_PER_BLOCK for i in range(n_blocks_after_start + 1)]

In [34]:
# Load the per-term DataFrame from a local pickle. Later cells read its `pos` and `term`
# columns and one `<year>_n_occ` column per entry of `time_periods`.
# NOTE: unpickling can execute arbitrary code - only load a file you trust.
df = pd.read_pickle('df_full_5y.pkl')

# Sanity check POS tags for frequently occurring terms

In [35]:
# Distinct values of the `pos` column. Despite the name this is a set, so it has no fixed order.
pos_list = set(df['pos'])

In [36]:
MIN_TOTAL_OCCURRENCE_THRESHOLD = 200

def row_is_at_least_threshold(row):
    """Return True if the row's `<period>_n_occ` values, summed over every period in
    `time_periods`, reach MIN_TOTAL_OCCURRENCE_THRESHOLD.

    Kept as the row-wise reference definition; the loop below uses an equivalent
    vectorised mask instead of calling this once per row.
    """
    return sum(row[f'{t}_n_occ'] for t in time_periods) >= MIN_TOTAL_OCCURRENCE_THRESHOLD

# One vectorised pass over the whole frame instead of a Python-level apply(axis=1) per POS tag.
# skipna=False keeps the row-wise behaviour: a NaN count makes the total NaN, which fails `>=`.
n_occ_columns = [f'{t}_n_occ' for t in time_periods]
is_frequent = df[n_occ_columns].sum(axis=1, skipna=False) >= MIN_TOTAL_OCCURRENCE_THRESHOLD

# pos_list is a set, so iterate it in sorted order to make the printout reproducible between
# runs (key=str keeps the sort from failing if a non-string tag such as NaN is present).
for pos in sorted(pos_list, key=str):
    frequent_terms_this_pos = df.loc[(df.pos == pos) & is_frequent, 'term']
    print(pos)
    print(frequent_terms_this_pos.head(10))
    print()

NUM
44     信教
62     绥靖
72     五一
82     一倍
105    力学
115     券
122    应交
135     荐
230    回族
235    视点
Name: term, dtype: object

ADP
46       比较
71       遵照
209      另选
441      助长
682      如一
686      会对
1120     以近
1189      等
1330     全校
1346    换言之
Name: term, dtype: object

X
143        受
2245      把这
3223      各有
3468      先把
4911       呵
5383    第二十条
5569      将成
5724      不能
8399       啊
8939      要把
Name: term, dtype: object

ADJ
1        或多
51       些小
53       撰稿
68       信访
86       浑然
140      每周
142      例行
149      恐吓
186      活性
187    无所作为
Name: term, dtype: object

PRON
796      这部
2592     各地
3027      子
3192     本书
3283      饶
4867     何谓
4938     她们
5181     那个
7296     该条
7640    大事记
Name: term, dtype: object

VERB
2     窝里斗
5      围捕
11    可不是
14     立起
24     珍重
28      势
29     就职
31     着落
59    合理化
67     加深
Name: term, dtype: object

NOUN
0        注销
3        叹息
12       刘锋
13        吁
15       晋阳
17       建德
18       金花
19    安纳托利亚
25       横扫
26      长绒棉

# Conclusion - there are still a lot of bad POS tags (and it is difficult to choose a good threshold)

In [37]:
def row_get_total_num_occurrences(row):
    """Add up the `<period>_n_occ` entries of `row` over every period in `time_periods`."""
    total = 0
    for period_start in time_periods:
        total += row[f'{period_start}_n_occ']
    return total

In [38]:
# Total occurrences per term across all periods, computed as a vectorised column sum rather than
# a row-by-row apply. skipna=False propagates NaN exactly as row_get_total_num_occurrences would.
df['total_occ'] = df[[f'{t}_n_occ' for t in time_periods]].sum(axis=1, skipna=False)

## 50 most common NOUNs look good.

In [39]:
# Rows tagged NOUN, ranked by total_occ from highest to lowest; display the first 50.
df.loc[df['pos'] == 'NOUN'].sort_values(by='total_occ', ascending=False).iloc[:50]

Unnamed: 0,term,pos,1950_n_occ,1950_n_occ_norm,1955_n_occ,1955_n_occ_norm,1960_n_occ,1960_n_occ_norm,1965_n_occ,1965_n_occ_norm,...,2000_n_occ_norm,2005_n_occ,2005_n_occ_norm,2010_n_occ,2010_n_occ_norm,2015_n_occ,2015_n_occ_norm,2020_n_occ,2020_n_occ_norm,total_occ
56441,经济,NOUN,2785,471.238462,37903,450.955168,28151,379.359417,15485,262.239104,...,1861.378385,6514093,1506.915084,425815,1108.11766,172947,483.869599,0,0,53590802
59420,社会,NOUN,2361,361.04329,257840,666.952692,117770,616.123672,53572,442.173963,...,1426.765189,6946703,1472.551085,444880,1085.396405,197825,459.795585,0,0,47133370
152016,年,NOUN,5038,211.116228,237974,625.549545,72370,359.058006,23682,153.081722,...,1408.107756,5972164,1226.733106,529684,932.481991,205613,349.285034,0,0,38830101
15294,人,NOUN,5356,183.870774,247887,563.726206,98141,446.222117,52379,317.326221,...,1263.364762,6787625,1297.922598,533235,829.189667,410880,514.000926,0,0,38489408
126970,企业,NOUN,3180,861.071429,54170,400.686032,30963,422.553368,6338,106.480535,...,2074.794202,5121164,1652.762315,465845,1695.819492,210096,628.920007,0,0,38121510
147578,主义,NOUN,3848,748.814379,455263,1095.467707,230270,1191.004263,117412,969.969986,...,804.703304,2583209,666.045698,123246,432.208692,50499,194.889085,0,0,30635464
51062,发展,NOUN,1151,140.34127,68665,296.806656,48984,294.267981,15910,157.668373,...,1104.983354,4949543,1087.609121,374111,945.328196,134703,316.497554,0,0,29844476
147773,国家,NOUN,5300,613.823622,155301,488.391185,57662,350.633716,24491,253.100568,...,1077.115961,4311412,944.641321,293510,739.414775,144147,352.913569,0,0,29620793
3367,中国,NOUN,2988,292.840659,110757,323.21998,66375,346.874416,49150,498.52034,...,1125.84448,5579103,1195.956955,420573,1014.076867,204723,482.233143,0,0,28625643
55170,工作,NOUN,6447,275.449394,166503,431.154395,86901,474.962097,40281,294.963455,...,921.092027,3945061,873.174856,417871,979.531038,135977,249.399263,0,0,26406968


## 50 most common VERBs look good.

In [40]:
# Rows tagged VERB, ranked by total_occ from highest to lowest; display the first 50.
df.loc[df['pos'] == 'VERB'].sort_values(by='total_occ', ascending=False).iloc[:50]

Unnamed: 0,term,pos,1950_n_occ,1950_n_occ_norm,1955_n_occ,1955_n_occ_norm,1960_n_occ,1960_n_occ_norm,1965_n_occ,1965_n_occ_norm,...,2000_n_occ_norm,2005_n_occ,2005_n_occ_norm,2010_n_occ,2010_n_occ_norm,2015_n_occ,2015_n_occ_norm,2020_n_occ,2020_n_occ_norm,total_occ
6035,是,VERB,7961,308.114871,750202,1584.087987,366055,1530.633342,181851,1188.584657,...,3137.660532,15986701,2969.477376,1271250,1928.08042,1040039,1313.737235,0,0,112846278
133499,有,VERB,6196,324.268254,327666,734.439579,159546,687.432271,77186,471.810573,...,1577.029075,7551354,1426.182568,523593,905.526817,341992,495.957321,0,0,55364913
59595,要,VERB,2068,184.684253,143283,363.292544,76034,433.976433,57742,461.931217,...,875.558631,3690560,767.697131,278057,540.276772,174328,286.346075,0,0,27962709
15443,为,VERB,2319,206.987583,122097,332.985724,51100,281.968565,24875,217.280183,...,830.335954,3816531,795.80913,323978,750.989062,162918,342.828215,0,0,24573891
95718,到,VERB,2029,150.865175,157416,393.094984,67888,344.750434,39865,305.153405,...,704.68996,3428142,694.479127,235330,483.158935,143509,257.485251,0,0,22868975
70010,进行,VERB,2148,244.349206,94840,334.895105,50763,296.806349,24655,233.727103,...,603.736242,2729140,584.268664,237185,612.551072,126488,314.935563,0,0,17407252
77084,说,VERB,1029,116.5625,45797,453.078569,11120,154.263945,20782,319.675571,...,472.670777,2060451,451.385262,147076,362.090308,141022,279.913483,0,0,15621132
1624,可以,VERB,1077,127.290643,95033,260.036363,43022,244.51698,19413,168.994864,...,544.441728,2685092,552.476514,212599,478.051717,184210,366.02266,0,0,15391279
92298,发展,VERB,843,121.601891,45908,207.420155,29652,198.573818,12947,135.611007,...,534.555714,2113158,494.057407,156972,443.20269,60779,159.713313,0,0,14347271
20226,使,VERB,1517,127.383333,98235,265.619558,40911,235.63733,20450,191.811573,...,436.779902,1613617,352.211732,121748,303.961645,90053,198.512215,0,0,13767719


## 50 most common ADJs look good.

In [41]:
# Rows tagged ADJ, ranked by total_occ from highest to lowest; display the first 50.
df.loc[df['pos'] == 'ADJ'].sort_values(by='total_occ', ascending=False).iloc[:50]

Unnamed: 0,term,pos,1950_n_occ,1950_n_occ_norm,1955_n_occ,1955_n_occ_norm,1960_n_occ,1960_n_occ_norm,1965_n_occ,1965_n_occ_norm,...,2000_n_occ_norm,2005_n_occ,2005_n_occ_norm,2010_n_occ,2010_n_occ_norm,2015_n_occ,2015_n_occ_norm,2020_n_occ,2020_n_occ_norm,total_occ
42823,大,ADJ,1579,80.440404,80990,217.161781,37326,199.011842,24155,208.661416,...,569.152833,2358170,509.571105,150555,326.985713,67103,135.107399,0,0,15213168
35558,新,ADJ,971,49.776096,59313,164.340983,28426,165.164659,17616,141.565581,...,458.93396,1923907,446.296757,178069,425.115305,77429,163.661863,0,0,12251592
57297,性,ADJ,580,49.172727,35936,117.214799,20576,142.754482,7394,73.827561,...,443.2601,2098655,490.062096,139813,327.689979,53841,128.235787,0,0,11091773
127341,重要,ADJ,518,42.254579,20260,69.645147,12572,92.810667,5641,58.736843,...,317.618868,1348929,312.338954,101877,263.173298,39957,92.277251,0,0,7756477
44686,基本,ADJ,615,56.935714,26979,95.385948,12762,103.676527,4780,60.310298,...,275.868264,1049436,267.404108,93142,244.281118,35427,80.937725,0,0,7199426
86040,主要,ADJ,610,44.763722,33222,107.181044,16769,110.675141,6842,67.3179,...,257.011877,979651,236.707096,95974,267.089626,36714,90.756892,0,0,6791595
129389,有关,ADJ,1248,136.285714,16236,77.714669,6958,63.900688,2899,42.388326,...,299.742998,1076727,261.179074,61765,199.572615,22413,65.627496,0,0,6436236
121422,总,ADJ,868,130.171429,35961,144.505836,10922,91.210376,4429,63.157892,...,248.373003,728388,205.259245,88906,306.441322,21594,70.655356,0,0,6360702
31124,不同,ADJ,309,42.416667,22789,86.281341,15284,115.696108,4845,56.305849,...,201.745178,866747,211.116849,75066,204.758011,49127,128.460951,0,0,5421742
157996,一定,ADJ,277,42.141026,28739,102.049505,15934,120.025352,4230,49.023105,...,177.159003,625346,160.659878,49122,140.399966,29802,83.399336,0,0,4910689


## 50 most common ADVs look mostly good, apart from a few that I'm unsure about.

- 一个 ('a', 'one') - not sure in which contexts this is an adverb
- 但是 ('but')
- 在 (progressive aspect marker)

In [42]:
# Rows tagged ADV, ranked by total_occ from highest to lowest; display the first 50.
df.loc[df['pos'] == 'ADV'].sort_values(by='total_occ', ascending=False).iloc[:50]

Unnamed: 0,term,pos,1950_n_occ,1950_n_occ_norm,1955_n_occ,1955_n_occ_norm,1960_n_occ,1960_n_occ_norm,1965_n_occ,1965_n_occ_norm,...,2000_n_occ_norm,2005_n_occ,2005_n_occ_norm,2010_n_occ,2010_n_occ_norm,2015_n_occ,2015_n_occ_norm,2020_n_occ,2020_n_occ_norm,total_occ
117807,不,ADV,4899,285.753082,287545,669.979219,131919,609.076172,70928,516.106656,...,1201.368837,5738554,1121.638683,354161,619.454644,209640,322.182568,0,0,40025461
158352,也,ADV,2026,164.003571,198780,481.597637,95335,454.753243,41103,322.018026,...,843.517724,4093135,818.555779,327374,676.743486,268413,484.474879,0,0,27843101
110456,就,ADV,2124,155.650668,210350,519.97918,92227,485.302032,51496,413.223246,...,591.487546,2671323,557.604367,206777,452.581406,182117,325.333263,0,0,20115570
56974,都,ADV,1416,118.045022,124948,320.308876,58521,301.394332,30865,248.0974,...,533.539367,2512818,519.887514,182241,411.473889,149417,292.100297,0,0,17531209
150869,还,ADV,1265,159.778279,108461,291.775731,48357,261.086448,24796,218.302563,...,444.972791,1929913,413.490341,124808,299.851026,78115,186.52349,0,0,14300899
144317,但,ADV,1590,140.470455,66771,188.554128,30399,181.194563,11441,110.328812,...,485.960072,2166569,464.60663,174909,400.199715,145546,322.53541,0,0,14179088
130894,一个,ADV,1121,120.55291,91338,246.613289,44701,246.674347,25739,228.17471,...,478.815027,2278405,471.20769,179883,429.102375,183667,376.219587,0,0,14133598
157506,而,ADV,1044,124.177273,111901,309.997443,45117,263.439654,16123,153.242425,...,468.209097,2083185,457.22622,158098,366.609171,133836,312.568951,0,0,13663807
83310,更,ADV,710,76.80303,66964,187.286091,35877,212.373847,15550,137.32179,...,397.267683,1852788,411.12574,116572,254.525036,76310,157.538452,0,0,10465937
151398,就是,ADV,831,82.20101,97671,265.358598,44653,268.168853,25461,241.148328,...,326.099473,1430278,319.855131,118561,282.584565,124753,246.751908,0,0,10042478


## 50 most common CONJs look good.

In [43]:
# Rows tagged CONJ, ranked by total_occ from highest to lowest; display the first 50.
df.loc[df['pos'] == 'CONJ'].sort_values(by='total_occ', ascending=False).iloc[:50]

Unnamed: 0,term,pos,1950_n_occ,1950_n_occ_norm,1955_n_occ,1955_n_occ_norm,1960_n_occ,1960_n_occ_norm,1965_n_occ,1965_n_occ_norm,...,2000_n_occ_norm,2005_n_occ,2005_n_occ_norm,2010_n_occ,2010_n_occ_norm,2015_n_occ,2015_n_occ_norm,2020_n_occ,2020_n_occ_norm,total_occ
147684,和,CONJ,17263,513.666312,713079,1424.430386,353599,1383.238685,168158,1015.39951,...,3758.443103,19175862,3601.794684,1713464,3076.669344,876542,1331.217622,0,0,125164893
101216,与,CONJ,2618,218.795455,83425,245.040326,33808,214.224449,9169,98.403237,...,990.488011,4941726,1022.758713,462934,885.980173,266623,403.921504,0,0,26783767
112837,或,CONJ,5129,492.052632,71467,228.868899,31664,195.512528,13020,139.72351,...,516.012791,2158429,469.048417,217262,494.779355,168774,349.979886,0,0,14556404
100395,及,CONJ,3952,153.863433,41517,123.318418,17833,94.796681,8606,63.865071,...,446.593651,1949598,426.99926,239461,519.277528,75753,140.896108,0,0,11247624
42067,并,CONJ,2076,210.756944,46569,160.650607,18248,127.298701,7666,89.877264,...,367.815632,1686937,365.903624,137792,357.498583,70739,181.082065,0,0,9897414
17214,以及,CONJ,1146,125.013305,40108,132.755638,16067,107.518681,5469,65.661728,...,277.070049,1309937,296.849035,120334,311.828016,67252,192.097172,0,0,7386611
59298,如果,CONJ,711,118.797222,62793,195.228982,23671,169.012617,10588,113.85562,...,253.768552,1066199,250.806548,99866,262.903997,109823,276.889639,0,0,6492408
660,不是,CONJ,474,55.875,52738,169.661847,24701,167.787272,11375,123.575438,...,189.31961,729423,180.623016,49913,135.138339,53425,137.903186,0,0,5403205
16241,或者,CONJ,517,94.34697,30570,118.941313,10204,88.800904,5079,70.378932,...,273.800386,1400962,343.472682,71142,220.43501,47654,149.933805,0,0,5373650
155586,虽然,CONJ,192,30.516667,23662,80.933038,10823,78.91197,4108,49.95383,...,119.268384,473862,121.570525,39980,114.933936,32890,82.539434,0,0,2990311
