In [1]:
import sqlite3
import pandas as pd
from collections import Counter

In [2]:
# Open the SQLite database file that the query cell below reads from.
conn = sqlite3.connect('words.db')
# Cursor on the same connection; the cells visible here query through `conn` directly.
cur = conn.cursor()

In [3]:
# Part-of-speech tags used as the column labels of the summary tables.
list_of_pos = [
    'NOUN', 'ADJF', 'ADJS', 'COMP',
    'VERB', 'INFN', 'PRTF', 'PRTS', 'GRND',
    'NUMR', 'ADVB', 'NPRO', 'PRED',
    'PREP', 'CONJ', 'PRCL', 'INTJ',
]

In [4]:
# Values of the `type` column used as the row labels of the summary tables.
list_of_types = [
    'неоднозначно по лемме',               # ambiguous by lemma
    'неоднозначно по параметрам',          # ambiguous by parameters
    'неоднозначно по части речи',          # ambiguous by part of speech
    'неоднозначно по части речи и лемме',  # ambiguous by part of speech and lemma
    'однозначно',                          # unambiguous
]

In [5]:
# Background colours for the percentage heat-map; color_reds() picks one by index.
# NOTE(review): the last two entries are identical — confirm this is intended.
color_list = [
    '#FDEBD0', '#FAD7A0',
    '#F8C471', '#F5B041',
    '#F39C12', '#E67E22',
    '#D35400', '#A93226',
    '#922B21', '#922B21',
]

In [6]:
def make_df(l1, l2):
    """Build a zero-filled float table.

    Parameters
    ----------
    l1 : iterable
        Column labels of the resulting frame.
    l2 : sequence
        Row (index) labels of the resulting frame.

    Returns
    -------
    pandas.DataFrame
        One column per item of ``l1`` and one row per item of ``l2``,
        every cell initialised to ``0.0``.
    """
    zero_columns = {column: [0.0 for _ in l2] for column in l1}
    table = pd.DataFrame(zero_columns)
    table.index = l2
    return table

## Таблица с абсолютными значениями

In [7]:
# One row per word: its `type` label and the comma-separated POS tags of all
# its rows in pos_tag_lemma (words of type 'несловарно' are excluded).
# The aggregate is aliased in SQL (`as pos`) because SQLite leaves the name of
# an un-aliased result column unspecified, so renaming it afterwards by its
# literal expression text is fragile.
# NOTE(review): GROUP_CONCAT has no explicit ordering, so the order of tags
# inside `pos` is arbitrary — confirm that is acceptable for the later step
# that keeps only the first tag.
df = pd.read_sql_query('''
                select pos_tag_lemma.word, main.type,
                       GROUP_CONCAT(pos_tag_lemma.pos, ', ') as pos
                from pos_tag_lemma
                join main on main.word = pos_tag_lemma.word
                where pos_tag_lemma.pos is not null and type != 'несловарно'
                group by pos_tag_lemma.word
                order by freq DESC
                -- limit 10000''', conn)

In [8]:
# Keep only the first POS tag of each word, e.g. "NOUN, ADJF" -> "NOUN".
# Done as a column assignment: rows yielded by DataFrame.iterrows() may be
# copies, so assigning to them is not guaranteed to change `df`.
df['pos'] = df['pos'].str.split().str[0].str.rstrip(',')

In [9]:
# For every `type`, collect the POS tags of its words into one list
# (single-column frame named 'pos', indexed by type).
pos_lists_by_type = df.groupby("type")["pos"].agg(list)
grouped = pos_lists_by_type.to_frame()

In [10]:
# Empty count table: one column per POS tag, one row per type.
final = make_df(l1=list_of_pos, l2=list_of_types)

In [11]:
# Fill the count table: row = type, column = POS tag,
# value = number of words of that type carrying that tag.
for type_label, type_row in grouped.iterrows():
    for tag, n_words in Counter(type_row['pos']).items():
        final.at[type_label, tag] = n_words

In [12]:
# Display the table of absolute counts.
final

Unnamed: 0,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ
неоднозначно по лемме,649.0,48.0,0.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
неоднозначно по параметрам,4591.0,2433.0,0.0,0.0,166.0,57.0,322.0,21.0,3.0,46.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0
неоднозначно по части речи,68.0,118.0,0.0,0.0,3.0,4.0,0.0,2.0,0.0,0.0,25.0,1.0,1.0,20.0,17.0,19.0,3.0
неоднозначно по части речи и лемме,269.0,351.0,39.0,4.0,39.0,4.0,52.0,32.0,3.0,7.0,208.0,16.0,5.0,16.0,21.0,5.0,1.0
однозначно,4842.0,347.0,52.0,18.0,2168.0,640.0,27.0,430.0,67.0,9.0,116.0,26.0,4.0,22.0,12.0,13.0,1.0


## Таблица с процентами (100% — вся омонимия какого-то типа)

In [13]:
# Empty percentage table with the same layout as the count table.
procent = make_df(l1=list_of_pos, l2=list_of_types)

In [14]:
# Convert counts to row-wise percentages: each value is the share of a POS tag
# among all words of that type, rounded to two decimals.
# The values stay numeric. Writing '{:.2f}' strings into the float columns only
# works through implicit string->float coercion, which newer pandas versions
# no longer apply silently, and the colouring step compares cells numerically.
row_totals = final.sum(axis=1)
procent.loc[:, list_of_pos] = (
    final[list_of_pos]
    .div(row_totals, axis=0)
    .mul(100)
    .round(2)
    .fillna(0.0)  # a type with no words (0/0) is reported as 0 instead of NaN
)

In [15]:
# Display the table of percentages.
procent

Unnamed: 0,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ
неоднозначно по лемме,89.89,6.65,0.0,0.14,2.77,0.0,0.0,0.0,0.0,0.0,0.0,0.55,0.0,0.0,0.0,0.0,0.0
неоднозначно по параметрам,59.97,31.78,0.0,0.0,2.17,0.74,4.21,0.27,0.04,0.6,0.0,0.22,0.0,0.0,0.0,0.0,0.0
неоднозначно по части речи,24.2,41.99,0.0,0.0,1.07,1.42,0.0,0.71,0.0,0.0,8.9,0.36,0.36,7.12,6.05,6.76,1.07
неоднозначно по части речи и лемме,25.09,32.74,3.64,0.37,3.64,0.37,4.85,2.99,0.28,0.65,19.4,1.49,0.47,1.49,1.96,0.47,0.09
однозначно,55.06,3.95,0.59,0.2,24.65,7.28,0.31,4.89,0.76,0.1,1.32,0.3,0.05,0.25,0.14,0.15,0.01


In [16]:
def color_reds(value, palette=None):
    """Return a CSS ``background-color`` rule for one numeric cell.

    Values ``<= 1`` use the first palette entry. Larger values use entry
    ``value // 10 + 1``, capped at the last entry so that values of 90 and
    above do not index past the end of the palette.

    Parameters
    ----------
    value : number
        Cell value to colour.
    palette : sequence of str, optional
        CSS colour strings indexed by bucket. Defaults to the module-level
        ``color_list``.

    Returns
    -------
    str
        ``'background-color: <colour>'``, or ``''`` (no styling) for NaN.
    """
    if palette is None:
        palette = color_list
    if value != value:  # NaN never equals itself; it has no bucket
        return ''
    if value <= 1:
        bucket = 0
    else:
        # Clamp: e.g. 95 // 10 + 1 == 10, one past the end of a 10-colour palette.
        bucket = min(int(value // 10 + 1), len(palette) - 1)
    return 'background-color: %s' % palette[bucket]

In [17]:
# Render the percentage table as a heat-map: colour each cell by its value
# and show two decimals.
(
    procent.style
    .applymap(color_reds)
    .format("{:.2f}")
)

Unnamed: 0,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ
неоднозначно по лемме,89.89,6.65,0.0,0.14,2.77,0.0,0.0,0.0,0.0,0.0,0.0,0.55,0.0,0.0,0.0,0.0,0.0
неоднозначно по параметрам,59.97,31.78,0.0,0.0,2.17,0.74,4.21,0.27,0.04,0.6,0.0,0.22,0.0,0.0,0.0,0.0,0.0
неоднозначно по части речи,24.2,41.99,0.0,0.0,1.07,1.42,0.0,0.71,0.0,0.0,8.9,0.36,0.36,7.12,6.05,6.76,1.07
неоднозначно по части речи и лемме,25.09,32.74,3.64,0.37,3.64,0.37,4.85,2.99,0.28,0.65,19.4,1.49,0.47,1.49,1.96,0.47,0.09
однозначно,55.06,3.95,0.59,0.2,24.65,7.28,0.31,4.89,0.76,0.1,1.32,0.3,0.05,0.25,0.14,0.15,0.01
