In [40]:
# Standard library
import glob

# Third-party
import pandas as pd
import stanza
from tqdm import tqdm

# Hook tqdm into pandas so that `progress_apply` is available on Series/DataFrames.
tqdm.pandas()

# Make the Hungarian stanza models available and build the pipeline object
# that `lemmatize` calls as the global `nlp`.
stanza.download(lang='hu')
nlp = stanza.Pipeline(lang='hu')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 48.6MB/s]                    
2020-07-23 12:06:57 INFO: Downloading default packages for language: hu (Hungarian)...
2020-07-23 12:06:58 INFO: File exists: /mnt/volume/jupyter/stanza_resources/hu/default.zip.
2020-07-23 12:07:02 INFO: Finished downloading models and saved to /mnt/volume/jupyter/stanza_resources.
2020-07-23 12:07:02 INFO: Loading these models for language: hu (Hungarian):
| Processor | Package |
-----------------------
| tokenize  | szeged  |
| pos       | szeged  |
| lemma     | szeged  |
| depparse  | szeged  |

2020-07-23 12:07:02 INFO: Use device: cpu
2020-07-23 12:07:02 INFO: Loading: tokenize
2020-07-23 12:07:02 INFO: Loading: pos
2020-07-23 12:07:04 INFO: Loading: lemma
2020-07-23 12:07:05 INFO: Loading: depparse
2020-07-23 12:07:06 INFO: Done loading processors!


In [75]:
# Result CSVs to analyse. `glob.glob` yields paths in arbitrary,
# filesystem-dependent order; sorting makes the processing order (and therefore
# which file is read last by the loading loop) reproducible between runs.
resfiles = sorted(glob.glob('resultfiles/*06_*.csv'))

In [78]:
# Marker substrings: a result-file column is selected for counting when its
# name contains at least one of these.
dictstrings = [
    'person_data',
    'wikilist',
    'settlement_list',
    'stanza',
    'szotar_2.1',
]

In [79]:
def lemmatize(text):
    """Return the space-joined lemmas of ``text`` using the global ``nlp`` pipeline.

    Parameters
    ----------
    text : str or any
        Entry to lemmatize. Anything that is not a string (for example the
        float NaN that pandas uses for missing cells) and any blank string is
        returned unchanged without calling the pipeline.

    Returns
    -------
    str or the original value
        The lemmas of every word of every sentence in ``text``, joined by
        single spaces.
    """
    # Nothing to analyse: pass the value through untouched.
    if not isinstance(text, str) or not text.strip():
        return text
    doc = nlp(text)
    # Walk all sentences, not just the first one, so text after a sentence
    # break is not silently dropped. If a word carries no lemma, keep its
    # surface form so that `join` never receives None.
    return ' '.join(
        word.lemma if word.lemma is not None else word.text
        for sentence in doc.sentences
        for word in sentence.words
    )

In [80]:
# Per-file frequency tables: file path -> DataFrame with one row per distinct
# entry ('unique_entries_not_lemmatized') and its number of occurrences
# ('counts') across the selected columns of that file.
res = {}
for resfile in resfiles:
    df = pd.read_csv(resfile)
    # Select every matching column exactly once. Iterating over `dictstrings`
    # in the outer position would add a column once per matching marker, so a
    # column whose name contains two markers would have its entries counted twice.
    matching_cols = [col for col in df.columns
                     if any(dictstring in col for dictstring in dictstrings)]
    if not matching_cols:
        # pd.concat raises ValueError on an empty list; a file without any
        # matching column simply contributes nothing.
        continue
    freqdf = (
        pd.concat([df[col] for col in matching_cols])
        .value_counts()
        .rename_axis('unique_entries_not_lemmatized')
        .reset_index(name='counts')
    )
    res[resfile] = freqdf

In [118]:
# Combine the per-file frequency tables into one overall ranking: stack them,
# sum the counts of identical entries, and order from most to least frequent.
concated_df = (
    pd.concat(list(res.values()))
    .groupby('unique_entries_not_lemmatized')
    .agg({'counts': 'sum'})
    .sort_values(by=['counts'], ascending=False)
    .reset_index()
)

In [156]:
# Cut-off used for the final ranking of lemmatized entries; it also appears in
# the name of the exported text file.
limit = 50

# Widening factor: raw entries up to index label int(limit * multip) are the
# ones that get lemmatized.
multip = 1.2

In [157]:
# Preview the most frequent raw entries: index labels 0 .. int(limit * multip), inclusive.
concated_df.truncate(after=int(limit * multip))

Unnamed: 0,unique_entries_not_lemmatized,counts,lemmatized
0,Orbán Viktor,6816,Orbán Viktor
1,Varga Mihály,3555,Varga Mihály
2,Szijjártó Péter,2951,Szijjártó Péter
3,Karácsony Gergely,1934,Karácsony Gergely
4,Donald Trump,1569,Donald Trump
...,...,...,...
56,Lovász László,272,Lovász László
57,Gyurcsány Ferenc,262,Gyurcsány Ferenc
58,Tiborcz István,257,Tiborcz István
59,Jakab Péter,256,Jakab Péter


In [158]:
# Lemmatize only the most frequent raw entries (index labels up to
# int(limit * multip), inclusive). The assignment aligns on the index, so rows
# outside that slice receive NaN in 'lemmatized'.
concated_df['lemmatized'] = (
    concated_df['unique_entries_not_lemmatized']
    .truncate(after=int(limit * multip))
    .progress_apply(lemmatize)
)

100%|██████████| 61/61 [01:08<00:00,  1.13s/it]


In [159]:
# Re-aggregate on the lemmatized form so that raw entries sharing a lemma are
# counted together (groupby drops rows whose 'lemmatized' is NaN), rank by
# total count and keep the top `limit` entries.
# `truncate(after=limit)` is inclusive of label `limit` and therefore kept
# limit + 1 rows; `head(limit)` keeps exactly `limit`.
freqdf_onlylemmatized = (
    concated_df
    .groupby(['lemmatized'])
    .agg({'counts': 'sum'})
    .sort_values(by=['counts'], ascending=False)
    .reset_index()
    .head(limit)
)

In [160]:
# Display the ranking of lemmatized entries ('lemmatized', 'counts').
freqdf_onlylemmatized

Unnamed: 0,lemmatized,counts
0,Orbán Viktor,6816
1,Varga Mihály,3555
2,Szijjártó Péter,2951
3,Karácsony Gergely,1934
4,Donald Trump,1569
5,Egyesült Államok,1534
6,##### EXTRA,1348
7,Schanda Tamás,1187
8,404 - article,1093
9,on site,1093


In [161]:
# Empty (all-NaN) square table labelled on both axes by the top lemmatized
# entries; the row axis is named 'row' and the column axis 'column'.
cooc = pd.DataFrame(
    index=freqdf_onlylemmatized['lemmatized'],
    columns=freqdf_onlylemmatized['lemmatized'],
).rename_axis(index='row', columns='column')

In [162]:
# Columns of `df` whose name contains at least one of the `dictstrings` markers.
# NOTE(review): `df` is the frame left over from the last iteration of the
# file-loading loop — confirm that a single file is what is intended here.
dictcols = [
    col for col in df.columns
    if any(dictstring in col for dictstring in dictstrings)
]

In [163]:
# Fill the strictly lower triangle of `cooc`: cell (row label, column label)
# receives the number of rows of `df[dictcols]` that contain both labels.
# The diagonal and the upper triangle are left untouched.
#
# NOTE(review): `df` is the frame left over from the last iteration of the
# file-loading loop, and the labels are lemmatized forms while the cells of
# `df` are compared as-is — confirm both are intended.

# One set of cell values per data row, built a single time instead of
# re-iterating the frame (and rebuilding lists) for every pair of labels.
row_entries = [
    set(values)
    for values in df[dictcols].itertuples(index=False, name=None)
]

# For every label, the positions of the data rows in which it occurs.
rows_containing = {
    label: {pos for pos, entries in enumerate(row_entries) if label in entries}
    for label in set(cooc.index) | set(cooc.columns)
}

for icoocrow, coocrow in enumerate(cooc.index):
    # Only columns positioned before the current row: the lower triangle.
    for cooccol in cooc.columns[:icoocrow]:
        cooc.loc[coocrow, cooccol] = len(
            rows_containing[coocrow] & rows_containing[cooccol]
        )
    # Progress indicator, one value per finished row.
    print(icoocrow / limit)

0.0
0.02
0.04
0.06
0.08
0.1
0.12
0.14
0.16
0.18
0.2
0.22
0.24
0.26
0.28
0.3
0.32
0.34
0.36
0.38
0.4
0.42
0.44
0.46
0.48
0.5
0.52
0.54
0.56
0.58
0.6
0.62
0.64
0.66
0.68
0.7
0.72
0.74
0.76
0.78
0.8
0.82
0.84
0.86
0.88
0.9
0.92
0.94
0.96
0.98
1.0


In [164]:
# Mirror the strictly lower triangle of `cooc` onto the upper triangle so the
# table becomes symmetric (the diagonal stays NaN).
# Only the lower triangle of the current table is used as input, so running
# this cell again reproduces the same table instead of doubling every count.
n_rows, n_cols = cooc.shape
below_diagonal = pd.DataFrame(
    [[i > j for j in range(n_cols)] for i in range(n_rows)],
    index=cooc.index,
    columns=cooc.columns,
)
lower_triangle = cooc.where(below_diagonal)
cooc = lower_triangle.add(lower_triangle.transpose(), fill_value=0)

In [165]:
# Export every non-zero off-diagonal cell of `cooc`, one per line, as
# "<label>|<label>|<count>|" followed by a newline, and echo it to the output.
# The `with` block closes the file even if an error occurs mid-way, and the
# explicit UTF-8 encoding keeps the accented labels independent of the
# platform's default encoding.
with open("toR2limit" + str(limit) + ".txt", "w", encoding="utf-8") as f:
    for i, col in enumerate(cooc.columns):
        for j, index in enumerate(cooc.index):
            if i != j:
                # `col` (taken from the columns) is used as the row label here;
                # this relies on rows and columns carrying the same labels.
                sig = cooc.loc[col, index]
                if sig != 0:
                    print(col, index, sig)
                    f.write('|'.join([str(each) for each in [col, index, sig, '\n']]))

Orbán Viktor Varga Mihály 4
Orbán Viktor Szijjártó Péter 7
Orbán Viktor Karácsony Gergely 12
Orbán Viktor Donald Trump 16
Orbán Viktor Egyesült Államok 14
Orbán Viktor ##### EXTRA 1
Orbán Viktor Gulyás Gergely 2
Orbán Viktor Kaleta Gábor 3
Orbán Viktor Maruzsa Zoltán 1
Orbán Viktor Hollik István 2
Orbán Viktor Novák Katalin 1
Orbán Viktor Kásler Miklós 10
Orbán Viktor Bencsik János 2
Orbán Viktor Egészségügyi Világszervezet 1
Orbán Viktor Európai Unió 17
Orbán Viktor _ Forrás 2
Orbán Viktor Merkely Béla 3
Orbán Viktor Semjén Zsolt 1
Orbán Viktor Tarlós István 1
Orbán Viktor Ujhelyi István 6
Orbán Viktor Palkovics László 2
Orbán Viktor Szlávik János 2
Orbán Viktor Ungár Péter 5
Orbán Viktor Kósa Lajos 2
Orbán Viktor Horvát Demokratikus Közösség 4
Varga Mihály Orbán Viktor 4
Varga Mihály Szijjártó Péter 1
Varga Mihály Karácsony Gergely 5
Varga Mihály Gulyás Gergely 3
Varga Mihály Tállai András 1
Varga Mihály Kaleta Gábor 1
Varga Mihály Hollik István 1
Varga Mihály Európai Unió 1
Varga Mi