In [1]:
import stanza
import pandas as pd

In [2]:
# Download the default Hungarian ('hu') stanza models, then build the pipeline.
# The resulting module-level `nlp` object is what stanzanamesearch() calls.
stanza.download('hu')
nlp = stanza.Pipeline('hu')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 28.3MB/s]                    
2020-07-03 14:32:04 INFO: Downloading default packages for language: hu (Hungarian)...
2020-07-03 14:32:04 INFO: File exists: /mnt/volume/jupyter/stanza_resources/hu/default.zip.
2020-07-03 14:32:08 INFO: Finished downloading models and saved to /mnt/volume/jupyter/stanza_resources.
2020-07-03 14:32:08 INFO: Loading these models for language: hu (Hungarian):
| Processor | Package |
-----------------------
| tokenize  | szeged  |
| pos       | szeged  |
| lemma     | szeged  |
| depparse  | szeged  |

2020-07-03 14:32:08 INFO: Use device: cpu
2020-07-03 14:32:08 INFO: Loading: tokenize
2020-07-03 14:32:08 INFO: Loading: pos
2020-07-03 14:32:09 INFO: Loading: lemma
2020-07-03 14:32:09 INFO: Loading: depparse
2020-07-03 14:32:11 INFO: Done loading processors!


In [3]:
def onlywithneighbours(ofthislist):
    """Keep only the positions that have a direct neighbour in the same list.

    A value is kept when value+1 or value-1 is also present. The input may hold
    integers or numeric strings; for strings the neighbours are looked up as
    strings as well. The order of the input is preserved.
    """
    def _int_neighbour_present(value):
        return value + 1 in ofthislist or value - 1 in ofthislist

    def _str_neighbour_present(value):
        return (str(int(value) + 1) in ofthislist
                or str(int(value) - 1) in ofthislist)

    try:
        return [item for item in ofthislist if _int_neighbour_present(item)]
    except TypeError:
        # Adding an int to a str raises TypeError: fall back to string positions.
        return [item for item in ofthislist if _str_neighbour_present(item)]

In [4]:
def split_into_numerical_sequences(inlist):
    """Split integers into runs of consecutive values.

    The input is sorted first (the caller's list is not modified) and then cut
    wherever two neighbouring values differ by more than one.

    Args:
        inlist: iterable of integers, in any order.

    Returns:
        A list of lists, each holding one ascending run, e.g.
        [5, 1, 2, 4, 7] -> [[1, 2], [4, 5], [7]]. Repeated values stay inside
        their run. An empty input gives an empty list.
    """
    sequences = []
    for value in sorted(inlist):
        # Extend the current run when the value repeats or directly follows the
        # previous one; otherwise start a new run.
        if sequences and value - sequences[-1][-1] in (0, 1):
            sequences[-1].append(value)
        else:
            sequences.append([value])
    return sequences

In [5]:
def stanzanamesearch(text):
    """Extract multi-word proper names from a text with the stanza pipeline.

    Every sentence is scanned for words tagged PROPN. Proper nouns that stand
    directly next to at least one other proper noun are grouped into runs of
    consecutive word positions, and the lemmas of each run are joined with a
    single space to form a name. Isolated proper nouns are ignored.

    Args:
        text: the text to analyse; it is passed unchanged to the module-level
            `nlp` pipeline.

    Returns:
        A list of distinct names in order of first appearance.
    """
    doc = nlp(text)

    names = []
    for sentence in doc.sentences:
        # Word ids are normalised to int so the lookup works whether the
        # pipeline reports them as strings or as integers.
        lemma_by_position = {int(word.id): word.lemma
                             for word in sentence.words if word.upos == 'PROPN'}
        adjacent_positions = onlywithneighbours(sorted(lemma_by_position))
        for run in split_into_numerical_sequences(adjacent_positions):
            names.append(' '.join(lemma_by_position[position] for position in run))

    # dict.fromkeys removes duplicates while keeping first-occurrence order, so
    # the result is reproducible between runs (a set would not guarantee that).
    return list(dict.fromkeys(names))

In [6]:
def validalias(alias):
    """Tell whether an alias is specific enough to be searched for.

    After trimming surrounding whitespace the alias must be longer than five
    characters and must contain at least one space, i.e. split into more than
    one part.
    """
    trimmed = alias.strip()
    long_enough = len(trimmed) > 5
    several_parts = len(trimmed.split(' ')) > 1
    return long_enough and several_parts

In [7]:
def searchlist_maker(csv,**kwargs):
    """Build the alias search list from the 'name_list' column of a CSV file.

    Args:
        csv: path or file-like object accepted by pandas.read_csv; the file
            must contain a 'name_list' column.
        **kwargs: optional 'alias_separator'. When given, every entry is split
            on that separator and each alias is stripped of surrounding
            whitespace.

    Returns:
        A list with one list of aliases per usable row. Without a separator
        each inner list holds the single unmodified entry. Rows whose
        'name_list' value is not a string (for example empty cells, which
        pandas reads as NaN) are skipped in both modes, so downstream string
        handling never receives a non-string alias.
    """
    name_entries = [entry for entry in pd.read_csv(csv)['name_list']
                    if isinstance(entry, str)]
    if 'alias_separator' in kwargs:
        separator = kwargs['alias_separator']
        return [[alias.strip() for alias in entry.split(separator)]
                for entry in name_entries]
    return [[entry] for entry in name_entries]

In [185]:
def matchfinder(text,searchforthese):
    """Return every valid alias that occurs in the text, ignoring case.

    Args:
        text: the value to search in; it is converted with str() first, so
            non-string values are searched by their string form.
        searchforthese: list of alias lists, one inner list per person.

    Returns:
        A flat list of the aliases (in their original spelling) that pass
        validalias() and appear as a substring of the lower-cased text. An
        alias listed several times is reported several times.
    """
    # Lower-case the text once instead of once per candidate alias.
    haystack = str(text).lower()
    return [alias
            for alias_group in searchforthese
            for alias in alias_group
            if validalias(alias) and alias.lower() in haystack]

In [186]:
def fillextracols(whichdictionary, whichrowtofill, withthislist, unlessitslongerthanthis):
    """Write a list of hits into the numbered result columns of one row.

    The values go into the module-level `targetdf`, in the columns named
    whichdictionary + '0', whichdictionary + '1', ... of row `whichrowtofill`.
    Nothing is written when the list is empty or holds more items than
    `unlessitslongerthanthis`.
    """
    hit_count = len(withthislist)
    if hit_count == 0 or hit_count > unlessitslongerthanthis:
        return
    for position, value in enumerate(withthislist):
        column = whichdictionary + str(position)
        targetdf.loc[whichrowtofill, column] = value

In [187]:
def prepare_extra_columns(num_of_columns_for_each_dict):
    """Add the empty result columns to the module-level `targetdf`.

    For every (prefix, count) pair in the mapping, the columns prefix + '0' up
    to prefix + str(count - 1) are created and filled with empty strings.
    """
    for prefix, column_count in num_of_columns_for_each_dict.items():
        for suffix in range(column_count):
            column_name = prefix + str(suffix)
            targetdf[column_name] = ''

In [188]:
class dictionary:
    """Plain record describing one name source.

    The constructor stores its arguments unchanged as attributes of the same
    name: `name`, `maxcolnum`, `searchlist` (defaults to None) and `trustall`
    (defaults to False).
    """

    def __init__(self, name, maxcolnum, searchlist=None, trustall=False):
        self.name, self.maxcolnum = name, maxcolnum
        self.searchlist, self.trustall = searchlist, trustall

In [189]:
# One entry per name source, in processing order. The first two carry a search
# list read from a CSV file; the last one has none (searchlist stays None).
dictionaries = [
    dictionary(
        name='person_data',
        maxcolnum=10,
        searchlist=searchlist_maker(
            '/mnt/volume/jupyter/szokereso/person_data-1592394309231_utf8.csv',
            alias_separator='|',
        ),
    ),
    dictionary(
        name='wikilist',
        maxcolnum=5,
        searchlist=searchlist_maker(
            '/mnt/volume/jupyter/szokereso/A_negyedik_Orbán-kormany_allamtitkarainak_listaja.csv',
        ),
    ),
    dictionary(name='stanza', maxcolnum=10),
]

In [190]:
# Load the table to annotate, then add the empty result columns of every source.
targetdf = pd.read_csv('/mnt/volume/jupyter/szokereso/data_2020-06-29_12_01_51.csv')
prepare_extra_columns(dict((source.name, source.maxcolnum) for source in dictionaries))

In [194]:
# When True, the main loop only processes rows whose index is greater than 60;
# when False, every row is processed.
debugmode = False

In [192]:
import time

In [198]:
# Annotate every text cell with the hits of each name source and time the run.
cells = list(targetdf['TEXT'])

start_time = time.time()
# The loop variable is deliberately not called `dictionary`: that would rebind
# the class name at module level and break re-running the cell that builds
# `dictionaries`.
for name_source in dictionaries:
    for icell, cell in enumerate(cells):
        # With debugmode on, rows up to index 60 are skipped.
        if not (icell > 60 or not debugmode):
            continue
        if icell%100==0: print(icell)
        if name_source.searchlist is not None:
            hits = matchfinder(cell, name_source.searchlist)
        elif isinstance(cell, str):
            # No search list: fall back to the stanza-based name search.
            hits = stanzanamesearch(cell)
        else:
            # Non-text values (e.g. NaN from an empty CSV cell) are not sent to
            # the stanza pipeline; they simply yield no hits.
            hits = []
        fillextracols(name_source.name, icell, hits, name_source.maxcolnum)
end_time = time.time()
print("--- %s seconds ---" % (end_time - start_time))

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500


KeyboardInterrupt: 