In [1]:
import glob
import sys
import time
import traceback

import pandas as pd
import stanza

In [2]:
# Fetch the default Hungarian ('hu') Stanza models.
stanza.download('hu')
# Build the Hungarian pipeline once; stanzanamesearch() calls it through the global `nlp`.
nlp = stanza.Pipeline('hu')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 9.75MB/s]                    
2020-07-07 10:34:34 INFO: Downloading default packages for language: hu (Hungarian)...
2020-07-07 10:34:35 INFO: File exists: /mnt/volume/jupyter/stanza_resources/hu/default.zip.
2020-07-07 10:34:38 INFO: Finished downloading models and saved to /mnt/volume/jupyter/stanza_resources.
2020-07-07 10:34:38 INFO: Loading these models for language: hu (Hungarian):
| Processor | Package |
-----------------------
| tokenize  | szeged  |
| pos       | szeged  |
| lemma     | szeged  |
| depparse  | szeged  |

2020-07-07 10:34:38 INFO: Use device: cpu
2020-07-07 10:34:38 INFO: Loading: tokenize
2020-07-07 10:34:39 INFO: Loading: pos
2020-07-07 10:34:40 INFO: Loading: lemma
2020-07-07 10:34:40 INFO: Loading: depparse
2020-07-07 10:34:41 INFO: Done loading processors!


In [3]:
def get_all_csvs(wildcard):
    """Return the first half of the paths matching a glob pattern.

    Args:
        wildcard: glob pattern, e.g. ``'/some/dir/data*csv'``.

    Returns:
        A list with the first ``len(matches) // 2`` matching paths, in the
        order ``glob.glob`` reported them (no sorting is applied here).
    """
    # Scan the filesystem once: the slice length and the list being sliced
    # then always refer to the same directory listing.
    matches = glob.glob(wildcard)
    # can do ordering of matches here
    return matches[:len(matches) // 2]

In [4]:
def onlywithneighbours(ofthislist):
    """Keep only the entries whose numeric successor or predecessor is also present.

    Entries are first treated as numbers. If adding 1 to an entry raises
    ``TypeError`` (e.g. the entries are numeric strings), the whole list is
    re-processed by converting each entry with ``int`` and looking up the
    neighbours as strings.

    Args:
        ofthislist: list of numbers, or of strings holding integers.

    Returns:
        A new list with the qualifying entries in their original order.
    """
    try:
        kept = []
        for item in ofthislist:
            if item + 1 in ofthislist or item - 1 in ofthislist:
                kept.append(item)
    except TypeError:
        # Arithmetic failed: fall back to string-encoded integers.
        kept = []
        for item in ofthislist:
            if str(int(item) + 1) in ofthislist or str(int(item) - 1) in ofthislist:
                kept.append(item)
    return kept

In [5]:
def split_into_numerical_sequences(inlist):
    """Split numbers into runs of consecutive values.

    The input is sorted first. A run ends at every value whose successor
    (value + 1) is not in the list; each run collects the values greater
    than the previous run's end and up to (including) its own end.

    Args:
        inlist: list of numbers.

    Returns:
        A list of lists, one per run, in ascending order.
    """
    ordered = sorted(inlist)

    # Positions (in the sorted list) of the values that close a run.
    run_end_positions = [position for position, value in enumerate(ordered)
                         if value + 1 not in ordered]

    runs = []
    for run_number, end_position in enumerate(run_end_positions):
        upper = ordered[end_position]
        if run_number == 0:
            # First run: everything up to the first run end.
            runs.append([value for value in ordered if value <= upper])
        else:
            # Later runs start just above the previous run end.
            lower = ordered[run_end_positions[run_number - 1]]
            runs.append([value for value in ordered
                         if value <= upper and value > lower])

    return runs

In [6]:
def stanzanamesearch(text):
    """Extract multi-word proper-noun sequences from a text.

    The text is run through the global Stanza pipeline ``nlp``. Within each
    sentence, words tagged ``PROPN`` that sit directly next to another
    ``PROPN`` word are grouped into runs of consecutive word ids, and the
    surface forms of each run are joined with single spaces.

    Args:
        text: the text to analyse.

    Returns:
        A list of the distinct joined names, in order of first appearance.
        Single, isolated proper nouns are not included.
    """
    doc = nlp(text)

    names = []
    for sentence in doc.sentences:
        # word id -> surface form for the proper nouns of this sentence.
        # The original code treated word.id as a string; int() makes the
        # lookup below work whether the installed Stanza yields str or int ids.
        propn_text_by_id = {
            #int(word.id): word.lemma
            int(word.id): word.text
            for word in sentence.words if word.upos == 'PROPN'}

        ids_with_propn_neighbour = onlywithneighbours(list(propn_text_by_id))

        for id_run in split_into_numerical_sequences(ids_with_propn_neighbour):
            names.append(' '.join(propn_text_by_id[word_id] for word_id in id_run))

    # De-duplicate while keeping first-appearance order, so the order of the
    # returned names is the same on every run.
    return list(dict.fromkeys(names))

In [7]:
def validalias(alias):
    """Tell whether an alias is specific enough to be searched for.

    An alias qualifies when, after stripping surrounding whitespace, it is
    longer than 5 characters and splitting it on single spaces yields more
    than one piece.

    Args:
        alias: the candidate string.

    Returns:
        True if the alias qualifies, False otherwise.
    """
    trimmed = alias.strip()
    return len(trimmed) > 5 and len(trimmed.split(' ')) > 1

In [8]:
def searchlist_maker(csv,excelfile=False,headerwechoose='name_list',**kwargs):
    """Build a search list (a list of alias lists) from one column of a table file.

    Args:
        csv: path of the file to read. Read with ``pd.read_excel`` when
            ``excelfile`` is true, otherwise with ``pd.read_csv``.
        excelfile: whether ``csv`` points to an Excel workbook.
        headerwechoose: name of the column holding the search terms.
        **kwargs: ``alias_separator`` (optional) - when given, each cell is
            split on this separator and every piece is stripped.

    Returns:
        A list with one inner list per non-empty cell: the stripped aliases
        when ``alias_separator`` is given, otherwise the single cell value.
    """
    # Read the file that was actually passed in, for both formats.
    if excelfile:
        persondatadict = pd.read_excel(csv)
    else:
        persondatadict = pd.read_csv(csv)

    # Float cells are skipped in both modes: this filters the missing cells
    # (pandas reports them as float NaN), which cannot be split or stripped.
    entries = [entry for entry in persondatadict[headerwechoose]
               if not isinstance(entry, float)]

    if 'alias_separator' in kwargs:
        separator = kwargs['alias_separator']
        return [[alias.strip() for alias in entry.split(separator)]
                for entry in entries]
    return [[entry] for entry in entries]

In [9]:
def matchfinder(text,searchforthese):
    """Find the aliases from a search list that occur in a text.

    The comparison is case-insensitive and substring-based. Aliases rejected
    by ``validalias`` are never matched.

    Args:
        text: the text to search in; converted with ``str`` first.
        searchforthese: list of alias lists.

    Returns:
        A flat list of the matching aliases, in search-list order. An alias
        that appears in several inner lists is returned once per occurrence.
    """
    # Lower-case the text once, not once per alias.
    haystack = str(text).lower()
    matches=[alias
             for persondata_searchtarget in searchforthese
             for alias in persondata_searchtarget
             if validalias(alias) and alias.lower() in haystack]
    return matches

In [10]:
def fillextracols(whichdictionary, whichrowtofill, withthislist, unlessitslongerthanthis):
    """Write a list of values into the numbered extra columns of one ``targetdf`` row.

    Value number ``i`` goes into the column named ``whichdictionary + str(i)``
    of the global DataFrame ``targetdf``. Nothing is written when the list is
    empty or has more items than ``unlessitslongerthanthis``.

    Args:
        whichdictionary: column-name prefix.
        whichrowtofill: row label used with ``targetdf.loc``.
        withthislist: the values to store.
        unlessitslongerthanthis: maximum list length that is still written.
    """
    item_count = len(withthislist)
    # Over-long lists are skipped entirely, not truncated.
    if item_count > unlessitslongerthanthis or item_count == 0:
        return
    for position, value in enumerate(withthislist):
        column = whichdictionary + str(position)
        targetdf.loc[whichrowtofill, column] = value

In [11]:
def prepare_extra_columns(num_of_columns_for_each_dict):
    """Add empty-string result columns to the global ``targetdf``.

    For every ``prefix: count`` pair, the columns ``prefix0`` ...
    ``prefix<count-1>`` are created (or overwritten) with ``''``.

    Args:
        num_of_columns_for_each_dict: mapping of column-name prefix to the
            number of columns to create for it.
    """
    for prefix, column_count in num_of_columns_for_each_dict.items():
        for suffix in range(column_count):
            targetdf[prefix + str(suffix)] = ''

In [12]:
class dictionary_class:
    """Plain record describing one lookup source.

    Attributes:
        name: identifier of the source.
        maxcolnum: number associated with the source's output columns.
        searchlist: list of alias lists, or None when the source has none.
        geo: boolean flag stored as given.
    """

    def __init__(self, name, maxcolnum, searchlist=None, geo=False):
        # Identity and column budget.
        self.name, self.maxcolnum = name, maxcolnum
        # Lookup data and the geo flag.
        self.searchlist, self.geo = searchlist, geo

In [13]:
# The lookup sources applied to every input file by the main loop.
# Each entry gives: name (also the prefix of its result columns), the number of
# result columns to create, and optionally the search list to match against.
dictionaries=[
# Person records; each cell of the chosen column is split into aliases on '|'.
dictionary_class('person_data',
           10,
           searchlist_maker('/mnt/volume/jupyter/szokereso/person_data-1592394309231_utf8.csv',alias_separator='|')),
# One search term per row, no alias splitting.
dictionary_class('wikilist',
           5,
           searchlist_maker('/mnt/volume/jupyter/szokereso/A_negyedik_Orbán-kormany_allamtitkarainak_listaja.csv')),
# No search list: the main loop calls stanzanamesearch() for this entry.
dictionary_class('stanza',
           10),
# Settlement names taken from the 'settlement_name' column; flagged geo=True.
dictionary_class('settlement_list',
          5,
          searchlist_maker('/mnt/volume/jupyter/szokereso/List of Settlements_m2.xlsx', excelfile=True,headerwechoose='settlement_name'),
          geo=True)
            ]

In [14]:
# When True, the main loop only processes the entry at index 2 of `dictionaries`.
debugmode = False
# Glob pattern selecting the input CSV files for get_all_csvs().
wildcard = '/mnt/volume/anagy/mediascraper/mediaScraper/output/data*csv'

In [None]:
# Main run: annotate every selected input file with the matches of each
# dictionary and write one result CSV (or one error log) per input file.
allcsvs = get_all_csvs(wildcard)
for eachcsv in allcsvs:
    # Bound before the try so the error handler can always report them, even
    # when the failure happens before the loops below have started, and so a
    # later file never reports values left over from an earlier one.
    dictionary = None
    icell = None
    # Output path prefix: result directory + input file name up to its first dot.
    result_stem = '/mnt/volume/jupyter/szokereso/resultfiles/' + eachcsv.split('/')[-1].split('.')[0]
    try:
        targetdf = pd.read_csv(eachcsv)
        prepare_extra_columns({each.name: each.maxcolnum for each in dictionaries})

        cells = list(targetdf['TEXT'])

        start_time = time.time()
        for idictionary, dictionary in enumerate(dictionaries):
            # In debug mode only the dictionary at index 2 is run.
            if debugmode and idictionary != 2:
                continue
            for icell, cell in enumerate(cells):
                # Float cells (missing text) are skipped.
                if type(cell) is float:
                    continue
                # Debug knob: raise the -1 threshold to skip the first rows in debug mode.
                if not (icell > -1 or not debugmode):
                    continue
                if icell % 500 == 0:
                    print(icell)
                if dictionary.searchlist is not None:
                    found = matchfinder(cell, dictionary.searchlist)
                else:
                    # No search list: fall back to Stanza proper-noun extraction.
                    found = stanzanamesearch(cell)
                fillextracols(dictionary.name, icell, found, dictionary.maxcolnum)
        end_time = time.time()
        print("--- %s seconds ---" % (end_time - start_time))
        targetdf.to_csv(result_stem + '_szokereso_result.csv')
        print(eachcsv)
    except Exception:
        # Only real errors are logged and skipped; KeyboardInterrupt and
        # SystemExit propagate so the run can be stopped by the user.
        the_type, the_value, the_traceback = sys.exc_info()
        outlist = [the_type, the_value, eachcsv, getattr(dictionary, 'name', None), icell]
        with open(result_stem + '_ERRORLOG.txt', 'w') as f:
            for item in outlist:
                f.write("%s\n" % item)
            # Full traceback, so the failing line can be located afterwards.
            f.write(''.join(traceback.format_exception(the_type, the_value, the_traceback)))

0
500
1000
1500
2000
2500
3000
3500
0
500
1000
1500
2000
2500
3000
3500
0
500
1000
1500
2000
2500
3000
3500
0
500
1000
1500
2000
2500
3000
3500
--- 31847.138280391693 seconds ---
/mnt/volume/anagy/mediascraper/mediaScraper/output/data_2020-06-28_08:01:41.csv
0
500
1000
1500
2000
2500
3000
3500
0
500
1000
1500
2000
2500
3000
3500
0
500
1000
1500
2000
2500
3000
3500
0
500
1000
1500
2000
2500
3000
3500
--- 30739.14456677437 seconds ---
/mnt/volume/anagy/mediascraper/mediaScraper/output/data_2020-07-01_00:00:24.csv
0
500
1000
1500
2000
2500
3000
3500
0
500
1000
1500
2000
2500
3000
3500
0
500
1000
0
