In [3]:
import urllib.request as req
#!pip install fastkml
from fastkml import kml
import pandas as pd
import re

In [4]:
# Data source: the map embedded at
# https://hargitamegye.ro/vadkarok/interaktiv-terkep.html
# The map layer is downloaded in KML format (forcekml=1).
# HTTPS is used so the download cannot be altered in transit.
url = 'https://www.google.com/maps/d/kml?forcekml=1&mid=1sIXAUcYDAkCBUzw_6DZelr1rt48'

In [5]:
# Download the KML export and load it into a fastkml document.
# The payload is read straight into memory: urlretrieve() would leave a
# temporary file behind, and decoding the text only to re-encode it as
# UTF-8 bytes is redundant because from_string() is given bytes anyway.
# The timeout keeps a stalled connection from hanging the notebook.
with req.urlopen(url, timeout=60) as response:
    kml_bytes = response.read()

k = kml.KML()
k.from_string(kml_bytes)

In [6]:
# Column layout of the raw table filled from the KML placemarks
columns_raw = [
    'geo_loc',
    'latitude',
    'longitude',
    'content',
    'image',
]
# Start from an empty frame that already carries those columns
df = pd.DataFrame(data=None, columns=columns_raw)

In [7]:
# Child features of the first top-level feature in the parsed KML
# (presumably the folders of the KML document - confirm against the file).
# k.features() hands back an iterator (next() is called on it); the nested
# call is materialised into a list so that the folders can be walked more
# than once, e.g. when the parsing cell is re-run on its own.
folders = list(next(k.features()).features())

In [8]:
# Parse the KML: collect one record per placemark, then build the dataframe
# in a single step. (Row-by-row DataFrame.append was removed in pandas 2.0
# and copied the whole frame on every call.)
# NOTE: if `folders` is a one-shot iterator, re-run the cell that defines it
# before re-running this one, otherwise no placemarks are found.
records = []
for folder in folders:
    for pm in folder.features():
        # First coordinate tuple is (lon, lat[, altitude]); slicing keeps
        # this working whether or not an altitude component is present.
        lon, lat = pm.geometry.coords[0][:2]

        # Value of the first extended-data element, if there is one, goes
        # into the 'image' column; otherwise it stays an empty string.
        media_ln = ''
        extended = pm.extended_data
        if extended is not None and extended.elements:
            media_ln = extended.elements[0].value

        records.append({
            'geo_loc': pm.name,
            'latitude': lat,
            'longitude': lon,
            'content': pm.description,
            'image': media_ln,
        })

# columns= keeps the expected layout even when no placemark was found
df = pd.DataFrame(records, columns=columns_raw)

In [9]:
def remove_links(desc):
    '''
        Input: description extracted from KML files; None is tolerated and
               treated as an empty description
        Output: description without the self-closing <img .../> tags of
                attached images and without <br> tags, with leading and
                trailing whitespace removed
    '''
    if desc is None:
        return ''
    desc = re.sub(r'<img[^>]*/>', '', desc)
    desc = re.sub(r'<br>', '', desc)
    # Trim last, so whitespace left next to the removed tags goes as well
    return desc.strip()

In [10]:
# Strip the image / line-break markup from every description
df['content'] = df['content'].map(remove_links)

In [11]:
# Date pattern with three groups: a 4-digit year, then a Hungarian month name
# or a 1-2 digit month number, then a 1-2 digit day. The parts are separated
# by any run of dots and/or spaces.
dp = re.compile(
    '([0-9]{4})[. ]+'
    '(január|február|március|április|május|június|július|'
    'augusztus|szeptember|október|november|december|[0-9]{1,2})'
    '[. ]+([0-9]{1,2})'
)

def standardize_date(date_groups):
    '''
        Input: result of regexp match with 3 groups, corresponding to (year, month, day)
        Output: (year, month, day) tuple of strings, where a Hungarian month
                name is converted to its month number ('1'..'12'); any other
                month value (e.g. one that is already numeric) is returned
                unchanged
    '''
    months_hu = ('január', 'február', 'március',
                 'április', 'május', 'június',
                 'július', 'augusztus', 'szeptember',
                 'október', 'november', 'december')

    y, m, d = date_groups

    # Explicit membership test instead of a bare except around list.index():
    # only a spelled-out month needs converting, and unrelated errors are no
    # longer silently swallowed.
    if m in months_hu:
        m = str(months_hu.index(m) + 1)

    return y, m, d


def get_date(desc):
    '''
        Input: description extracted from KML
        Output: the first date found in the description, in pandas datetime
                format; pd.NaT when the description is empty/None, contains
                no date, or the matched numbers do not form a valid date
    '''
    # Nothing to search in
    if not desc:
        return pd.NaT

    res = dp.search(desc)
    if res is None:
        return pd.NaT

    date = standardize_date(res.groups())
    # errors='coerce' turns an impossible match (e.g. month 13) into NaT
    # instead of raising and aborting the conversion of the whole column
    return pd.to_datetime('-'.join(date), errors='coerce')

In [12]:
# Derive the date column from the text of each description
content_dates = df['content'].apply(get_date)
df['date'] = content_dates

In [13]:
# Pattern for an http(s) address: everything up to the next space or comma
lnp = re.compile('https?://[^ ,]*')

def get_link(desc):
    '''
        Input: description extracted from KML
        Output: the first http(s) link found in the description; when there
                is none, the address of the interactive map page is returned
    '''
    match = lnp.search(desc)
    if match is None:
        return 'https://hargitamegye.ro/vadkarok/interaktiv-terkep.html'
    return match.group(0)

In [14]:
# One link per row, taken from the description text
df['link'] = df['content'].map(get_link)

In [15]:
def clean_content(desc):
    '''
        Input: description extracted from KML
        Output: the part of the description that precedes the first
                occurrence of 'http'; the whole description when 'http'
                does not occur
    '''
    # partition() returns the full string as the head when nothing is found
    head, _sep, _tail = desc.partition('http')
    return head

In [16]:
# Keep only the text that precedes the first link in each description
df['content'] = df['content'].map(clean_content)

In [17]:
# Number of rows, followed by a preview of the first rows
print(df.shape[0])

df.head(n=15)

224


Unnamed: 0,geo_loc,latitude,longitude,content,image,date,link
0,Siménfalva,46.337221,25.107726,"Kórházba került 2019. január 6-án, vasárnap eg...",,2019-01-06,https://hargitamegye.ro/vadkarok/interaktiv-te...
1,Kőrispatak,46.417871,24.968825,2019. március 22-én 11 óra körül életveszélyes...,,2019-03-22,https://hargitamegye.ro/vadkarok/interaktiv-te...
2,Székelyszenterzsébet,46.290531,24.935896,2018. február 6-án egy székelyszenterzsébeti f...,,2018-02-06,https://hargitamegye.ro/vadkarok/interaktiv-te...
3,Homoródremete,46.29265,25.4026,A Homoródszentmárton községhez tartozó Homoród...,https://lh3.googleusercontent.com/Ehzdz9KPuyEb...,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...
4,Szent Anna-tó,46.126892,25.890094,Újabb medvetámadás áldozata került kórházba Ha...,https://lh5.googleusercontent.com/j298p_0ZiD5D...,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...
5,Tusnádfürdő,46.146075,25.856959,"Újból emberre támadt a medve Hargita megyében,...",https://lh6.googleusercontent.com/Pzc10bLlhWJM...,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...
6,Tusnádfürdő,46.147443,25.860907,Újabb medvetámadás történt július 24-én éjjel ...,,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...
7,Tusnádfürdő,46.143518,25.86125,2018. augusztus 5-én kora reggel a 28 éves tus...,https://lh6.googleusercontent.com/rHyfjJsaw0jk...,2018-08-05,https://hargitamegye.ro/vadkarok/interaktiv-te...
8,Tusnádfürdő,46.143697,25.852667,2018 augusztus 5-én kora reggel a 28 éves tusn...,https://lh3.googleusercontent.com/4UDOqPpshBDm...,2018-08-05,https://hargitamegye.ro/vadkarok/interaktiv-te...
9,Tarcsafalva,46.375859,25.13094,2018. augusztus 21-én emberre támadt a medve K...,,2018-08-21,https://hargitamegye.ro/vadkarok/interaktiv-te...


In [18]:
# Add the bookkeeping columns: three constants plus a title column that
# is a copy of the place name
df = df.assign(
    checked='',
    exact_loc=0,
    source='Vadkárok HR',
    title=df['geo_loc'],
)

In [19]:
from utils import text_processor

In [20]:
duplicate = []
relevant = []
severity = []
deaths = []
# Walk the three needed columns in parallel instead of building a whole row
# Series with df.loc[i] for every single field.
for link, title, content in zip(df['link'], df['title'], df['content']):
    # Rows whose link contains 'szekelyhon' are flagged as duplicates
    is_duplicate = 1 if link and 'szekelyhon' in link else 0
    duplicate.append(is_duplicate)

    # text_processor returns three values; judging by how they are used
    # here they are (relevance, severity, deaths). The first one is
    # deliberately ignored: every row is marked as relevant.
    _relevance, row_severity, row_deaths = text_processor(title, content)
    relevant.append(1)  # override r
    severity.append(row_severity)
    deaths.append(row_deaths)

In [21]:
# Attach the per-row lists as new columns, in this order
for column_name, column_values in (
    ('duplicate', duplicate),
    ('relevant', relevant),
    ('severity', severity),
    ('deaths', deaths),
):
    df[column_name] = column_values

In [22]:
# Final column order of the export; 'checked' is then moved into the index
export_columns = [
    'checked',
    'date',
    'link',
    'image',
    'source',
    'title',
    'content',
    'relevant',
    'severity',
    'deaths',
    'duplicate',
    'geo_loc',
    'exact_loc',
    'latitude',
    'longitude',
]
df = df.loc[:, export_columns].set_index('checked')

In [23]:
# Write the final table to an Excel workbook (path is relative to the
# current working directory)
df.to_excel(excel_writer='data/vadkarok_hr.xlsx')

  force_unicode(url))
