In [83]:
import urllib.request as req
#!pip install fastkml
from fastkml import kml
import pandas as pd
import re

In [84]:
!pip install fastkml



In [85]:
# using the map from https://hargitamegye.ro/vadkarok/interaktiv-terkep.html
# downloading in KML format (forcekml=1 asks for plain KML); fetched over HTTPS
# so the data cannot be tampered with in transit
url = 'https://www.google.com/maps/d/kml?forcekml=1&mid=1sIXAUcYDAkCBUzw_6DZelr1rt48'

In [86]:
# Download the KML export to a temporary file and parse it with fastkml.
fname = req.urlretrieve(url)[0]
k = kml.KML()

try:
    # from_string is fed bytes, so read the file in binary mode instead of
    # decoding it to text and encoding it straight back to UTF-8.
    with open(fname, 'rb') as kml_file:
        k.from_string(kml_file.read())
finally:
    # urlretrieve() leaves its temporary file behind; remove it once parsed.
    req.urlcleanup()

In [87]:
# Column layout of the raw table; the KML parsing loop fills one row per placemark.
columns_raw = ['geo_loc', 'latitude', 'longitude', 'content', 'image']
# Start from an empty frame with those columns.
df = pd.DataFrame(columns=columns_raw)

In [88]:
# Take the first top-level feature of the parsed KML and get its child features;
# the parsing loop iterates over these as folders of placemarks.
folders = next(k.features()).features()

In [89]:
# parse KML, insert data into dataframe
for f in folders:
    for pm in f.features():
        lon, lat, _ =  pm.geometry.coords[0]
        media_ln = ''
        if pm.extended_data is not None:
            media_ln = pm.extended_data.elements[0].value
        df = df.append({
                    'geo_loc': pm.name, 
                    'latitude': lat, 
                    'longitude': lon, 
                    'content': pm.description, 
                    'image': media_ln}, 
                ignore_index=True)

In [90]:
def remove_links(desc):
    '''
        Input: description extracted from KML files
               (None is accepted, for placemarks without a description)
        Output: description without the <img> tags of attached images and
                without <br> line breaks, stripped of surrounding whitespace;
                '' when the input is None
    '''
    if desc is None:
        return ''
    # match both <img ... /> and <img ...>, and <br>, <br/>, <br />
    desc = re.sub(r'<img[^>]*>', '', desc, flags=re.IGNORECASE)
    desc = re.sub(r'<br\s*/?>', '', desc, flags=re.IGNORECASE)
    # strip last so whitespace left behind by removed tags goes too
    return desc.strip()

In [91]:
# Clean every description with remove_links.
df['content'] = df['content'].apply(remove_links)

In [92]:
# Date pattern with three groups: a 4-digit year, a month (Hungarian month name
# or 1-2 digits) and a 1-2 digit day, separated by dots and/or spaces.
dp = re.compile(
    r'([0-9]{4})[. ]+(január|február|március|április|május|június|július|'
    r'augusztus|szeptember|október|november|december|[0-9]{1,2})[. ]+([0-9]{1,2})'
)

def standardize_date(date_groups):
    '''
        Input: result of regexp match with 3 groups, corresponding to (year, month, day)
        Output: (year, month, day) tuple, where a Hungarian month name is converted
                to its month number (as a string); a month that is not in the
                name list (e.g. already numeric) is returned unchanged
    '''
    months_hu = ['január', 'február', 'március',
                 'április', 'május', 'június',
                 'július', 'augusztus', 'szeptember',
                 'október', 'november', 'december']

    y, m, d = date_groups

    try:
        m = str(months_hu.index(m) + 1)
    except ValueError:
        # list.index raises ValueError when m is not a month name: keep m as it is
        pass

    return y, m, d


def get_date(desc):
    '''
        Input: description extracted from KML
        Output: the first date found in the description as a pandas Timestamp;
                NaT when no date is found or the matched digits do not form
                a valid calendar date
    '''
    res = dp.search(desc)
    if res is None:
        return pd.NaT

    date = standardize_date(res.groups())
    # errors='coerce' turns an impossible match (e.g. month 13) into NaT instead of
    # raising, which would abort the whole DataFrame.apply
    return pd.to_datetime('-'.join(date), errors='coerce')

In [93]:
# Parse a date out of every description with get_date.
df['date'] = df['content'].apply(get_date)

In [94]:
# URL pattern: the link ends at the first whitespace character (including newlines),
# comma, quote or angle bracket, so HTML attribute quotes and tags are not swallowed.
lnp = re.compile(r'https?://[^\s,<>"]*')

def get_link(desc):
    '''
        Input: description extracted from KML
        Output: the first link matched by lnp in the description;
                the interactive map page when the description has no link
    '''
    match = lnp.search(desc)
    if match is None:
        return 'https://hargitamegye.ro/vadkarok/interaktiv-terkep.html'
    return match.group(0)

In [95]:
# Extract a source link from every description with get_link.
df['link'] = df['content'].apply(get_link)

In [96]:
def clean_content(desc):
    '''
        Input: description extracted from KML
        Output: the description cut off at the first occurrence of 'http'
                (unchanged when it contains no 'http')
    '''
    # partition returns the whole string as its first item when the separator is absent
    text_before_link, _sep, _rest = desc.partition('http')
    return text_before_link

In [97]:
# Cut the links off the descriptions; the 'link' column was extracted from this text beforehand.
df['content'] = df['content'].apply(clean_content)

In [98]:
# Number of rows parsed from the map.
print(len(df))

# Preview the first rows of the cleaned table.
df.head(15)

224


Unnamed: 0,geo_loc,latitude,longitude,content,image,date,link
0,Siménfalva,46.337221,25.107726,"Kórházba került 2019. január 6-án, vasárnap eg...",,2019-01-06,https://hargitamegye.ro/vadkarok/interaktiv-te...
1,Kőrispatak,46.417871,24.968825,2019. március 22-én 11 óra körül életveszélyes...,,2019-03-22,https://hargitamegye.ro/vadkarok/interaktiv-te...
2,Székelyszenterzsébet,46.290531,24.935896,2018. február 6-án egy székelyszenterzsébeti f...,,2018-02-06,https://hargitamegye.ro/vadkarok/interaktiv-te...
3,Homoródremete,46.29265,25.4026,A Homoródszentmárton községhez tartozó Homoród...,https://doc-0k-6g-mymaps.googleusercontent.com...,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...
4,Szent Anna-tó,46.126892,25.890094,Újabb medvetámadás áldozata került kórházba Ha...,https://doc-0g-6g-mymaps.googleusercontent.com...,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...
5,Tusnádfürdő,46.146075,25.856959,"Újból emberre támadt a medve Hargita megyében,...",https://doc-10-6g-mymaps.googleusercontent.com...,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...
6,Tusnádfürdő,46.147443,25.860907,Újabb medvetámadás történt július 24-én éjjel ...,,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...
7,Tusnádfürdő,46.143518,25.86125,2018. augusztus 5-én kora reggel a 28 éves tus...,https://doc-0k-6g-mymaps.googleusercontent.com...,2018-08-05,https://hargitamegye.ro/vadkarok/interaktiv-te...
8,Tusnádfürdő,46.143697,25.852667,2018 augusztus 5-én kora reggel a 28 éves tusn...,https://doc-0o-6g-mymaps.googleusercontent.com...,2018-08-05,https://hargitamegye.ro/vadkarok/interaktiv-te...
9,Tarcsafalva,46.375859,25.13094,2018. augusztus 21-én emberre támadt a medve K...,,2018-08-21,https://hargitamegye.ro/vadkarok/interaktiv-te...


In [99]:
# Bookkeeping columns that are the same for every row; the title is simply the place name.
df = df.assign(
    checked='',
    exact_loc=0,
    source='Vadkárok HR',
    title=df['geo_loc'],
)

In [100]:
from utils import text_processor

In [101]:
# Per-row flags, collected in lists and attached to the dataframe afterwards.
duplicate=[]
relevant=[]
severity=[]
deaths=[]
# Walk the three needed columns in parallel instead of doing a df.loc[i][...] lookup per field.
for link, title, content in zip(df['link'], df['title'], df['content']):
    # rows whose link points to szekelyhon are flagged as duplicates;
    # a non-string link (e.g. NaN) can never match
    is_duplicate = isinstance(link, str) and 'szekelyhon' in link
    duplicate.append(1 if is_duplicate else 0)
    # text_processor returns (relevance, severity, deaths); its relevance is ignored on purpose
    _relevance, row_severity, row_deaths = text_processor(title, content)
    relevant.append(1) #override r
    severity.append(row_severity)
    deaths.append(row_deaths)

In [102]:
# Attach the per-row lists built by the classification loop as new columns.
df = df.assign(
    duplicate=duplicate,
    relevant=relevant,
    severity=severity,
    deaths=deaths,
)

In [103]:
# Make 'checked' the index, then put the remaining columns into the export order.
df = df.set_index('checked')[[
    'date',
    'link',
    'image',
    'source',
    'title',
    'content',
    'relevant',
    'severity',
    'deaths',
    'duplicate',
    'geo_loc',
    'exact_loc',
    'latitude',
    'longitude',
]]

In [104]:
# Write the same table to two workbooks under data/.
# NOTE(review): re-running this cell overwrites vadkarok_hr_curated.xlsx as well -
# presumably that copy is edited by hand afterwards; confirm before re-running.
df.to_excel('data/vadkarok_hr.xlsx')
df.to_excel('data/vadkarok_hr_curated.xlsx')

Fill in missing dates from image EXIF data

In [106]:
!pip install exifread

Collecting exifread
  Downloading https://files.pythonhosted.org/packages/7b/cb/92b644626830115910cf2b36d3dfa600adbec86dff3207a7de3bfd6c6a60/ExifRead-2.1.2.tar.gz
Building wheels for collected packages: exifread
  Building wheel for exifread (setup.py): started
  Building wheel for exifread (setup.py): finished with status 'done'
  Created wheel for exifread: filename=ExifRead-2.1.2-cp37-none-any.whl size=34204 sha256=6f751370630938344e61d69dea28a19cd91767bc84db752e7f4233a18b1690fb
  Stored in directory: C:\Users\csala\AppData\Local\pip\Cache\wheels\a2\82\de\fd5f70739a3c8d7475cc21f4e186150abbc5d77180af7d94a2
Successfully built exifread
Installing collected packages: exifread
Successfully installed exifread-2.1.2


In [120]:
# Rows for which no date could be parsed; isna() tests for NaT directly
# instead of comparing the column's string form against 'NaT'.
no_date = df[df['date'].isna()]

In [129]:
# Of the undated rows, keep those whose image field is not an empty string.
no_date_yes_image = no_date.loc[no_date['image'].astype(str).ne('')]

In [130]:
# Display the rows that have an image but no parsed date.
no_date_yes_image

Unnamed: 0_level_0,date,link,image,source,title,content,relevant,severity,deaths,duplicate,geo_loc,exact_loc,latitude,longitude
checked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...,https://doc-0k-6g-mymaps.googleusercontent.com...,Vadkárok HR,Homoródremete,A Homoródszentmárton községhez tartozó Homoród...,1,3,0,0,Homoródremete,0,46.29265,25.4026
,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...,https://doc-0g-6g-mymaps.googleusercontent.com...,Vadkárok HR,Szent Anna-tó,Újabb medvetámadás áldozata került kórházba Ha...,1,4,0,0,Szent Anna-tó,0,46.126892,25.890094
,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...,https://doc-10-6g-mymaps.googleusercontent.com...,Vadkárok HR,Tusnádfürdő,"Újból emberre támadt a medve Hargita megyében,...",1,3,0,0,Tusnádfürdő,0,46.146075,25.856959
,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...,https://doc-00-6g-mymaps.googleusercontent.com...,Vadkárok HR,Karcfalva,– Nyolc medve volt a kukoricásba! - meséli Pre...,1,3,0,0,Karcfalva,0,46.530569,25.761535
,NaT,https://hargitamegye.ro/vadkarok/interaktiv-te...,https://doc-0s-6g-mymaps.googleusercontent.com...,Vadkárok HR,Csíkszentkirály,Szeptember 21-én újabb medvetámadás történt Cs...,1,4,0,0,Csíkszentkirály,0,46.31208,25.821061
,NaT,https://www.facebook.com/borbolycsaba/videos/v...,https://doc-10-6g-mymaps.googleusercontent.com...,Vadkárok HR,Csíkmindszent,Augusztus 26-án gombászás közben támadt a 38 é...,1,3,0,0,Csíkmindszent,0,46.328057,25.887421


In [135]:
import exifread
import urllib.request

In [166]:
# For every undated row with an image, download the image and print its EXIF capture date.
for im_link in no_date_yes_image['image'].values:
    im_link=im_link.split(' ')[0] #if multiple images, take only first
    print(im_link)
    try:
        urllib.request.urlretrieve(im_link, "temp.png")
        with open('temp.png', 'rb') as fh:
            tags = exifread.process_file(fh, stop_tag="EXIF DateTimeOriginal")
    except Exception as err:
        # Best effort: a failed download or unparsable file must not stop the loop.
        # Unlike a bare except, this lets KeyboardInterrupt through and reports the reason.
        print('No readable image:', err)
        continue
    # .get() so that EXIF data without a capture date is reported as missing EXIF,
    # not as an unreadable image
    dateTaken = tags.get("EXIF DateTimeOriginal")
    if dateTaken is None:
        print('No EXIF')
    else:
        print(dateTaken)

https://doc-0k-6g-mymaps.googleusercontent.com/untrusted/hostedimage/f6u64nodcabo26320jjaagdfpc/j4pcli5pfmmbes7q30tllaba3o/1580233500000/8vpJ8s-AWmMZHl1okKvYY29jERWnD2LC/*/2AF2TALobI_ipqW5ICbMvGNPKbt3arQBei5-scjRLfl-blvv71F7hO4ennVw-6-206Gk7q1cILxsQno7vHU9NcxiAeNTA6rQT7GlT56CMeEn4BhkBoSpBUkiztnF1qxAabdgzPVO1pvxHRjj5H0cIUibIYNxzd3E48eZ2WI5Gflbib3JHZJzMTw7qT2dH8t-QKGFMoq7EdPnw1uvA_Fr3a2glYJFoeDPiMK2_7QPJRVGwg3C6aF2wiQcXRjtiYfJlWJzmbcw8cUbUoyi0hNWIjPyUVPYNjppGtw?fife
No EXIF
https://doc-0g-6g-mymaps.googleusercontent.com/untrusted/hostedimage/f6u64nodcabo26320jjaagdfpc/mte4p4bnuof32j3hm3v4na9kec/1580233500000/8vpJ8s-AWmMZHl1okKvYY29jERWnD2LC/*/2AF2TALrYq8wIwpYzie0YHfjTBAo3sZCFSf2zsyFkZ2uGMD3WsZiG4IzgO1X1gSnGlit7EJVcQ0elBxxsMmsw8oKzhhDbR2it9bfDhGMteq82J6F1C7AU5O6N_XInLyagczjh64WPqXvruiIeDdN7QhKKjH7w8k7nwJAgfsEgYE6LW_BEohmfI6E7Fa4rMWPKpYU6pDPUWLX8dD1QNWukxsEVBenpGehoZCx10MxagPI3SnvPvmmDiB7qCZWZOMe0yyfGjqx-gN777gk4-zbGbJH1yI4XVWlbrA?fife
No EXIF
https://doc-10-6g-mymaps.googleusercontent.com

Fill in the remaining missing dates manually, where possible