In [7]:
import pandas as pd
import json
import textdistance
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Load the annotation spreadsheet into `df`.
# NOTE(review): `data_path` is an absolute path on the author's machine; a fresh
# checkout will need it changed (consider a configurable, relative data directory).
# NOTE(review): `df` is rebound to an empty DataFrame further down in this notebook
# ("Put event+coords" section), so the rows loaded here are not used by the later cells.
data_path = '/home/bence/skool/2nd_semester/nlp/traffic_annotation/'
df = pd.read_excel(data_path + 'NLP projekt.xlsx')
# df = df['Corpus']
# Remove rows that contain missing values.
df = df.dropna()

# Utility functions

In [8]:
# Accented Hungarian letters mapped to their unaccented ASCII counterparts.
# Every letter is listed in both cases (the uppercase "Ü" entry was missing before,
# so e.g. "Üllő" kept its first letter accented).
replacements = {
    "Á": "A",
    "á": "a",
    "É": "E",
    "é": "e",
    "Í": "I",
    "í": "i",
    "Ó": "O",
    "ó": "o",
    "Ö": "O",
    "ö": "o",
    "Ő": "O",
    "ő": "o",
    "Ú": "U",
    "ú": "u",
    "Ü": "U",
    "ü": "u",
    "Ű": "U",
    "ű": "u",
}


def clean_text_accents(text, replacements=replacements):
    """Replace accented characters in ``text`` with their unaccented form.

    Parameters
    ----------
    text : str
        Input text; processed one character at a time.
    replacements : dict
        Maps a single character to its replacement string. Characters that
        are not keys of this mapping are kept unchanged.

    Returns
    -------
    str
        ``text`` with every mapped character substituted.
    """
    return "".join(replacements.get(char, char) for char in text)

In [9]:
def get_city_name_from_text(input_str):
    """Pick the capitalised tokens of ``input_str`` as city-name candidates.

    The text is split with ``word_tokenize``; every distinct token whose first
    character is uppercase is treated as a candidate, in order of appearance.

    Parameters
    ----------
    input_str : str
        Text to scan.

    Returns
    -------
    str or list of str
        The candidate itself when exactly one was found. Otherwise a warning
        is printed and the full candidate list is returned (empty when the
        text has no capitalised token, or two or more entries).
    """
    cities_list = []
    for word in word_tokenize(input_str):
        # word[:1] instead of word[0] so an empty token can never raise IndexError
        if word[:1].isupper() and word not in cities_list:
            cities_list.append(word)

    if len(cities_list) == 1:
        return cities_list[0]

    # The previous message claimed "more than 1" even when nothing was found.
    if cities_list:
        print("Warning! There is more than 1 city name in the input")
    else:
        print("Warning! There is no city name in the input")
    print(cities_list)
    return cities_list

## test
# get_city_name_from_text('pisztolyosok Kolozsvaron az operanal')

In [10]:
def get_city_name_list_from_dict(cities_dict):
    """Collect the ``'name_hun'`` field of every record in ``cities_dict``.

    Parameters
    ----------
    cities_dict : dict
        Mapping whose values are dicts that each contain a ``'name_hun'`` key.

    Returns
    -------
    list
        The ``'name_hun'`` values in the iteration order of ``cities_dict``.
        The number of extracted names is also printed.
    """
    city_names = [record['name_hun'] for record in cities_dict.values()]
    print("Extracted %d names" % len(city_names))
    return city_names

In [11]:
def lookup_closest_str(input_str, lookup_list):
    """Return the entry of ``lookup_list`` with the smallest Levenshtein distance to ``input_str``.

    Parameters
    ----------
    input_str : str
        The (possibly misspelled) string to look up.
    lookup_list : iterable of str
        Candidate strings.

    Returns
    -------
    str
        The closest candidate; when several are equally close, the one that
        comes first in ``lookup_list``. An empty string when ``lookup_list``
        is empty.
    """
    # No fixed "999" starting distance any more: with that sentinel the function
    # returned '' whenever every candidate was at distance >= 999.
    # min() keeps the first of several equally close entries, like a strict `<` scan.
    return min(
        lookup_list,
        key=lambda elem: textdistance.levenshtein(elem, input_str),
        default='',
    )

In [4]:
%ls

 cities_coords1.json       'NLP projekt.xlsx'          traffic.ipynb
 cities_coords_clean.json   README.md
 cities_coords.json         traffic_annotation.ipynb


## Get data

In [12]:
# Get the Hungarian stopwords from NLTK and strip their accents, so they can be
# compared against accent-free text.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded already -- confirm.
hun_stopwords = stopwords.words('hungarian')
hun_stopwords = [clean_text_accents(w,replacements=replacements) for w in hun_stopwords]

# JSON with cities
# NOTE(review): the directory listings captured in this notebook show cities_coords*.json
# files but no 'cities_hu_coords.json' -- confirm the file name / working directory
# before running on a fresh kernel.
with open('cities_hu_coords.json', encoding='UTF-8') as f:
    cities_dict = json.load(f)

In [14]:
cities_dict;

In [11]:
city_names = get_city_name_list_from_dict(cities_dict)

Extracted 1047 names


In [56]:
lookup_closest_str('Kolszovar', city_names)

'Kolozsvar'

In [88]:
test_text = 'piszolyosok a Kolozsvaron az opera melletti kis utcaban'

## Get location coords from text

In [108]:
def extract_coords_from_text(input_str, city_names, verbose=False):
    """Find the city mentioned in ``input_str`` and return its coordinates.

    City candidates come from ``get_city_name_from_text``; each candidate is
    fuzzy-matched against ``city_names`` with ``lookup_closest_str``. The
    coordinates are read from the module-level ``cities_dict``.

    Parameters
    ----------
    input_str : str
        Text that mentions a city.
    city_names : list of str
        Known city names to match against.
    verbose : bool, optional
        Print the matched city and its coordinates.

    Returns
    -------
    list
        ``[coords_x, coords_y]`` of the matched city.

    Raises
    ------
    ValueError
        If the text contains no city candidate at all.
    KeyError
        If the matched name cannot be found in ``cities_dict``.
    """
    candidates = get_city_name_from_text(input_str)
    # get_city_name_from_text returns a bare string for exactly one candidate and a
    # list otherwise; passing that list straight into the Levenshtein lookup used to
    # produce an arbitrary city without any error.
    if isinstance(candidates, str):
        candidates = [candidates]
    if not candidates:
        raise ValueError("No city name candidate found in text: %r" % (input_str,))

    # Match every candidate and keep the one whose best match is the closest.
    matches = [(raw, lookup_closest_str(raw, city_names)) for raw in candidates]
    _, city_name = min(
        matches, key=lambda pair: textdistance.levenshtein(pair[1], pair[0])
    )

    if verbose:
        print("Found city name in string: " + city_name)

    record = cities_dict.get(city_name)
    if record is None:
        # city_names holds 'name_hun' values, which are not guaranteed to equal the
        # keys of cities_dict -- fall back to searching the records by 'name_hun'.
        record = next(
            (rec for rec in cities_dict.values() if rec.get('name_hun') == city_name),
            None,
        )
    if record is None:
        raise KeyError(city_name)

    coord_x = record['coords_x']
    coord_y = record['coords_y']

    # NOTE: the coordinates may need to be swapped for the pin to land in the right place
    if verbose:
        print("Coords of {}: {}, {}".format(city_name, coord_x, coord_y))

    return [coord_x, coord_y]

In [109]:
extract_coords_from_text(test_text, city_names, verbose=True)

Found city name in string: Kolozsvar
Coords of Kolozsvar: 23.605117571992587, 46.777913902590704


[23.605117571992587, 46.777913902590704]

## Get event from text

In [118]:
# Event names: accent-free, lowercase Hungarian keywords that the fuzzy lookup matches against.
events = [
    'baleset',      # accident
    'dugo',         # traffic jam
    'torlodas',     # congestion
    'medve',        # bear
    'maci',         # teddy bear (informal word for bear)
    'radar',        # radar
    'rendor',       # police officer
    'pisztolyos'    # literally "one with a pistol"; presumably slang for a speed-gun patrol -- confirm
]
# Sanity check: a misspelled word should still resolve to the closest event name.
lookup_closest_str('mexdve', events)

'medve'

In [124]:
words = word_tokenize(test_text)
words = [w for w in words if w not in hun_stopwords]
words

['piszolyosok', 'Kolozsvaron', 'opera', 'melletti', 'kis', 'utcaban']

In [140]:
def extract_event_from_text(input_str, events):
    """Return the event name that is closest to any word of ``input_str``.

    The text is accent-stripped with ``clean_text_accents``, tokenized with
    ``word_tokenize`` and lowercased; tokens found in the module-level
    ``hun_stopwords`` are ignored. Every remaining token is compared with every
    event name by Levenshtein distance and the event of the overall closest
    pair is returned.

    Parameters
    ----------
    input_str : str
        Text describing a traffic event.
    events : iterable of str
        Known event names; expected to be lowercase and accent-free, like the
        normalised tokens they are compared with.

    Returns
    -------
    str
        The best-matching event name (the first one found on ties), or an
        empty string when there is nothing to compare.
    """
    stopword_set = set(hun_stopwords)

    # The stopwords are accent-free and lowercase, so the text must be normalised the
    # same way; otherwise accented or capitalised stopwords slip through the filter
    # and capitalised event words pay an extra edit.
    tokens = (w.lower() for w in word_tokenize(clean_text_accents(input_str)))
    words = [w for w in tokens if w not in stopword_set]

    event_in_txt = ''
    min_dist = float('inf')  # no fixed sentinel, so any real distance is smaller

    for word in words:
        for elem in events:
            dist = textdistance.levenshtein(elem, word)
            # strict `<` keeps the earliest (word, event) pair on ties
            if dist < min_dist:
                min_dist = dist
                event_in_txt = elem

    return event_in_txt

In [141]:
extract_event_from_text(test_text, events)

'pisztolyos'

## Put event + coords from text into a DataFrame

In [158]:
test_text = 'piszolyosok Kolozsvaron az opera melletti kis utcaban'
coords = extract_coords_from_text(test_text, city_names)
event = extract_event_from_text(test_text, events)

print(event, coords)

pisztolyos [23.605117571992587, 46.777913902590704]


In [183]:
df = pd.DataFrame(columns=['event', 'x_coord', 'y_coord'])

In [185]:
def put_data_from_text_into_df(input_text, df, city_names=city_names, events=events):
    """Extract the event and city coordinates from ``input_text`` and add them to ``df`` as a row.

    Parameters
    ----------
    input_text : str
        Text describing a traffic event in a city.
    df : pandas.DataFrame
        Frame to extend; it is not modified in place.
    city_names : list of str, optional
        Known city names, passed to ``extract_coords_from_text``.
    events : list of str, optional
        Known event names, passed to ``extract_event_from_text``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``df`` followed by one row with the
        keys ``'event'``, ``'x_coord'`` and ``'y_coord'``; the index is renumbered.
    """
    coords = extract_coords_from_text(input_text, city_names)
    event = extract_event_from_text(input_text, events)

    print(event, coords)

    row = {
        'event': event,
        'x_coord': coords[0],
        'y_coord': coords[1]
    }

    print(row)

    # DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame is
    # the documented replacement and likewise returns a new frame.
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

In [186]:
df = put_data_from_text_into_df(test_text, df, city_names=city_names, events=events)

pisztolyos [23.605117571992587, 46.777913902590704]
{'event': 'pisztolyos', 'x_coord': 23.605117571992587, 'y_coord': 46.777913902590704}


In [190]:
# save to .csv
df.to_csv('event_coords_data.csv')

In [191]:
!ls

 cities_coords1.json	    event_coords_data.csv   traffic_annotation.ipynb
 cities_coords_clean.json  'NLP projekt.xlsx'	    traffic.ipynb
 cities_coords.json	    README.md
