# Imports & Settings


In [41]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deep_translator import GoogleTranslator
import re 
from math import isnan

# Remove every pandas display limit so that wide frames, long cell values,
# long frames and long sequences are rendered in full.
for _display_option in (
    "display.max_columns",
    "display.max_colwidth",
    "display.max_rows",
    "display.max_seq_items",
):
    pd.set_option(_display_option, None)
del _display_option  # keep the notebook namespace free of the loop variable

In [115]:
# Force the 'type' column to be parsed as text in every file.
# NOTE(review): presumably this avoids pandas guessing a mixed dtype for that
# column on large files -- confirm.
_TYPE_AS_STR = {'type': str}

# Original train / test splits.
train = pd.read_csv('data/train.csv', index_col='id', dtype=_TYPE_AS_STR)
test = pd.read_csv('data/test.csv', index_col='id', dtype=_TYPE_AS_STR)

# Translated splits.
# NOTE(review): train_translated is the only file read without index_col='id';
# confirm that this is intentional.
train_translated = pd.read_csv('data/train_translated.csv', dtype=_TYPE_AS_STR)
test_translated = pd.read_csv('data/test_translated.csv', index_col='id', dtype=_TYPE_AS_STR)

# Combined data sets, in three stages of translation.
combined_data = pd.read_csv('data/combined_data.csv', index_col='id', dtype=_TYPE_AS_STR)
combined_data_translated = pd.read_csv('data/combined_data_translated.csv', index_col='id', dtype=_TYPE_AS_STR)
combined_data_fully_translated = pd.read_csv('data/combined_data_fully_translated.csv', index_col='id', dtype=_TYPE_AS_STR)

# Splitting

In [116]:
# Work on an independent copy: later cells add columns to `data`, and a plain
# alias would silently add them to `combined_data_fully_translated` as well.
data = combined_data_fully_translated.copy()

In [83]:
# Frequency of every distinct raw `country_and_unit` value.
data['country_and_unit'].value_counts()

                             8212
Eesti                        2232
Eesti  Tallinn               1320
Eesti  Tartu                  185
Saksamaa                      172
Vene                          170
NSVL                          143
Eesti  Pärnu                   83
Rootsi                         68
Soome                          45
Eesti  Viljandi                44
Inglismaa                      42
Läti                           41
Ameerika Ühendriigid           38
Eesti  Haapsalu                33
Eesti  Narva                   29
Venemaa                        26
Eesti  Kõmsi                   22
Eesti  Kohtla-Järve            18
Eesti  Kesklinna               17
Eesti  Rakvere                 17
Prantsusmaa                    16
Eesti  Harju                   15
Austria                        14
Kasahstan                      12
Kanada                         11
Eesti  Narva-Jõesuu            11
Eesti  Kuressaare              10
Leedu                           9
Itaalia       

In [109]:
def extract_city_country(item):
    """Split a raw ``country_and_unit`` value into ``[city, country]``.

    Parameters
    ----------
    item : object
        The raw cell value. Only strings carry information; anything else
        (``None``, NaN, numbers, ...) is treated as missing.

    Returns
    -------
    list
        A two-element list ``[city, country]``:

        * non-string, empty or whitespace-only input -> ``[nan, nan]``
        * several words and the text contains ``'Eesti'`` -> the first word is
          the country and the remaining words, joined by single spaces, are
          the city (``'Eesti  Tallinn'`` -> ``['Tallinn', 'Eesti']``)
        * anything else -> ``[nan, <whitespace-normalised text>]``
    """
    missing = float('nan')

    # None, NaN and every other non-text value carry no country information.
    if not isinstance(item, str):
        return [missing, missing]

    # str.split() without arguments trims both ends and treats any run of
    # whitespace (spaces, tabs, non-breaking spaces, ...) as one separator.
    words = item.split()

    # Blank cells must become NaN, not an empty-string "country".
    if not words:
        return [missing, missing]

    normalised = ' '.join(words)

    # Only Estonian entries are split into country + unit; other multi-word
    # values (e.g. 'Ameerika Ühendriigid') are kept whole as the country.
    # NOTE(review): 'Eesti' is matched anywhere in the text, as before; confirm
    # whether the match should be restricted to the first word.
    if len(words) > 1 and 'Eesti' in normalised:
        return [' '.join(words[1:]), words[0]]

    return [missing, normalised]

In [110]:
# Parse every `country_and_unit` value exactly once (the parser returns a
# [city, country] pair), then spread the pairs over the two new columns.
# Iterating the single column avoids a row-wise DataFrame.apply that would
# build a Series per row and call the parser twice for each of them.
_city_country_pairs = [extract_city_country(value) for value in data['country_and_unit']]
data['city'] = [pair[0] for pair in _city_country_pairs]
data['country'] = [pair[1] for pair in _city_country_pairs]

In [120]:
# Preview the first five rows of the raw training split.
train.head(n=5)

Unnamed: 0_level_0,full_nr,name,ks,material,commentary,event_type,location,start,end,before_Christ,country_and_unit,participants_role,participant,parish,text,class,technique,parameter,unit,value,museum_abbr,musealia_mark,musealia_seria_nr,musealia_queue_nr,musealia_additional_nr,collection_mark,collection_queue_nr,collection_additional_nr,element_count,legend,is_original,initial_info,damages,state,color,additional_text,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
232170,ETMM _ 12150:115 Aj 118:44/M20,"Kuno Areng, Bremerhaveni Festwoche medal",118.0,metall,,festivalid,linn Bremerhaven,1979,,ei,Saksamaa,osaleja,"Areng, Kuno",,,,,läbimõõt,cm,4.0,ETMM,_,12150.0,115.0,,Aj,44.0,M20,1.0,,1.0,Festwoche- Breemenhaven,,hea,hall,KUTTER ASTARTE -SCHIFFERGILDE BREMENHAVEN E. V.,medal
2251378,ETMM _ 12584:19 M 102:1/13:13,"Foto-Villem Kapp,foto pühendusega Armilde M,1937",102.0,fotomaterjal,,filmindus ja fotograafia,,1938,,ei,,,,,,,,,,,ETMM,_,12584.0,19.0,,M,1.0,13:13,1.0,"Fotod Villem Kapi ja Juhan Aaviku kogu-\ndesse, ostetud 2013. aastal",,,,hea,,,foto
2070466,,,,,,,,,,,,,,,,,,,,,AM,,,,,F,52.0,,1.0,,,,,hea,,,foto
4085096,ETMM _ Fk 41691/k,"Metspart, Noorsooteater, 1969, osades: Hedvig - Mari Lill, Gina - Silvia Laidla",41691.0,fotomaterjal,,teater,,18.09.1969,,ei,,seosorganisatsioon,Noorsooteater,,,,,filmikaader,60 x 60 mm,1.0,ETMM,_,,,,Fk,,k,1.0,,1.0,TB080321,,rahuldav,,,fotonegatiiv
2697904,ETMM _ 9424 Mo 238:1/62:05,Kiri: Rahvapillimehed: Viiul: Jüri Saal: Kiri A. Pulstile: 16.02.1936,238.0,paber,,,,,,,,,,,,,käsikiri,,,,ETMM,_,9424.0,,,Mo,1.0,62:05:00,1.0,Rahvapillimehed Mo238,1.0,,,rahuldav,,,kiri


In [None]:
# Preview the raw value next to the columns derived from it.
# (The original cell held only a dangling `data.`, which is a SyntaxError.)
data[['country_and_unit', 'city', 'country']].head()