# Imports & Settings


In [41]:
# Imports
import re
from math import isnan

import numpy as np
import pandas as pd
from deep_translator import GoogleTranslator
from matplotlib import pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer

# Lift every pandas display limit used in this notebook (None = unlimited).
for display_option in (
    "display.max_columns",
    "display.max_colwidth",
    "display.max_rows",
    "display.max_seq_items",
):
    pd.set_option(display_option, None)

In [178]:
# The 'type' column is forced to str in every file so pandas does not have to
# infer a dtype for it (the inference can get confused on large data sets).
_type_as_str = {'dtype': {'type': str}}
_indexed_by_id = {'index_col': 'id', **_type_as_str}

train = pd.read_csv('data/train.csv', **_indexed_by_id)
test = pd.read_csv('data/test.csv', **_indexed_by_id)
# NOTE(review): this is the only file read without index_col='id' - confirm that is intended.
train_translated = pd.read_csv('data/train_translated.csv', **_type_as_str)
test_translated = pd.read_csv('data/test_translated.csv', **_indexed_by_id)
combined_data = pd.read_csv('data/combined_data.csv', **_indexed_by_id)
combined_data_translated = pd.read_csv('data/combined_data_translated.csv', **_indexed_by_id)
combined_data_fully_translated = pd.read_csv('data/combined_data_fully_translated.csv', **_indexed_by_id)
# NOTE(review): data/prep.csv is also written by later cells of this notebook,
# so this read needs a copy of that file from an earlier run.
prep = pd.read_csv('data/prep.csv', **_indexed_by_id)

# Splitting

# country_and_unit

In [116]:
# Work on a copy so the loaded combined_data_fully_translated frame is not modified.
data = combined_data_fully_translated.copy()

In [131]:
def extract_city_country(item):
    """Split a ``country_and_unit`` value into ``[city_municipality, country]``.

    Parameters
    ----------
    item : object
        Raw cell value. Anything that is not a string (for example the float
        NaN used for missing cells) is treated as missing.

    Returns
    -------
    list
        A two-element list ``[city_municipality, country]``:

        * ``[nan, nan]`` when ``item`` is not a string or is blank;
        * ``[remaining words, first word]`` when the cleaned value contains a
          space and the substring ``'Eesti'``;
        * ``[nan, cleaned value]`` otherwise.
    """
    missing = float('nan')

    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    if not isinstance(item, str):
        return [missing, missing]

    # Trim the value and collapse runs of spaces into a single space.
    item = re.sub(' +', ' ', item.strip())

    # There are some empty (non-NaN) values.
    if item == '':
        return [missing, missing]

    if ' ' in item and 'Eesti' in item:
        # The first word is taken as the country, the rest as the unit.
        country, *unit_words = item.split(' ')
        return [' '.join(unit_words), country]

    return [missing, item]

In [132]:
# Parse every country_and_unit value once and fan the resulting
# [city_municipality, country] pairs out into two columns, instead of calling
# the parser twice per row through a row-wise DataFrame.apply.
city_country_pairs = [extract_city_country(value) for value in data['country_and_unit']]
data['city_municipality'] = [pair[0] for pair in city_country_pairs]
data['country'] = [pair[1] for pair in city_country_pairs]

In [141]:
# Checkpoint: write the frame, now including city_municipality and country, to data/prep.csv.
data.to_csv('data/prep.csv')

# material

In [165]:
# Reload the checkpoint from disk instead of copying `prep`: `prep` is read in
# the loading cell, before data/prep.csv is rewritten above, so on a
# top-to-bottom run it would still hold the previous run's contents.
data = pd.read_csv('data/prep.csv', index_col='id', dtype={'type': str})

In [159]:
# Number of distinct raw material values, counting missing as one value.
data['material'].nunique(dropna=False)

117

In [166]:
# Missing materials become the literal string 'nan' so that the string
# handling in the following cells also works for those rows.
data['material'] = data['material'].fillna('nan')

In [167]:
# Turn each '>'-separated material string into a list of its individual values.
data['material'] = data['material'].map(lambda materials: materials.split('>'))

In [171]:
# One-hot encode the per-row material lists: one indicator column per distinct
# material value, named after that value.
# https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list
mlb = MultiLabelBinarizer()
material_flags = pd.DataFrame(
    mlb.fit_transform(data['material']),
    columns=mlb.classes_,
    index=data.index,
)

# Swap the list column for the indicator columns in explicit steps, rather
# than popping it from `data` inside the argument of a join on the same frame.
data = data.drop(columns='material').join(material_flags)

In [176]:
# Checkpoint: write the frame, now with the one-hot material columns, to data/prep.csv.
data.to_csv('data/prep.csv')

# technique

In [179]:
# Reload the checkpoint from disk instead of copying `prep`: `prep` is read in
# the loading cell, before data/prep.csv is rewritten above, so on a
# top-to-bottom run it would still hold the previous run's contents.
data = pd.read_csv('data/prep.csv', index_col='id', dtype={'type': str})

In [180]:
# Show the first row of `data` to inspect which columns it currently has.
data.head(1)

Unnamed: 0_level_0,full_nr,name,ks,commentary,event_type,location,start,end,before_Christ,country_and_unit,participants_role,participant,parish,text,class,technique,parameter,unit,value,museum_abbr,musealia_mark,musealia_seria_nr,musealia_queue_nr,musealia_additional_nr,collection_mark,collection_queue_nr,collection_additional_nr,element_count,legend,is_original,initial_info,damages,state,color,additional_text,type,source,city_municipality,country,Polish,RC Photo Paper,a pearl,acetate cellulose film,albumen paper,albumin paper,aluminium,amber,artificial fiber material,artificial leather,artificial material,atlas,ballpoint pen ink,birch,bone,brass,brocade (clothing variety),bronze,canvas,canvas (type of cloth),cardboard,cast iron,celluloid,ceramics,chalk,chamois leather,chamotte,charcoal,chromogen emulsion,chromogen paper,clay,clothing variety,collodion paper,colloid paper,colour,copper,cotton,crepe,crepe (cloth type),crystal,diffusion paper,email,emulsion,enamel paint,faience,feather,film,film (material),flint,from the bat,glass,gold,granite,graphite,gypsum,handmade paper,ink,iron,kalka,knitwear,leotard (type of clothing),linen,mascara,metal,metal fibers,moire (clothing variety),movie,nan,newsprint,nitrocellulose film,nut,oil paint,organic matter,paper,papier mache,photo emulsion,photo material,photo paper,photo plate,photographic material,plastic,plastic mass,plywood,porcelain,printing ink,quartz,rubber,salt paper,silk,silver,silver gelatin emulsion,silver gelatin paper,skin,slate,stone,synthetic fibers,synthetic material,tempera,textile,tin,trillion,watercolor paint,wax,white metal,wire,wood,wood material,wooden board,wool,yarn
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1
232170,ETMM _ 12150:115 Aj 118:44/M20,"Kuno Areng, Bremerhaven Festwoche medal",118.0,,festivals,linn Bremerhaven,1979,,ei,Saksamaa,participant,"Areng, Kuno",,,,,diameter,cm,4,ETMM,_,12150.0,115.0,,Aj,44.0,M20,1.0,,1.0,Festwoche - Breemenhaven,,good,grey,KUTTER ASTARTE -SCHIFFERGILDE BREMENHAVEN E.V.,medal,train,,Saksamaa,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [181]:
# List the distinct raw values of the technique column (missing included).
data['technique'].unique()


array([nan, 'manuscript', 'enamelling', 'black and white photography',
       'handwriting', 'printing', 'copying', 'oil', 'pen', 'watercolor',
       'typing', 'drawing', 'mixed media', 'soft varnish',
       'photography>black and white photography', 'lithography',
       'mascara', 'painting',
       'photography>black and white photography>wet collodion process>pannotype',
       'sketch', 'photomechanical printing>photolithography>collotype',
       'color photography', 'ink', 'copper engraving', 'etching',
       'sewing', 'taking pictures', 'writing',
       'photography>color photography>chromogen procedure', 'gouache',
       'felt tip pen', 'bronzing', 'monotypy', 'digital photography',
       'color photo', 'painting techniques', 'photography',
       'photographic techniques', 'linocut', 'handicraft',
       'steel engraving', 'turning', 'ballpoint pen', 'ink drawing',
       'stamping', '(close/together) sewing', 'black and white photo',
       'pastel', 'toning', 'crochet