# Imports & Settings


In [1]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deep_translator import GoogleTranslator
import re 
from math import isnan

# Lift every pandas display limit so wide and long frames render in full.
for _display_option in (
    "display.max_columns",
    "display.max_colwidth",
    "display.max_rows",
    "display.max_seq_items",
):
    pd.set_option(_display_option, None)

In [205]:
# Every file is read with dtype={'type': str} so the `type` column is always
# parsed as text instead of leaving its dtype to pandas' inference.
def _read_indexed(path):
    """Read a CSV with its `id` column as the index and `type` forced to str."""
    return pd.read_csv(path, index_col='id', dtype={'type': str})


train = _read_indexed('data/train.csv')
test = _read_indexed('data/test.csv')
# NOTE(review): this is the only file read without index_col='id' — confirm
# that is intentional.
train_translated = pd.read_csv('data/train_translated.csv', dtype={'type': str})
test_translated = _read_indexed('data/test_translated.csv')
combined_data = _read_indexed('data/combined_data.csv')
combined_data_translated = _read_indexed('data/combined_data_translated.csv')
combined_data_fully_translated = _read_indexed('data/combined_data_fully_translated.csv')
prep = _read_indexed('data/prep.csv')

# Parameter & Unit & Value

# Finish unit translation/unification & convert values to float

In [184]:
# Work on a copy so the loaded `prep` frame itself is left unmodified.
data = prep.copy()

In [194]:
def _decimal_comma_to_float(raw):
    """Convert a decimal-comma string such as '4,5' to a float.

    Non-string inputs (already-numeric values, NaN) are returned unchanged.
    Raises ValueError if a string is not a valid number after the comma is
    replaced by a dot.
    """
    # isinstance instead of `type(raw) == str` so that str subclasses
    # (e.g. numpy.str_) are converted as well.
    if isinstance(raw, str):
        return float(raw.replace(',', '.'))
    return raw


data['value'] = data['value'].apply(_decimal_comma_to_float)

In [196]:
# Checkpoint: overwrite data/prep.csv with the frame holding the converted values.
# NOTE(review): the in-memory `prep` frame is not refreshed by this write.
data.to_csv('data/prep.csv')

In [206]:
# Start this stage from the checkpoint on disk. `prep` was read before the
# value-to-float stage saved data/prep.csv, so copying the stale in-memory
# frame would silently drop that conversion on a top-to-bottom run.
prep = pd.read_csv('data/prep.csv', index_col='id', dtype={'type': str})
data = prep.copy()

In [32]:
# Normalise unit labels with a single exact-match mapping (whole cell values
# are compared, not substrings).
_unit_fixes = {
    # mistranslated / abbreviated labels
    'e.g': 'mm',
    'pc': 'pieces',
    'page': 'pages',
    # unify units: express this format in mm like the other "W x H mm" formats
    '10 x 15 cm': '100 x 150 mm',
}
data['unit'] = data['unit'].replace(_unit_fixes)

In [33]:
# Checkpoint: overwrite data/prep.csv with the unified unit labels.
data.to_csv('data/prep.csv')

In [207]:
# Frequency of each distinct unit label, most common first.
data['unit'].value_counts()

cm              5200
page            2243
24 x 36 mm       808
60 x 60 mm       391
e.g              386
g                231
90 x 120 mm       55
60 x 95 mm        39
pc                37
100 x 150 mm      12
autograph          7
century            6
130 x 180 mm       6
45 x 60 mm         3
A6                 2
180 x 240 mm       2
60 x 90 mm         1
dm3                1
number             1
10 x 15 cm         1
Name: unit, dtype: int64

# Combine parameter & unit into one label; move width/height from unit into value

In [150]:
# Start this stage from the checkpoint on disk. `prep` is only read once near
# the top of the notebook, so copying it without re-reading would discard the
# value and unit clean-up that earlier stages saved to data/prep.csv.
prep = pd.read_csv('data/prep.csv', index_col='id', dtype={'type': str})
data = prep.copy()

In [151]:
# Use '*' as the placeholder for a missing unit or parameter so both columns
# contain only strings for the steps below.
for _label_column in ('unit', 'parameter'):
    data[_label_column] = data[_label_column].replace(np.nan, '*')

In [152]:
def get_squared(item):
    """Append '²' to a 'W x H <unit>' dimension label.

    Parameters
    ----------
    item : str or any
        A unit label. Labels containing ' x ' are treated as width-by-height
        formats.

    Returns
    -------
    The label with a trailing '²' if it contains ' x ' and does not already
    end with '²'; otherwise the input unchanged. Non-string input (e.g. NaN)
    is returned unchanged instead of raising TypeError.
    """
    if not isinstance(item, str):
        return item
    # The endswith guard keeps the function idempotent, so re-running the
    # notebook cell does not produce 'mm²²'.
    if ' x ' in item and not item.endswith('²'):
        return item + '²'
    return item

In [153]:
# Tag every "W x H <unit>" label with a trailing '²'; other labels are kept.
data['unit'] = data['unit'].apply(get_squared)

In [154]:
def extract_width_height_from_unit_to_value(item):
    """Split a 'W x H <unit>' label into its unit and a [width, height] pair.

    Parameters
    ----------
    item : pandas.Series or sequence
        Two entries in the order (unit, value).

    Returns
    -------
    list
        ``[real_unit, [width, height]]`` when the unit contains ' x '; the
        original value is dropped in that case and width/height are kept as
        the strings found in the label. Otherwise ``[unit, value]`` unchanged.

    Raises
    ------
    IndexError
        If a label containing ' x ' has fewer than four space-separated tokens.
    """
    # Use .iloc for Series: plain item[0] on a label-indexed Series relies on
    # the deprecated positional fallback of Series.__getitem__.
    if isinstance(item, pd.Series):
        unit, value = item.iloc[0], item.iloc[1]
    else:
        unit, value = item[0], item[1]

    # Non-string units (e.g. NaN) cannot hold a format; pass them through.
    if not (isinstance(unit, str) and ' x ' in unit):
        return [unit, value]

    # Expected token layout: [width, 'x', height, unit]
    tokens = unit.split(' ')
    width, height, real_unit = tokens[0], tokens[2], tokens[3]
    return [real_unit, [width, height]]

In [155]:
# Execution order is important: `value` must be filled first, while `unit`
# still holds the full "W x H <unit>" label that the helper looks for. The
# second statement then overwrites `unit` with the bare unit; once that has
# happened the label no longer contains ' x ', so the width/height could not
# be recovered any more.
# NOTE(review): every row is passed to the helper twice (once per statement).
data['value'] = data.apply(lambda item: extract_width_height_from_unit_to_value(item[['unit','value']])[1], axis=1)
data['unit'] = data.apply(lambda item: extract_width_height_from_unit_to_value(item[['unit','value']])[0], axis=1)


In [159]:
# Build the combined label "<parameter> IN <unit>" for every row.
_parameter_prefix = data['parameter'] + ' IN '
data['pu'] = _parameter_prefix + data['unit']

In [161]:
# Checkpoint: overwrite data/prep.csv, now including the `pu` column and the
# reshaped `unit` / `value` columns.
data.to_csv('data/prep.csv')

# Parameter–unit labels (PUs) as single features with respective values

In [175]:
# Start this stage from the checkpoint on disk. The `pu` column only exists in
# the data/prep.csv written by the previous stage, not in the `prep` frame
# that was read near the top of the notebook; without re-reading, the rename
# and get_dummies steps below would not find it.
prep = pd.read_csv('data/prep.csv', index_col='id', dtype={'type': str})
data = prep.copy()

In [178]:
# Preview the first two rows.
# NOTE(review): the stored output below already contains the "... IN ..."
# indicator columns, i.e. it was produced after the get_dummies cell further
# down had been executed — the cells were run out of order.
data.head(2)

Unnamed: 0_level_0,full_nr,name,ks,commentary,event_type,location,start,end,before_Christ,country_and_unit,participants_role,participant,parish,text,class,parameter,unit,value,museum_abbr,musealia_mark,musealia_seria_nr,musealia_queue_nr,musealia_additional_nr,collection_mark,collection_queue_nr,collection_additional_nr,element_count,legend,is_original,initial_info,damages,state,color,additional_text,type,source,city_municipality,country,material_Polish,material_RC Photo Paper,material_a pearl,material_acetate cellulose film,material_albumen paper,material_albumin paper,material_aluminium,material_amber,material_artificial fiber material,material_artificial leather,material_artificial material,material_atlas,material_ballpoint pen ink,material_birch,material_bone,material_brass,material_brocade (clothing variety),material_bronze,material_canvas,material_canvas (type of cloth),material_cardboard,material_cast iron,material_celluloid,material_ceramics,material_chalk,material_chamois leather,material_chamotte,material_charcoal,material_chromogen emulsion,material_chromogen paper,material_clay,material_clothing variety,material_collodion paper,material_colloid paper,material_colour,material_copper,material_cotton,material_crepe,material_crepe (cloth type),material_crystal,material_diffusion paper,material_email,material_emulsion,material_enamel paint,material_faience,material_feather,material_film,material_film (material),material_flint,material_from the bat,material_glass,material_gold,material_granite,material_graphite,material_gypsum,material_handmade paper,material_ink,material_iron,material_kalka,material_knitwear,material_leotard (type of clothing),material_linen,material_mascara,material_metal,material_metal fibers,material_moire (clothing variety),material_movie,material_nan,material_newsprint,material_nitrocellulose film,material_nut,material_oil paint,material_organic matter,material_paper,material_papier mache,material_photo emulsion,material_photo 
material,material_photo paper,material_photo plate,material_photographic material,material_plastic,material_plastic mass,material_plywood,material_porcelain,material_printing ink,material_quartz,material_rubber,material_salt paper,material_silk,material_silver,material_silver gelatin emulsion,material_silver gelatin paper,material_skin,material_slate,material_stone,material_synthetic fibers,material_synthetic material,material_tempera,material_textile,material_tin,material_trillion,material_watercolor paint,material_wax,material_white metal,material_wire,material_wood,material_wood material,material_wooden board,material_wool,material_yarn,technique_(close/together) sewing,technique_addition,technique_aquatint,technique_ballpoint pen,technique_binding techniques,technique_black and white photo,technique_black and white photography,technique_bronzing,technique_brushing,technique_burning,technique_carving,technique_casting,technique_chalk,technique_charcoal,technique_chromogen procedure,technique_collage,technique_collotype,technique_color photo,technique_color photography,technique_colored chalk,technique_colored pencil,technique_coloring,technique_copper engraving,technique_copying,technique_crayon drawing,technique_croaking,technique_crocheting,technique_cutting,technique_cuttlefish,technique_daguerreotype,technique_digital photography,technique_digital printing,technique_drawing,technique_electronic imaging,technique_embroidery,technique_enamelling,technique_engraving,technique_etching,technique_felt tip pen,technique_glass technology,technique_gouache,technique_graphics,technique_graphite,technique_handicraft,technique_handwriting,technique_ink,technique_ink drawing,technique_intarsia,technique_kiln ceramics,technique_knitting,technique_letterpress with raster cliché,technique_linocut,technique_lithography,technique_manuscript,technique_marker,technique_mascara,technique_mezzotinto,technique_mixed 
media,technique_modeling,technique_molding,technique_monotypy,technique_nan,technique_offset printing,technique_oil,technique_oil painting,technique_painting,technique_painting techniques,technique_pannotype,technique_pastel,technique_pen,technique_photographic techniques,technique_photography,technique_photogravure,technique_photolithography,technique_photomechanical printing,technique_polishing,technique_pressing,technique_printing,technique_raster printing,technique_relief,technique_rishel casting heel,technique_sanguine,technique_serigraphy,technique_sewing,technique_sketch,technique_slide,technique_soft varnish,technique_stamping,technique_steel engraving,technique_strengthening,technique_taking pictures,technique_tempera,technique_toning,technique_turning,technique_typescript,technique_typing,technique_undermining,technique_watercolor,technique_wet collodion process,technique_wood engraving,technique_woodcut,technique_writing,* IN *,amount IN pieces,circumference IN cm,diameter IN cm,diameter IN mm,distance IN cm,document volume IN pages,film frame IN mm²,height IN cm,height IN mm,image height IN cm,image width IN cm,length IN cm,length IN mm,negative format IN mm²,page height IN cm,page width IN cm,paper format IN A6,photo format IN mm²,print volume IN pages,size (clothing) IN number,thickness IN cm,thickness IN mm,time/ duration IN century,volume of the book IN pages,volume of wood IN dm3,volume of writing IN autograph,weight IN g,width IN cm,width IN mm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1
232170,ETMM _ 12150:115 Aj 118:44/M20,"Kuno Areng, Bremerhaven Festwoche medal",118.0,,festivals,linn Bremerhaven,1979,,ei,Saksamaa,participant,"Areng, Kuno",,,,diameter,cm,4.0,ETMM,_,12150.0,115.0,,Aj,44.0,M20,1.0,,1.0,Festwoche - Breemenhaven,,good,grey,KUTTER ASTARTE -SCHIFFERGILDE BREMENHAVEN E.V.,medal,train,,Saksamaa,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2251378,ETMM _ 12584:19 M 102:1/13:13,"Photo-Villem Kapp, photo with dedication to Armilde M, 1937",102.0,,filmmaking and photography,,1938,,ei,,,,,,,*,*,,ETMM,_,12584.0,19.0,,M,1.0,13:13,1.0,"Photos from the collection of Villem Kapi and Juhan Aavik\ndesse, purchased in 2013",,,,good,,,photo,train,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [176]:
# Give the combined label column a descriptive name before encoding it.
data = data.rename(columns={'pu': 'parameter_and_unit'})

In [177]:
# One-hot encode the combined "<parameter> IN <unit>" label. With an empty
# prefix and separator each distinct label becomes a column of its own name.
# dtype is pinned to uint8 (the get_dummies default before pandas 2.0) so the
# indicators stay 0/1 on every pandas version instead of turning into
# True/False on pandas >= 2.0.
# NOTE(review): these columns are 0/1 indicators; the measured number itself
# remains in the `value` column.
data = pd.get_dummies(
    data,
    columns=['parameter_and_unit'],
    prefix='',
    prefix_sep='',
    dtype=np.uint8,
)

In [None]:
# Checkpoint: overwrite data/prep.csv with the one-hot encoded frame.
data.to_csv('data/prep.csv')