# Imports & Settings


In [20]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deep_translator import GoogleTranslator
import re 
from math import isnan
from xgboost import XGBClassifier
from collections import Counter

from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)

In [2]:
# dtype={'type': str} prevents being confused with data type for large data sets
text = pd.read_csv('data/text.csv', index_col='id', dtype={'type': str})
train_text = pd.read_csv('data/train_text.csv', index_col='id', dtype={'type': str})
test_text = pd.read_csv('data/test_text.csv', index_col='id', dtype={'type': str})
#babbage = pd.read_csv('data/embedded_1k_reviews.csv', index_col='id', dtype={'type': str})
#curie = pd.read_csv('data/curie.csv', dtype={'type': str})

# Preparation

In [106]:
data = prep.copy()

In [42]:
text_features = ['name', 'commentary', 'text', 'legend', 'initial_info', 'additional_text']

In [107]:
data[text_features] = data[text_features].fillna('')

In [91]:
def collect_text(item):
    return ' '.join(item[text_features]).strip()

In [109]:
data['text_features'] = data.apply(lambda item: collect_text(item),axis=1)

In [138]:
data[['name', 'commentary', 'text', 'legend', 'initial_info', 'additional_text', 'text_features']].head(2)

Unnamed: 0_level_0,name,commentary,text,legend,initial_info,additional_text,text_features
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
232170,"Kuno Areng, Bremerhaven Festwoche medal",,,,Festwoche - Breemenhaven,KUTTER ASTARTE -SCHIFFERGILDE BREMENHAVEN E.V.,"Kuno Areng, Bremerhaven Festwoche medal Festwoche - Breemenhaven KUTTER ASTARTE -SCHIFFERGILDE BREMENHAVEN E.V."
2251378,"Photo-Villem Kapp, photo with dedication to Armilde M, 1937",,,"Photos from the collection of Villem Kapi and Juhan Aavik\ndesse, purchased in 2013",,,"Photo-Villem Kapp, photo with dedication to Armilde M, 1937 Photos from the collection of Villem Kapi and Juhan Aavik\ndesse, purchased in 2013"


In [111]:
data.to_csv('data/prep.csv')

In [None]:
data = prep.copy()
with_damages = combined_data_fully_translated.copy()

In [None]:
data.text_features = data.text_features.replace(float('nan'), ' ',)
with_damages.damages = with_damages.damages.replace(float('nan'), ' ',)

data.text_features = data.text_features + ' ' + with_damages.damages

In [None]:
data.to_csv('data/prep.csv')

In [12]:
data = prep.copy()

In [13]:
text = data[['text_features','type','source']]

In [14]:
text.to_csv('data/text.csv')

In [16]:
data = text.copy()

In [17]:
data.text_features = data.text_features.apply(lambda x: x.strip())

In [18]:
data = data[data.text_features != '']

In [19]:
data.to_csv('data/text.csv')

# check for type contained in texts  ~ rule based

In [36]:
data = train_text.copy()

In [24]:
df = train_text.copy()
data,test = train_test_split(df, test_size=0.3, random_state=0)

In [81]:
# categorize types
types = [
'sculpture',\
'bag', 'suit', 'doll', 'sheet/linen', 'dish/vessel','jewel', 'tape/ribbon',\
'country',\
'paper','book','magazines', 'album', 'newspaper', 'folder/booklet',\
'invitation',  'calendar',\
'audio recording', 'telegram',\
'packaging', 'crate/box',\
'printed notes', 'small print',\
'seal', 'seal/imprint',\
'letter','letter of honor/honorary address',\
'postcard', 'photo, postcard', 'letter, postcard',\
'manuscript','script, song/vocal music', 'music sheet', 'musical instrument', 'manuscript, musical composition', 'manuscript, sheet music',\
'medal', 'coin', 'label/sign',\
'poster','plan','notes', 'document', 'certificate',\
'graphics', 'drawing', 'design/drawing/sketch','caricature','slide',\
'archaeological find',\
'photo', 'photo negative', 'photographic negative, photographic negative', 'photographic material','digital image'
]

In [5]:
data.text_features = data.text_features.replace(float('nan'), '')

In [16]:
true_counter = 0
false_counter = 0
false = []
for i, item in data.iterrows():    
    local_counter = 0
    pred = []
    for type in types:            
        if type in item.text_features:
            local_counter += 1
            pred.append(type)
    if ('drawing' in item.text_features) or ('sketch' in item.text_features) or ('design' in item.text_features):
        pred.append('design/drawing/sketch')
        local_counter += 1
    if 'negative' in item.text_features:
        pred.append('photo negative')
        local_counter += 1
    if 'seal' in item.text_features:
        pred.append('seal/imprint')
        local_counter += 1
    

    if local_counter > 0:
        if pred[-1] == item.type:           
            true_counter += 1
        else:
            false_counter += 1
            false.append(str(pred)+' '+item.type)
    """
    if local_counter > 1:
        multi_counter += 1
    if local_counter == 1:
        one_counter += 1
        if pred[0] == item.type:           
            true_counter += 1
        else:
            #print(pred, item.type) 
            pass
    """

print(false_counter,true_counter)
# from 14900 texts

864 2398


In [13]:
freq = Counter(false)
freq.most_common(10)

[("['photo'] digital image", 112),
 ("['album'] photo", 90),
 ("['photo'] photo negative", 41),
 ("['book'] music sheet", 22),
 ("['paper'] newspaper", 22),
 ("['photo negative'] photo", 21),
 ("['photo'] slide", 19),
 ("['slide', 'photo'] slide", 16),
 ("['document', 'photo'] manuscript, sheet music", 16),
 ("['paper'] photo", 15)]

In [62]:
ordered_types = list(data.type.value_counts().index)
ordered_types.reverse()

In [56]:
# collect most frquent words for each type
for type in ordered_types:
    all = ''
    for i, item in data.iterrows():        
        if item.type == type:
            all += item.text_features
    freq = Counter(all.split())
    #print(type, freq.most_common(50))

In [80]:
type_indicators = {}
with open('data/type_ind_cut.txt', 'r') as f:
    for line in f:
        a = line.split('\'')
        type = a[1]
        indicators = a[2].split()
        type_indicators[type] = indicators

In [84]:
from sklearn.metrics import classification_report
true = 0
false = 0
nothing = 0
yo = []
ho = []

for i, item in data.iterrows():
    pred = []
    for type in types:
        for indicator in type_indicators[type]:
            if indicator in item.text_features:
                pred.append(type)
    if len(pred) > 0:
        yo.append(item.type)
        ho.append(pred[-1])
        if pred[-1] == item.type:
            true += 1
        else:
            false += 1
    else:
        nothing += 1

print(true, false, nothing)
print(classification_report(yo, ho))


3678 2084 4688
                                              precision    recall  f1-score   support

                                       album       0.00      0.00      0.00        11
                         archaeological find       0.97      0.96      0.96       636
                             audio recording       1.00      0.80      0.89        60
                                         bag       0.56      0.45      0.50        11
                                        book       0.51      0.65      0.58       212
                                    calendar       0.00      0.00      0.00        12
                                  caricature       1.00      0.81      0.89        31
                                 certificate       0.50      0.67      0.57         6
                                        coin       0.88      0.97      0.92       186
                                     country       0.00      0.00      0.00         4
                                   cra

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
# check where idicators are in text
with open('data/type_indicators.txt', 'r') as f:
    for line in f:
        a = line.split('\'')
        cat = a[1]
        indicators = a[2].split()

        counter = {}
            
        for ind in indicators:            
            for i, item in data.iterrows():
                if ind in item.text_features:
                    key = item.type
                    if key not in counter:
                        counter[key] = 0
                    counter[key] += 1
            print(cat, ind, counter)
            print()


sculpture wood {'packaging': 3, 'musical instrument': 3, 'graphics': 1, 'sculpture': 2, 'photo negative': 7, 'photo': 17, 'magazines': 1, 'dish/vessel': 3, 'bag': 1, 'crate/box': 2, 'medal': 1, 'doll': 2, 'book': 2}

sculpture sculpture {'packaging': 3, 'musical instrument': 3, 'graphics': 1, 'sculpture': 6, 'photo negative': 23, 'photo': 18, 'magazines': 1, 'dish/vessel': 3, 'bag': 1, 'crate/box': 2, 'medal': 1, 'doll': 2, 'book': 2}

bag leather {'book': 4, 'graphics': 5, 'photo': 1, 'letter of honor/honorary address': 1, 'bag': 2, 'label/sign': 1, 'music sheet': 2, 'album': 2, 'doll': 1, 'magazines': 1, 'document': 1}

bag pocket {'book': 4, 'graphics': 5, 'photo': 2, 'letter of honor/honorary address': 1, 'bag': 5, 'label/sign': 1, 'music sheet': 2, 'album': 2, 'doll': 1, 'magazines': 1, 'document': 1, 'suit': 1}

bag bag {'book': 4, 'graphics': 5, 'photo': 2, 'letter of honor/honorary address': 1, 'bag': 15, 'label/sign': 1, 'music sheet': 2, 'album': 2, 'doll': 1, 'magazines': 1,

# try with most frequent words only

In [31]:
words_per_type = {}
rem_list = ['The','the','in','of','a','and','on','from','is','at','by','with','to','for','as','an','or','are','this','that','be','which','it','its','was','has','have','had','were','their','they','their','there','these','those','such','such','but','not','no','also','all','any','both','each','either','neither','one','other','another','some','such','what','when','where','which','who','whom','whose','why','will','would','can','could','may','might','must','shall','should','will','would','about','above','across','after','against','along','among','around','before','behind','below','beneath','beside','between','beyond','during','except','for','from','in','inside','into','like','near','of','off','on','onto','out','outside','over','past','since','through','to','toward','under','until','up','upon','with','within','without','and/or','A']
for type in types:
    all = ''
    for i, item in data.iterrows():        
        if item.type == type:
            all += item.text_features
    freq = Counter(all.split())
    most = freq.most_common(20)
    copy = most.copy()
    for i in copy:
        if i[0] in rem_list:
            most.remove(i)
    words_per_type[type] = most

In [33]:
true = 0
false = 0
for i, item in test.iterrows():
    cat = ''
    max = 0
    for type in types:
        counter = 0
        for word in words_per_type[type]:
            if word[0] in item.text_features:
                counter += 1
        if counter > max:
            max = counter
            cat = type
    if cat == item.type:
        true += 1
    else:
        false += 1

print(true,false)



1148 1987


In [32]:
words_per_type

{'sculpture': [('bust', 2),
  ('ActHero', 1),
  ('Socialist', 1),
  ('Labor', 1),
  ('sm.', 1),
  ('Konijärvboat', 1),
  ('modelboat', 1),
  ('modelsculpture,', 1),
  ('Kaarel', 1),
  ('Karm,', 1),
  ('1946,', 1),
  ('patinated', 1),
  ('Purchased', 1),
  ('Tea', 1)],
 'bag': [('Estonian', 4),
  ('bag', 3),
  ('Baltic', 3),
  ('Regatta', 3),
  ('Tallinn', 3),
  ('plastic', 3),
  ('handbag', 3),
  ('front', 2),
  ('attached', 2)],
 'suit': [('dress', 15),
  ('Estonian', 6),
  ('made', 6),
  ('Dress', 5),
  ('M.', 5)],
 'doll': [('-', 16),
  ('Puppet', 4),
  ('Tõnu', 4),
  ('Riho', 4),
  ('lemur', 4),
  ('light', 4),
  ('Mouse', 3),
  ('Theater', 3),
  ('E.', 3),
  ('head', 3),
  ('silk', 3),
  ('Boy', 2),
  ('Estonian', 2),
  ('National', 2)],
 'sheet/linen': [('white', 7),
  ('Made', 6),
  ('cotton', 5),
  ('fabric,', 5),
  ('Tallinn', 5),
  ('embroidered', 4),
  ('decorated', 4),
  ('three', 4),
  ('red', 4)],
 'dish/vessel': [('handle.', 3),
  ('small', 3),
  ('Received', 3),
  ('sid

# gpt3 embeddings


In [3]:
text = text.copy()

In [6]:
import openai
import time
openai.api_key = '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'
count = 0
def get_embedding(text, model="text-similarity-davinci-001"):
    global count
    count += 1
    text = text.replace("\n", " ")
    try:
        result = openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']
    except:
        print(count)
        time.sleep(60)
        result = openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']
    return result
 
text['curie_similarity'] = text.text_features.apply(lambda x: get_embedding(x, model='text-similarity-curie-001'))
text.to_csv('data/curie.csv', index=False)   


13
52
97
148
185
228
277
323
359
398
441
477
512
554
597
636
675
712
747
786
831
868
911
951
997
1046
1097
1137
1172
1209
1254
1293
1327
1362
1409
1448
1483
1530
1573
1608
1639
1681
1725
1769
1805
1857
1905
1946
2000
2042
2081
2124
2158
2193
2225
2264
2304
2355
2394
2436
2479
2513
2555
2596
2629
2668
2709
2759
2809
2839
2884
2927
2963
3010
3054
3103
3152
3198
3246
3296
3338
3379
3425
3466
3512
3548
3591
3640
3681
3720
3766
3821
3862
3904
3948
3989
4034
4078
4117
4147
4178
4212
4250
4280
4324
4370
4406
4436
4484
4518
4559
4595
4648
4683
4722
4754
4805
4852
4900
4950
4988
5036
5072
5119
5157
5196
5224
5257
5298
5350
5394
5427
5468
5513
5554
5592
5640
5679
5718
5754
5801
5850
5898
5934
5975
6022
6054
6106
6142
6177
6220
6260
6299
6326
6378
6423
6459
6506
6549
6592
6637
6677
6725
6764
6807
6860
6900
6952
7001
7035
7076
7125
7166
7213
7264
7309
7354
7404
7449
7483
7524
7559
7593
7633
7682
7736
7775
7814
7862
7907
7956
8002
8042
8073
8111
8158
8204
8245
8272
8314
8352
8397
8433
8470
8516
856

In [16]:
df = curie.copy()

In [17]:
train_emb = df[df.source == 'train']
test_emb = df[df.source == 'test']

In [18]:
train_emb.to_csv('data/train_curie.csv', index=True)
test_emb.to_csv('data/test_curie.csv', index=True)

# rf on embedding

In [27]:
df = pd.read_csv('data/train_curie.csv', index_col='id', dtype={'type': str})

In [5]:
df['curie_similarity'] = df.curie_similarity.apply(eval).apply(np.array)

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
 
features = list(df.curie_similarity.values)
labels = df.type
# at least xgboost cannot deal with string labels
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(labels)
labels = label_encoder.transform(labels)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=0)

In [21]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deep_translator import GoogleTranslator
import re 
from math import isnan
import wandb
from xgboost import XGBClassifier

# utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)

df = pd.read_csv('data/train_curie.csv', index_col='id', dtype={'type': str})
df['curie_similarity'] = df.curie_similarity.apply(eval).apply(np.array)
print('eval')

features = list(df.curie_similarity.values)
labels = df.type
# at least xgboost cannot deal with string labels
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(labels)
labels = label_encoder.transform(labels)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=0)

bst = XGBClassifier(random_state=0)
print('run')
# fit model
bst.fit(X_train, y_train)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
 
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

ValueError: could not convert string to float: '[0.01085097 0.01198783 0.00867431 ... 0.0049957  0.02543602 0.01269028]'

In [1]:
clf.save_model('models/curie_model.json')

NameError: name 'clf' is not defined