# Imports & Settings


In [1]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deep_translator import GoogleTranslator
import re 
from math import isnan

# Remove every pandas display limit so frames and sequences are shown in full.
for _display_option in (
    "display.max_columns",
    "display.max_colwidth",
    "display.max_rows",
    "display.max_seq_items",
):
    pd.set_option(_display_option, None)

In [2]:
# Load every intermediate CSV of the project.
# dtype={'type': str} forces the `type` column to be read as strings in every file
# (presumably to avoid pandas inferring mixed dtypes on large files — TODO confirm).
train = pd.read_csv('data/train.csv', index_col='id', dtype={'type': str})
test = pd.read_csv('data/test.csv', index_col='id', dtype={'type': str})
# NOTE(review): this is the only file read without index_col='id' — confirm that is intentional.
train_translated = pd.read_csv('data/train_translated.csv', dtype={'type': str})
test_translated = pd.read_csv('data/test_translated.csv', index_col='id', dtype={'type': str})
combined_data = pd.read_csv('data/combined_data.csv', index_col='id', dtype={'type': str})
combined_data_translated = pd.read_csv('data/combined_data_translated.csv', index_col='id', dtype={'type': str})
combined_data_fully_translated = pd.read_csv('data/combined_data_fully_translated.csv', index_col='id', dtype={'type': str})
prep = pd.read_csv('data/prep.csv', index_col='id', dtype={'type': str})
text = pd.read_csv('data/text.csv', index_col='id', dtype={'type': str})

# Preparation

In [106]:
# Work on a copy so the `prep` frame loaded from data/prep.csv is not modified.
data = prep.copy()

In [42]:
# Free-text columns that are later joined into one string per record.
text_features = [
    'name',
    'commentary',
    'text',
    'legend',
    'initial_info',
    'additional_text',
]

In [107]:
# Replace missing values in the text columns with '' so they can be joined as strings.
data[text_features] = data[text_features].fillna('')

In [91]:
def collect_text(item):
    """Join the text columns of one row into a single string.

    `item` is indexed with the module-level `text_features` list; the selected
    values are joined with single spaces and surrounding whitespace is stripped.
    """
    parts = item[text_features]
    joined = ' '.join(parts)
    return joined.strip()

In [109]:
# Build the combined text column by applying collect_text to every row.
data['text_features'] = data.apply(collect_text, axis=1)

In [138]:
# Preview the six source text columns next to the combined `text_features` column.
data[['name', 'commentary', 'text', 'legend', 'initial_info', 'additional_text', 'text_features']].head(2)

Unnamed: 0_level_0,name,commentary,text,legend,initial_info,additional_text,text_features
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
232170,"Kuno Areng, Bremerhaven Festwoche medal",,,,Festwoche - Breemenhaven,KUTTER ASTARTE -SCHIFFERGILDE BREMENHAVEN E.V.,"Kuno Areng, Bremerhaven Festwoche medal Festwoche - Breemenhaven KUTTER ASTARTE -SCHIFFERGILDE BREMENHAVEN E.V."
2251378,"Photo-Villem Kapp, photo with dedication to Armilde M, 1937",,,"Photos from the collection of Villem Kapi and Juhan Aavik\ndesse, purchased in 2013",,,"Photo-Villem Kapp, photo with dedication to Armilde M, 1937 Photos from the collection of Villem Kapi and Juhan Aavik\ndesse, purchased in 2013"


In [111]:
# Persist the frame, index included.
# NOTE(review): this overwrites data/prep.csv, the file `prep` was loaded from; the
# in-memory `prep` is not refreshed by this cell.
data.to_csv('data/prep.csv')

In [None]:
# Start again from `prep` and copy the fully translated combined data, whose
# `damages` column is read in the next cell.
# NOTE(review): `prep` must already contain `text_features` here, i.e. data/prep.csv
# has to have been re-read after the save above — confirm on a fresh kernel.
data = prep.copy()
with_damages = combined_data_fully_translated.copy()

In [None]:
# Missing texts and missing damage descriptions become a single space so both
# operands of the concatenation below are strings.
data['text_features'] = data['text_features'].fillna(' ')
with_damages['damages'] = with_damages['damages'].fillna(' ')

# Append the damage description to the combined text; pandas matches the two
# Series by index label.
data['text_features'] = data['text_features'] + ' ' + with_damages['damages']

In [None]:
# Write the frame (now including the damage text) back to data/prep.csv, index included.
data.to_csv('data/prep.csv')

In [12]:
# Fresh working copy of `prep` for building the text-only data set.
data = prep.copy()

In [13]:
# Keep only the combined text, the label and the source columns.
# This rebinds `text`, which previously held the frame read from data/text.csv.
text = data[['text_features','type','source']]

In [14]:
# Persist the text-only frame (index included), overwriting data/text.csv.
text.to_csv('data/text.csv')

In [18]:
# Work on a copy of `text` while cleaning it.
data = text.copy()

In [26]:
# Strip surrounding whitespace with the vectorised string accessor.
# Missing values are turned into '' first: calling .strip() on a NaN (a float) raises
# AttributeError, and NaNs appear whenever the column has been round-tripped through
# CSV, because empty strings are read back as NaN.
data['text_features'] = data['text_features'].fillna('').str.strip()

In [27]:
# Drop the rows whose combined text is the empty string.
data = data[data.text_features != '']

In [31]:
# Overwrite data/text.csv with the cleaned frame, index included.
data.to_csv('data/text.csv')

# Check whether the type name is contained in the text (rule-based baseline)

In [5]:
# Working copy of `text` for the rule-based check.
data = text.copy()

In [123]:
# Summarise the `text_features` column of `prep`: entry count, non-null count and dtype.
prep.text_features.info()

<class 'pandas.core.series.Series'>
Int64Index: 20000 entries, 232170 to 2781747
Series name: text_features
Non-Null Count  Dtype 
--------------  ----- 
14900 non-null  object
dtypes: object(1)
memory usage: 312.5+ KB


In [6]:
# Distinct type labels in order of first appearance. Missing labels are dropped
# explicitly: slicing off the last element of unique() only removes NaN when NaN
# happens to come last, and silently discards a real label otherwise.
types = data.type.dropna().unique()
types

array(['medal', 'photo', 'photo negative', 'letter', 'coin', 'label/sign',
       'script, song/vocal music', 'music sheet', 'slide',
       'archaeological find', 'book', 'plan', 'postcard', 'seal/imprint',
       'poster', 'design/drawing/sketch', 'audio recording', 'manuscript',
       'document', 'caricature', 'graphics', 'telegram',
       'manuscript, musical composition', 'notes', 'small print',
       'invitation', 'packaging', 'paper', 'manuscript, sheet music',
       'magazines', 'album', 'country', 'digital image',
       'photo, postcard', 'photographic material', 'drawing', 'crate/box',
       'letter of honor/honorary address', 'calendar', 'tape/ribbon',
       'bag', 'newspaper', 'dish/vessel', 'musical instrument',
       'letter, postcard', 'sculpture', 'printed notes', 'seal', 'suit',
       'certificate', 'sheet/linen', 'folder/booklet', 'jewel', 'doll',
       'photographic negative, photographic negative'], dtype=object)

In [136]:
# Missing texts become '' so the substring checks below always operate on strings.
data['text_features'] = data['text_features'].fillna('')

In [163]:
# Rule-based baseline on the training rows: a type label counts as a match when it
# occurs as a substring of the record's combined text.
multi_counter = 0  # rows whose text contains more than one type label
one_counter = 0    # rows whose text contains exactly one type label
true_counter = 0   # rows with exactly one label in the text that equals the true type

train_rows = data[data.source == 'train']
for text_value, true_type in zip(train_rows.text_features, train_rows.type):
    # The loop variable is called `type_name` rather than `type`, which would shadow
    # the builtin type() for the rest of the kernel session.
    pred = [type_name for type_name in types if type_name in text_value]

    if len(pred) > 1:
        multi_counter += 1
    elif len(pred) == 1:
        one_counter += 1
        if pred[0] == true_type:
            true_counter += 1

# NOTE(review): matching is by substring, so a label that is contained in another
# label (e.g. 'photo' in 'photo negative') makes the row count as a multiple match.
print('multiple types in text - one type in text - one type and true type in text')
print(multi_counter, one_counter, true_counter)
# from 14900 texts

multiple types in text - one type in text - one type and true type in text
483 1642 954


# GPT-3 embeddings


In [3]:
# Rebind `text` to a copy of itself (a later cell adds an embedding column to it).
text = text.copy()

In [6]:
import os
import time

import openai

# The API key is read from the environment so that no secret is stored in the notebook.
# NOTE(review): a literal key used to be committed on this line; it must be treated as
# leaked and revoked.
openai.api_key = os.environ['OPENAI_API_KEY']

count = 0  # number of get_embedding calls so far; printed when a request fails


def get_embedding(text, model="text-similarity-davinci-001"):
    """Return the embedding of `text` computed by the given OpenAI model.

    Newlines in `text` are replaced by spaces before the request. If the request
    fails, the running call count is printed, the function sleeps for 60 seconds
    and retries exactly once; an error raised by the retry propagates to the caller.
    """
    global count
    count += 1
    text = text.replace("\n", " ")
    try:
        result = openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt still
        # stops the cell instead of triggering the 60 second sleep.
        print(count)
        time.sleep(60)
        result = openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']
    return result


# One API request per row of `text`.
text['curie_similarity'] = text.text_features.apply(
    lambda x: get_embedding(x, model='text-similarity-curie-001')
)
# NOTE(review): index=False leaves the frame's index out of this file — confirm that
# the row ids are not needed in data/curie.csv.
text.to_csv('data/curie.csv', index=False)


13


In [19]:
# Save `text`, index included, to data/embedded_1k_reviews.csv.
text.to_csv('data/embedded_1k_reviews.csv', index=True)  

In [3]:
# Read the embedded texts back with the same options as the other CSV loads in this
# notebook: `id` as the index and `type` forced to str. Without index_col, `id`
# becomes an ordinary column and the train/test files written afterwards gain an
# extra unnamed index column.
df = pd.read_csv('data/embedded_1k_reviews.csv', index_col='id', dtype={'type': str})

In [4]:
# Separate the embedded rows by the value of their `source` column.
is_train = df['source'] == 'train'
is_test = df['source'] == 'test'
train_emb = df[is_train]
test_emb = df[is_test]

In [6]:
# Write the train and test subsets to their own files, index included.
train_emb.to_csv('data/train_emb.csv', index=True)
test_emb.to_csv('data/test_emb.csv', index=True)

# Random forest on embeddings

In [8]:
# Load the training embeddings with `id` as the index and `type` read as str.
# This rebinds `df`, which previously held the combined embeddings frame.
df = pd.read_csv('data/train_emb.csv', index_col='id', dtype={'type': str})

In [9]:
import ast

# Each embedding is stored in the CSV as text; parse it back and convert it to a
# NumPy array. ast.literal_eval only accepts Python literals, so unlike eval() it
# cannot execute code that happens to be in the file.
# NOTE(review): the embedding cell of this notebook writes a `curie_similarity`
# column — confirm that data/train_emb.csv really contains `babbage_similarity`.
df['babbage_similarity'] = df.babbage_similarity.apply(ast.literal_eval).apply(np.array)

In [10]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of embeddings and type labels with a fixed seed.
embedding_vectors = list(df.babbage_similarity.values)
type_labels = df.type
X_train, X_test, y_train, y_test = train_test_split(
    embedding_vectors, type_labels, test_size=0.2, random_state=42
)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Seeded with the same value as the train/test split so the report below is
# reproducible from run to run.
clf = RandomForestClassifier(n_estimators=1000, random_state=42)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

# zero_division=0 keeps the 0.00 scores for classes that are never predicted but
# stops scikit-learn from emitting a warning for each of them.
report = classification_report(y_test, preds, zero_division=0)
print(report)

                                              precision    recall  f1-score   support

                                       album       0.00      0.00      0.00         3
                         archaeological find       0.97      0.99      0.98       171
                             audio recording       0.00      0.00      0.00        12
                                         bag       1.00      0.33      0.50         3
                                        book       0.74      0.77      0.75       111
                                    calendar       0.00      0.00      0.00         4
                                  caricature       0.00      0.00      0.00         8
                                 certificate       0.00      0.00      0.00         2
                                        coin       1.00      0.94      0.97        35
                                     country       0.00      0.00      0.00         4
                                   crate/box       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
