In [1]:
from setup_general import *
from setup_embedding import *

  combined_intermediate_ready = pd.read_csv('./data/general/combined_intermediate_ready.csv', index_col='id', dtype={'type': str})


# Setup


In [2]:
type_indicators = {}
with open('data/type_indicators/type_ind_cut.txt', 'r') as f:
    for line in f:
        a = line.split('\'')
        type = a[1]
        indicators = a[2].split()
        type_indicators[type] = indicators
save_indicators = {}
with open('data/type_indicators/save_indicator.txt', 'r') as f:
    for line in f:
        a = line.split('\'')
        type = a[1]
        indicators = a[2].split()
        save_indicators[type] = indicators

In [3]:
# naive functions for type from text keywords

def filtering(text):
    pred = []
    for type in types:            
        if type in text:
            pred.append(type)
    if ('drawing' in text) or ('sketch' in text) or ('design' in text):
        pred.append('design/drawing/sketch')
    if len(pred) > 0:
        return pred[-1]
    else:
        return -1
    
def indicating(text):
    pred = []
    for type in types:
        for indicator in type_indicators[type]:
            if indicator in text:
                pred.append(type)
    if len(pred) > 0:
        return pred[-1]
    else:
        return -1

def save_indicating(text):
    pred = []
    for type in types:
        if type in save_indicators.keys():
            for indicator in save_indicators[type]:
                if indicator in text:
                    pred.append(type)
    if len(pred) > 0:
        return pred[-1]
    else:
        return -1


In [4]:
# convert to numeric
def replace_value(value: str):
    if pd.isnull(value):
        return value
    return np.float64(value.replace(',', '.'))


# convert to numeric and only keep year part
def replace_start_end(value: str):
    if pd.isnull(value):
        return value
    if re.match('^d?ddd$', value):
        return int(value)
    if re.match('dddd$', value):
        return int(value[-4:])
    elif not value[0].isdigit():
        return int(f'19{value[-2:]}')
    else:
        return nan


def extract_year_from_name(row):
    name = row['name']
    start = row['start']
    if pd.isnull(start) and not pd.isnull(name):
        match = re.search('\d\d\d\d', name)
        if match:
            start = match.group()
    return start


def preprocess_dataframe(df, submission=False):
    categorical_cols = ['material', 'location', 'before_Christ', 'country_and_unit', 'technique', 'parameter',
                        'museum_abbr', 'damages', 'state', 'color', 'event_type', 'collection_mark']
    categorical_cols += ['unit', 'participants_role', 'participant', 'musealia_mark']

    # just keeping track what values are used
    numeric_cols = ['start', 'end', 'value', 'collection_queue_nr', 'is_original', 'ks', 'element_count',
                    'musealia_seria_nr', 'musealia_queue_nr']

    dropped_cols = ['id', 'parish']  # can't use
    dropped_cols += ['full_nr', 'class', 'collection_additional_nr', 'additional_text', 'text', 'initial_info',
                     'musealia_additional_nr']  # 'commentary','name', 'legend'

    if not submission: dropped_cols.append('type')

    df['start'] = df['start'].apply(replace_start_end)
    df['end'] = df['end'].apply(replace_start_end)
    df['value'] = df['value'].apply(replace_value)
    df['start'] = df[['name', 'start']].apply(extract_year_from_name, axis=1)

    df = df.drop(columns=dropped_cols)
    df = pd.get_dummies(df, columns=categorical_cols)
    df = df.fillna(0)
    return df

In [82]:
def extract_label_from_comment(row):
    # comment #################################################
    comment = row['commentary']

    if not pd.isnull(comment):
        comment = str(comment).lower()

        comment_dict = {
            'lakk': 'pitser/templijäljend',
            'must-valge negatiiv': 'fotonegatiiv',
            'pitserilakk': 'pitser/templijäljend',
            'käepide': 'pitsat',
            'перф': 'fotonegatiiv',
            'fotoemulsioon': 'fotomaterjal',
            'plakat':'plakat'
        }
        for key, val in comment_dict.items():
            if comment.startswith(key):
                return val

        if re.match('^\d,\d\d\sg$', comment):
            return 'münt'

        if 'diapositiiv' in comment:
            return 'diapositiiv'

    # name #################################################
    name = row['name']

    if not pd.isnull(name):
        name = str(name).lower()
        if name == ['denaar', 'killing', 'penn', 'schilling', '1/2 örtug', 'dirhem', 'fyrk']:
            return 'münt'

        for val in ['medal', 'plakat', 'märkmed', 'maal', 'kiri', 'kleit', 'kava', 'joonistus', 'graafika', 'dokument',
                    'ajakiri', 'telegramm', 'skulptuur', 'raamat', 'postkaart', 'nukk', 'skulptuur', 'käsikiri']:
            if name.startswith(val):
                return val

        name_dict = {
            'kaustik': 'kaustik/vihik',
            'vihik': 'kaustik/vihik',
            'reprofoto': 'diapositiiv',
        }
        for key, val in name_dict.items():
            if name.startswith(key):
                return val
    return nan


def replace_predictions(labels, pred):
    result = np.array(pred, copy=True)
    for i, label in enumerate(labels):
        if not pd.isnull(label) and label != 0:
            result[i] = label
    return result

# combine models via class-probability combination (soft-voting)


In [6]:
# is the full ds used for submission?
full = True
# submit to 
name = 'red'
sub_name = name + '.csv'

In [7]:
#define models to be used for testing use 03 for submission use full
import pickle
xgb = XGBClassifier()
xgb.load_model('models/xg/xg_est_data_smote100_03.json')

rf = pickle.load(open('./models/rf/train_prep_full_best' , 'rb'))

boost_emb = XGBClassifier()
boost_emb.load_model('models/nlp/xgboost_03.json')



In [8]:
data = test_est_prepared.copy() if full else val_est_prepared.copy()

features = data.drop('type', axis=1)
labels = data.type

if not full:
    # at least xgboost cannot deal with string labels
    label_encoder = LabelEncoder()
    label_encoder = label_encoder.fit(labels)
    labels = label_encoder.transform(labels)
    y_test = labels

X_test = features

In [9]:
results = pd.DataFrame()
results['id'] = X_test.index
results.set_index('id', inplace=True)
if not full: results['type'] = y_test

results['emb'] = [[-1]] * len(results)
results['xg'] = [[-1]] * len(results)

In [10]:
for i,item in enumerate(xgb.predict_proba(X_test)):
    results['xg'].iloc[i] = np.array(item)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['xg'].iloc[i] = np.array(item)


In [11]:
text = test_curie.copy() if full else val_curie.copy()

features = text.drop('type', axis=1)
labels = text.type

#text['pred'] = boost_emb.predict_proba(features)
text['pred'] = [[-1]] * len(features)

In [12]:
for i,item in enumerate(boost_emb.predict_proba(features)):
    text['pred'].iloc[i] = np.array(item)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['pred'].iloc[i] = np.array(item)


In [13]:
for index, item in text.iterrows():
        if index in results.index:
            results.at[index, 'emb'] = item['pred']

# evalaluate

In [14]:
from operator import add
def vote(preds):
    if preds[-1][0] == -1:
        preds = preds[:-1]
    res = np.sum(preds, axis=0)
    return np.argmax(res)

In [15]:
results['prediction'] = results.apply(lambda row: vote([row.xg, row.emb]), axis=1)
if not full:
    results.prediction = results.prediction.replace(type_lookup.id.to_list(), type_lookup.estonian.to_list())

    a = results.prediction.copy().tolist()

    df_submission = pd.read_csv("data/general/val.csv")
    x2 = preprocess_dataframe(df_submission, submission=True)
    # reorder columns + add missing columns + remove extra columns
    x2_labels = x2.apply(extract_label_from_comment, axis=1)    

    results.prediction = replace_predictions(x2_labels, results.prediction)
    submission = pd.DataFrame({'id': results.index ,'type': results.prediction})

    b = results.prediction.copy().tolist()

    count = 0
    for i in range(len(a)):
        if a[i] != b[i]:
            count += 1
            print(a[i], b[i])

    print(count)

    results.type = results.type.replace(type_lookup.id.to_list(), type_lookup.estonian.to_list())

    print(accuracy_score(results.type, results.prediction))
    print(classification_report(results.type, results.prediction))

# submission

In [120]:
def save_extract_label_from_comment(row):
    name = row['name']

    if not pd.isnull(name):
        name = str(name).lower()
        #'plakat', 'maal', 'kiri', 'dokument', 'kava', 'käsikiri', 'postkaart', 'raamat' (slightly off)

        #'medal', 'märkmed', 'kleit', 'joonistus', 'graafika', 'skulptuur', 'nukk', 'telegramm', 'ajakiri' (correct)
        for val in ['medal', 'märkmed', 'kleit', 'joonistus', 'graafika', 'skulptuur', 'nukk', 'telegramm', 'ajakiri']:
            if name.startswith(val):
                return val
        

        name_dict = {
            'kaustik': 'kaustik/vihik',
            'vihik': 'kaustik/vihik'
        }
        for key, val in name_dict.items():
            if name.startswith(key):
                return val
    return nan

if full:
    results.prediction = results.prediction.replace(type_lookup.id.to_list(), type_lookup.estonian.to_list())
    results = results.assign(prediction='not_predicted')
    a = results.prediction.copy().tolist()

    df_submission = pd.read_csv("data/general/test.csv")
    x2 = preprocess_dataframe(df_submission, submission=True)
    # reorder columns + add missing columns + remove extra columns
    x2_labels = x2.apply(save_extract_label_from_comment, axis=1)    

    

    results.prediction = replace_predictions(x2_labels, results.prediction)
    submission = pd.DataFrame({'id': results.index ,'type': results.prediction})

    b = results.prediction.copy().tolist()

    count = 0
    for i in range(len(a)):
        if a[i] != b[i]:
            count += 1
            #print(a[i], b[i])

    print(count, count/6000)


    #submission.groupby('type').nunique()  # predicted classes

    submission.to_csv(f'submissions/{sub_name}', index=False)

162 0.027


: 

In [117]:
(0.003 + 0.00566) / 2

0.004333333333333333

In [None]:
/6000