credits to Kaspar Kadalipp

# Imports

In [1]:
import numpy as np
import pandas as pd
import re
from numpy import nan
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Preprocessing

In [2]:
# convert to numeric
def replace_value(value: str):
    """Convert a decimal-comma number to ``np.float64``.

    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged. Strings have ``,`` replaced by ``.`` before conversion.
    Values that are already numeric are converted directly instead of
    failing on the string-only ``.replace`` call.
    """
    if pd.isnull(value):
        return value
    if isinstance(value, str):
        value = value.replace(',', '.')
    return np.float64(value)


# convert to numeric and only keep year part
def replace_start_end(value: str):
    """Reduce a date-like string to its year.

    Rules, applied in order:

    * missing values are returned unchanged;
    * a string made only of 3 or 4 digits is the year itself;
    * a string ending in 4 digits yields those 4 digits;
    * a string that does not start with a digit but ends in 2 digits is
      read as a 20th-century year (``19xx``);
    * anything else yields ``nan``.

    Returns an ``int`` year, ``nan``, or the original missing value.
    """
    if pd.isnull(value):
        return value
    if not value:
        # An empty string carries no year and would break the value[0] check.
        return nan

    # Whole value is a 3- or 4-digit year.
    if re.fullmatch(r'\d?\d\d\d', value):
        return int(value)

    # Full date with a trailing 4-digit year; \Z avoids matching before a
    # trailing newline the way $ would.
    long_year = re.search(r'\d{4}\Z', value)
    if long_year:
        return int(long_year.group())

    # Text-leading value with a 2-digit year suffix.
    short_year = re.search(r'\d\d\Z', value)
    if short_year and not value[0].isdigit():
        return int(f'19{short_year.group()}')

    return nan


def extract_year_from_name(row):
    """Fill a missing ``start`` with the first 4-digit number in ``name``.

    ``row`` must support ``row['name']`` and ``row['start']``. When
    ``start`` is already present it is returned unchanged. When it is
    missing and ``name`` contains four consecutive digits, those digits are
    returned as an ``int`` so the result has the same numeric type as years
    parsed elsewhere. Otherwise the original (missing) ``start`` is
    returned.
    """
    start = row['start']
    name = row['name']
    if pd.isnull(start) and not pd.isnull(name):
        # str() guards against non-string names, which re.search rejects.
        match = re.search(r'\d{4}', str(name))
        if match:
            return int(match.group())
    return start


def preprocess_dataframe(df, submission=False):
    """Turn the raw museum-item table into a model-ready feature frame.

    Steps:

    1. parse ``start``/``end`` into years and ``value`` into a float;
    2. fill a missing ``start`` from a year found in ``name``;
    3. drop unusable columns (plus the ``type`` target unless
       ``submission`` is true);
    4. one-hot encode the categorical columns;
    5. replace every remaining missing value with ``0``.

    The input frame is copied first, so the caller's ``df`` is not
    modified.

    Args:
        df: raw DataFrame containing the columns referenced below.
        submission: when true, ``type`` is not added to the dropped
            columns.

    Returns:
        A new DataFrame; the text columns ``commentary``, ``name`` and
        ``legend`` are not dropped here.
    """
    categorical_cols = ['material', 'location', 'before_Christ', 'country_and_unit', 'technique', 'parameter',
                        'museum_abbr', 'damages', 'state', 'color', 'event_type', 'collection_mark']
    categorical_cols += ['unit', 'participants_role', 'participant', 'musealia_mark']

    # Numeric columns that are used as-is (listed for reference only):
    # start, end, value, collection_queue_nr, is_original, ks, element_count,
    # musealia_seria_nr, musealia_queue_nr

    dropped_cols = ['id', 'parish']  # can't use
    dropped_cols += ['full_nr', 'class', 'collection_additional_nr', 'additional_text', 'text', 'initial_info',
                     'musealia_additional_nr']  # 'commentary','name', 'legend'

    if not submission:
        dropped_cols.append('type')

    # Work on a copy so the column assignments below do not leak into the
    # caller's frame.
    df = df.copy()

    df['start'] = df['start'].apply(replace_start_end)
    df['end'] = df['end'].apply(replace_start_end)
    df['value'] = df['value'].apply(replace_value)
    df['start'] = df[['name', 'start']].apply(extract_year_from_name, axis=1)

    df = df.drop(columns=dropped_cols)
    df = pd.get_dummies(df, columns=categorical_cols)
    df = df.fillna(0)
    return df

# Extract labels from text features: commentary and name (legend is not used by the rules below)

In [3]:
# Comment prefixes and the label each one implies (checked in this order).
_COMMENT_PREFIX_LABELS = {
    'lakk': 'pitser/templijäljend',
    'must-valge negatiiv': 'fotonegatiiv',
    'pitserilakk': 'pitser/templijäljend',
    'käepide': 'pitsat',
    'перф': 'fotonegatiiv',
    'fotoemulsioon': 'fotomaterjal',
    'plakat': 'plakat',
}

# Names that, matched exactly, identify a coin.
_COIN_NAMES = {'denaar', 'killing', 'penn', 'schilling', '1/2 örtug', 'dirhem', 'fyrk'}

# Name prefixes that are themselves the label.
_NAME_PREFIXES = ('medal', 'plakat', 'märkmed', 'maal', 'kiri', 'kleit', 'kava', 'joonistus', 'graafika',
                  'dokument', 'ajakiri', 'telegramm', 'skulptuur', 'raamat', 'postkaart', 'nukk', 'käsikiri')

# Name prefixes that map to a differently spelled label.
_NAME_PREFIX_LABELS = {
    'kaustik': 'kaustik/vihik',
    'vihik': 'kaustik/vihik',
    'reprofoto': 'diapositiiv',
}


def extract_label_from_comment(row):
    """Derive a type label from a row's free-text fields using fixed rules.

    ``row`` must support ``row['commentary']`` and ``row['name']``. Both
    texts are lower-cased before matching. Commentary rules are tried
    first, then name rules; the first rule that matches wins.

    Returns:
        The label string, or ``nan`` when no rule matches.
    """
    # comment #################################################
    comment = row['commentary']

    if not pd.isnull(comment):
        comment = str(comment).lower()

        for prefix, label in _COMMENT_PREFIX_LABELS.items():
            if comment.startswith(prefix):
                return label

        # A bare weight such as "1,23 g" is how coins are described.
        if re.match(r'^\d,\d\d\sg$', comment):
            return 'münt'

        if 'diapositiiv' in comment:
            return 'diapositiiv'

    # name #################################################
    name = row['name']

    if not pd.isnull(name):
        name = str(name).lower()

        # Membership test: comparing the string to the whole list with ==
        # could never be true.
        if name in _COIN_NAMES:
            return 'münt'

        for prefix in _NAME_PREFIXES:
            if name.startswith(prefix):
                return prefix

        for prefix, label in _NAME_PREFIX_LABELS.items():
            if name.startswith(prefix):
                return label

    return nan

# Train model

In [5]:
# Load the training table. The text columns listed in nlp_features are read by
# the rule-based labeler and removed from the model's feature frames below.
df = pd.read_csv("data/general/train.csv")
nlp_features = ['legend', 'name', 'commentary']

# preprocess: x is the feature frame, y the single-column 'type' target
x = preprocess_dataframe(df)
y = df[['type']]
# hold out 30% of the rows for validation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# extract rule-based labels for the validation rows while the text columns
# are still present, then drop those columns from every feature frame
labels = X_test.apply(extract_label_from_comment, axis=1)
X_train = X_train.drop(columns=nlp_features)
X_test = X_test.drop(columns=nlp_features)
x = x.drop(columns=nlp_features)

In [None]:
# Preview the first rows of the training features (notebook display only).
X_train.head()

Unnamed: 0,ks,start,end,value,musealia_seria_nr,musealia_queue_nr,collection_queue_nr,element_count,is_original,material_RC fotopaber,...,"participant_Пильский, Петръ",participant_Польза,"participant_Преображенский, А.В.",participant_Просвещение,participant_Редакция Наука и Жизнь,participant_Советский Композитор,"participant_Стасов, В.В.",participant_Универсальная Библиотека,"participant_Юзовский, Ю",musealia_mark__
6066,473.0,1970,0.0,59.5,8540.0,0.0,13.0,1.0,1.0,0,...,0,0,0,0,0,0,0,0,0,1
13352,0.0,0,0.0,0.0,5475.0,0.0,1.0,1.0,1.0,0,...,0,0,0,0,0,0,0,0,0,1
6444,174.0,0,0.0,41.5,8974.0,0.0,2.0,1.0,1.0,0,...,0,0,0,0,0,0,0,0,0,1
12316,4094.0,0,0.0,2.8,6514.0,0.0,0.0,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1
1834,0.0,0,0.0,12.0,23607.0,54.0,0.0,1.0,1.0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# train model: random forest with a fixed seed, fit on the training split
model = RandomForestClassifier(max_depth=800, n_estimators=200, random_state=0)
# y_train is a one-column frame; ravel() flattens it to the 1-D target
model.fit(X_train, y_train.values.ravel())
# validate on the held-out split
y_pred = model.predict(X_test)
print(f'RF accuracy on validation data is: {accuracy_score(y_test, y_pred)}')

def replace_predictions(labels, pred):
    """Override model predictions with rule-based labels where available.

    Args:
        labels: iterable of per-row labels, aligned by position with
            ``pred``; entries that are null or equal to ``0`` mean "no
            label".
        pred: sequence of model predictions.

    Returns:
        A new NumPy array: a copy of ``pred`` with each position replaced
        by the corresponding label when one is present. ``pred`` itself is
        not modified.
    """
    result = np.array(pred, copy=True)
    # Fixed-width string arrays silently truncate longer strings on
    # assignment; switch to object dtype so labels are stored in full.
    if result.dtype.kind in 'US':
        result = result.astype(object)
    for i, label in enumerate(labels):
        if not pd.isnull(label) and label != 0:
            result[i] = label
    return result

# accuracy with extracted labels: rule-based labels from the text columns
# take precedence over the model's predictions on the validation rows
y_pred_new = replace_predictions(labels, y_pred)  # more accurate data in comments
print(f'New accuracy on validation data is: {accuracy_score(y_test, y_pred_new)}')

# Submission

In [None]:
# NOTE(review): the training data is read from "data/general/train.csv" but
# this path has no directory prefix — confirm the file location.
df_submission = pd.read_csv("test.csv")
x2 = preprocess_dataframe(df_submission, submission=True)
# compute rule-based labels first, while x2 still has the text columns
x2_labels = x2.apply(extract_label_from_comment, axis=1)
# reorder columns + add missing columns (filled with 0) + remove extra columns
# so x2 matches the training feature frame x
x2 = x2.reindex(columns=x.columns).fillna(0)

# best result (disabled): refit the same model on all training rows
#model = RandomForestClassifier(max_depth=800, n_estimators=200, random_state=0).fit(x, y.values.ravel())
y_submission = model.predict(x2)
# rule-based labels override the model's predictions where present
y_submission = replace_predictions(x2_labels, y_submission)
submission = pd.DataFrame({'id': df_submission['id'], 'type': y_submission})
# notebook display only: the result is not stored
submission.groupby('type').nunique()  # predicted classes

# write the id/type pairs without the DataFrame index
submission.to_csv('submission.csv', index=False)