# Finishing off the prep data

## Imports

In [1]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Switch off pandas' display truncation: with None as the limit, all columns,
# full cell contents, all rows and all items of long sequences are printed.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None
pd.options.display.max_seq_items = None

In [2]:
# Load the raw, translated, combined and prepared datasets from data/.
# dtype={'type': str} forces the 'type' column to be read as strings instead of
# letting pandas infer its dtype (presumably to avoid mixed-type inference on
# the larger files — TODO confirm).
# NOTE(review): train_translated is the only frame read without index_col='id';
# confirm this is intentional.
train = pd.read_csv('data/train.csv', index_col='id', dtype={'type': str})
test = pd.read_csv('data/test.csv', index_col='id', dtype={'type': str})
train_translated = pd.read_csv('data/train_translated.csv', dtype={'type': str})
test_translated = pd.read_csv('data/test_translated.csv', index_col='id', dtype={'type': str})
combined_data = pd.read_csv('data/combined_data.csv', index_col='id', dtype={'type': str})
combined_data_translated = pd.read_csv('data/combined_data_translated.csv', index_col='id', dtype={'type': str})
combined_data_fully_translated = pd.read_csv('data/combined_data_fully_translated.csv', index_col='id', dtype={'type': str})
prep = pd.read_csv('data/prep.csv', index_col='id', dtype={'type': str})

  prep = pd.read_csv('data/prep.csv', index_col='id', dtype={'type': str})


## Add new column for each feature


Features to add a column for, indicating whether the feature has a value:
  
    - location
    - participant
    - damages
    - collection_additional_nr
    - participants_role
    - event_type
    - city_municipality
    - country
    - musealia_additional_nr
    - is_original
    - 'class_manufacturer's name', 'class_name on the source document' -> find original feature name 
    - museum_abbr
    - color
    - collection_mark
    - state
    - technique
    - material


    Unsure:
    - 'startYear', 'startMonth','startDay', 'endYear', 'endMonth', 'endDay'
    - 'text_features'
    - 'amount IN pieces', 'circumference IN cm', 'diameter IN cm',
           'distance IN cm', 'document volume IN pages', 'film frame IN mm²',
            'height IN cm', 'image height IN cm', 'image width IN cm',
           'length IN cm', 'negative format IN mm²', 'page height IN cm',
           'page width IN cm', 'paper format IN A6', 'photo format IN mm²',
           'print volume IN pages', 'size (clothing) IN number', 'thickness IN cm',
           'time/ duration IN century', 'volume of the book IN pages',
           'volume of wood IN dm3', 'volume of writing IN autograph',
           'weight IN g', 'width IN cm', 'film frame IN mm²_height',
           'film frame IN mm²_width', 'negative format IN mm²_height',
           'negative format IN mm²_width', 'photo format IN mm²_height',
          'photo format IN mm²_width' etc 
    - 'ks', 'musealia_seria_nr', 'musealia_queue_nr', 'collection_queue_nr',
       'element_count'
    - city_municipality, country

    Not needed for:

    - before_Christ_no
    - musealia_mark__
    - 'location_building', 'location_street', 'location_country', 'location_address'

### Defining Functions

In [3]:
def col_collection(data, col_start):
    """Return the names of all columns of ``data`` that start with ``col_start``.

    The number of matching columns is also printed.
    """
    matches = [name for name in data.columns if name.startswith(col_start)]
    print(f"{len(matches)} columns found that start with {col_start}")
    return matches

In [4]:
def feature_has_value(data, col_start):
    """Flag rows in which any column starting with ``col_start`` equals 1.

    Adds (or overwrites) the column ``<col_start>_has_a_value`` in ``data``
    in place: 1 where at least one of the matching columns holds the value 1
    in that row, 0 otherwise. Nothing is returned.

    Args:
        data: DataFrame to modify in place.
        col_start: Prefix selecting the columns to inspect; matching is done
            by ``col_collection``.
    """
    flag_col = str(col_start) + '_has_a_value'
    # The flag column itself starts with col_start, so on a repeated call it
    # would be collected as an input column; exclude it so flags from an
    # earlier run cannot leak into the new result.
    cols = [c for c in col_collection(data, col_start) if c != flag_col]
    # A single vectorised pass replaces the per-cell Python loop and also
    # works when the index contains duplicate labels.
    data[flag_col] = data[cols].eq(1).any(axis=1).astype('int64')

In [5]:
def add_feature_has_value_cols(data, org_data, cols):
    """Add a 0/1 presence indicator to ``data`` for each column in ``cols``.

    For every name ``c`` in ``cols`` a column ``<c>_has_a_value`` is written
    to ``data`` in place: 1 where ``org_data[c]`` holds a value, 0 where it is
    missing (NaN/None). On assignment pandas matches the values to the rows of
    ``data`` by index label.

    Args:
        data: DataFrame that receives the indicator columns.
        org_data: DataFrame holding the original feature columns.
        cols: Iterable of column names that exist in ``org_data``.

    Returns:
        The same ``data`` object, with the indicator columns added.
    """
    for c in cols:
        # notna() is the vectorised form of the per-cell isnull check and,
        # unlike that check, also copes with list-like cell values.
        data[f'{c}_has_a_value'] = org_data[c].notna().astype('int64')
    return data

### Apply to features 

In [7]:
# Work on a copy of the one-hot encoded prep data so that `prep` itself is not modified
data = prep.copy()

In [8]:
# Copy of the fully translated combined data, i.e. the feature values as they
# were before manipulation; used as the source for the presence indicators
org_data = combined_data_fully_translated.copy()

In [9]:
# Original features that each get a '<feature>_has_a_value' indicator column
cols = [
    'location',
    'participant',
    'damages',
    'collection_additional_nr',
    'participants_role',
    'event_type',
    'musealia_additional_nr',
    'is_original',
    'class',
    'museum_abbr',
    'color',
    'collection_mark',
    'state',
    'technique',
    'material',
]

In [10]:
# Add a '<feature>_has_a_value' 0/1 column to data for every feature listed in cols
data = add_feature_has_value_cols(data, org_data, cols)

In [11]:
# Display the column names of data to check the result (notebook cell output)
data.columns

Index(['ks', 'musealia_seria_nr', 'musealia_queue_nr', 'collection_queue_nr',
       'element_count', 'type', 'source', 'material_Polish',
       'material_RC Photo Paper', 'material_a pearl',
       'material_acetate cellulose film', 'material_albumen paper',
       'material_albumin paper', 'material_aluminium', 'material_amber',
       'material_artificial fiber material', 'material_artificial leather',
       'material_artificial material', 'material_atlas',
       'material_ballpoint pen ink', 'material_birch', 'material_bone',
       'material_brass', 'material_brocade (clothing variety)',
       'material_bronze', 'material_canvas', 'material_canvas (type of cloth)',
       'material_cardboard', 'material_cast iron', 'material_celluloid',
       'material_ceramics', 'material_chalk', 'material_chamois leather',
       'material_chamotte', 'material_charcoal', 'material_chromogen emulsion',
       'material_chromogen paper', 'material_clay',
       'material_clothing variety', 'm

In [12]:
#Test 

#print(org_data['location'].isnull().sum())
#data['location_value'] = org_data['location'].apply(lambda x: 0 if pd.isnull(x) else 1)
#print(data['location_value'].value_counts())

### Columns to delete

In [13]:
# Collect every column whose name starts with 'parish' and drop them from data
parish_cols = col_collection(data, 'parish')
data = data.drop(columns= parish_cols)

3 columns found that start with parish


### Saving changes to prep

In [14]:
# Write the result back to data/prep.csv.
# NOTE(review): this overwrites the same file that was loaded into `prep` at
# the top of the notebook, so the previous version of prep.csv is lost.
data.to_csv('data/prep.csv')