# Data Cleaning

> In this notebook we will be cleaning the data to prepare it for EDA and Modeling. We will check for null values, outliers, errors, and other attributes that would degrade EDA and Modeling. We will engineer features to get further insight into the data. We will be using functions such as `to_date`, `style_enumerator`, and `cleaned_data`. This process prepares the dataframe for future notebooks.

---

## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from PIL import Image
import langid
import requests
from io import BytesIO
import googletrans
import copy
import urllib
import os
import io
import sys
import re

# Module-level sentiment analyzer; cleaned_data calls sa.polarity_scores on each title.
# NOTE(review): SentimentIntensityAnalyzer is not imported in any visible cell, so this
# line raises NameError as written. It presumably comes from a VADER package
# (e.g. vaderSentiment or nltk.sentiment.vader) -- confirm and add the import.
sa = SentimentIntensityAnalyzer()

if sys.version_info < (3, 0):
    from urllib2 import urlopen
else:
    from urllib.request import urlopen

from googletrans import Translator
from colorthief import ColorThief

pd.set_option('max_colwidth', 100)

---

## Reading in Data

In [None]:
# Load the raw dataset from ../data/raw_data.csv; the bare `df` on the last line
# displays it as the cell output.
df = pd.read_csv('../data/raw_data.csv')
df

## Cleaning Data Functions

In [None]:
# Exact-match lookup from the non-numeric labels found in the 'Date' column to the
# year string that stands in for them. A single century maps to its first year
# ('XX cent.' -> '1900'); a span of two centuries maps to the first year of the
# earlier one, except 'XVIII-XIX cent.' which maps to '1800'. The truncated years
# '47' and '48' are expanded to '1447' and '1448'.
_DATE_REPLACEMENTS = {
    'VII-VIII cent.': '600',
    'VIII cent.': '700',
    'X cent.': '900',
    'XI cent.': '1000',
    'XI-XII cent.': '1000',
    'XII-XIII cent.': '1100',
    'XIII-XIV cent.': '1200',
    'XIV cent.': '1300',
    'XIV-XV cent.': '1300',
    'XV cent.': '1400',
    'XV-XVI cent.': '1400',
    'XVI cent.': '1500',
    'XVI-XVII cent.': '1500',
    'XVII cent.': '1600',
    'XVII-XVIII cent.': '1600',
    'XVIII cent.': '1700',
    'XVIII-XIX cent.': '1800',
    'XIX cent.': '1800',
    'XIX-XX cent.': '1800',
    'XX cent.': '1900',
    'XX-XXI cent.': '1900',
    'XXI cent.': '2000',
    '47': '1447',
    '48': '1448',
}


def to_date(dataframe):
    """ to_date converts the 'Date' column of the dataframe to integer years.

        Century labels written in roman numerals (and the truncated years '47'
        and '48') are replaced through the _DATE_REPLACEMENTS table, year ranges
        such as '1885-1890' are cut down to the text before the first hyphen,
        and the column is then cast to int.

        args:
            dataframe : dataframe with a 'Date' column of strings; the intended
            dataframe of use is the wikiart_scraped.csv. The column is
            overwritten in place.

        returns:
            the same dataframe object, with 'Date' holding integers

        raises:
            ValueError if a value is still not an integer literal after the
            replacements (for example a label missing from the table)
    """
    # One pass over the column replaces every known label; only whole-cell
    # matches are replaced, so ordinary years are left untouched.
    dates = dataframe['Date'].replace(_DATE_REPLACEMENTS)

    # Keep only the text before the first hyphen, i.e. the start of a year range.
    dates = dates.str.split('-', n=1).str[0]

    # This converts all dates to numeric values
    dataframe['Date'] = dates.astype(int)

    return dataframe

In [None]:
def lang_column(dataframe, classify=None):
    """ lang_column adds a 'Language' column with the detected language of each title.

        args:
            dataframe : dataframe with an 'Artwork' column of titles; the new
            column is written onto this dataframe in place
            classify : optional callable that takes one title and returns a
            sequence whose first item is the language code; defaults to
            langid.classify

        returns:
            the same dataframe object, now carrying the 'Language' column
    """
    if classify is None:
        classify = langid.classify

    # Map over the column of the dataframe that was passed in (not the notebook
    # global `df`) so the result is aligned by index label; this also works when
    # the index has gaps, e.g. after rows were filtered out.
    dataframe['Language'] = dataframe['Artwork'].map(lambda title: classify(title)[0])
    return dataframe

In [None]:
def trans_column(dataframe):
    """ trans_column adds a 'translated' column with English versions of the titles.

        The 'Artwork' column is first cast to str. Every row whose 'Language' is
        not 'en' has its title passed to Translator.translate (src='auto',
        dest='en') and the `text` attribute of the result is stored; rows
        tagged 'en' get no value in the new column.

        args:
            dataframe : dataframe with 'Artwork' and 'Language' columns; it is
            modified in place

        returns:
            the same dataframe object
    """
    engine = Translator()
    dataframe['Artwork'] = dataframe['Artwork'].astype(str)

    # Only the titles that were not classified as English are sent for translation.
    not_english = dataframe.Language != 'en'
    foreign_titles = dataframe.loc[not_english, 'Artwork']

    responses = foreign_titles.apply(engine.translate, src='auto', dest='en')
    # Assignment aligns on the index, so rows that were not translated stay empty.
    dataframe['translated'] = responses.apply(lambda response: response.text)
    return dataframe

In [None]:
def _print_summary(title, dataframe):
    """Print columns, shape, unique counts, null counts and dtypes under a title."""
    sections = (
        ('Columns', dataframe.columns),
        ('Dataframe Size', dataframe.shape),
        ('Dataframe Unique Values', dataframe.nunique()),
        ('Null Values in Each Column', dataframe.isna().sum()),
        ('Data Types in Each Column', dataframe.dtypes),
    )
    print(title)
    print('='*20)
    for position, (heading, value) in enumerate(sections):
        if position:
            print('_'*20)
        print(heading)
        print(value)
    print('='*20)


def cleaned_data(dataframe, min_style_count=500):
    """
    cleaned_data is a function that runs the whole cleaning pipeline and prints a
    summary of the dataframe before and after.

    Steps, in order:
        1. drop rows duplicated on ('Artwork', 'Artist', 'Date'), keeping the last
        2. keep only rows whose 'Style' occurs more than min_style_count times
        3. convert 'Date' to integers with to_date
        4. add 'Language' with lang_column and 'translated' with trans_column
        5. add 'v_sent', the 'compound' polarity score of each 'Artwork' title
           from the module-level analyzer `sa`

    args:
        dataframe: the dataframe that the user wants to clean; the steps run on
            a copy of the filtered rows
        min_style_count: a style is kept only when its row count is strictly
            greater than this value (default 500)

    returns:
        the cleaned dataframe with a fresh 0..n-1 index
    """
    _print_summary('Before Cleaning', dataframe)

    #This drops duplicates of any work of art to decrease chances of sketches
    deduped = dataframe.drop_duplicates(subset=['Artwork', 'Artist', 'Date'],
                                        keep='last')

    #This drops any values in the style columns that have a count of
    #min_style_count or less, so the model can properly train on the style
    style_counts = deduped['Style'].value_counts()
    frequent_styles = style_counts[style_counts > min_style_count].index

    #The helpers below write columns in place, so they get an explicit copy with
    #a fresh index rather than a filtered view of the caller's dataframe
    dataframe = deduped[deduped['Style'].isin(frequent_styles)].copy()
    dataframe = dataframe.reset_index(drop=True)

    #Changing the dates to ints
    dataframe = to_date(dataframe)

    #Adding a language column based on the language used in the Artwork column
    dataframe = lang_column(dataframe)

    #Adding a translated title column based on Artwork column
    dataframe = trans_column(dataframe)

    #Adding sentiment analysis column
    dataframe['v_sent'] = dataframe['Artwork'].apply(
        lambda title: sa.polarity_scores(title)['compound'])

    _print_summary('After Cleaning', dataframe)

    #returning the dataframe with an index reset
    return dataframe.reset_index(drop=True)

In [None]:
# Run the cleaning pipeline on the raw dataframe; the bare `clean_df` on the last
# line displays the result as the cell output.
clean_df = cleaned_data(df)
clean_df

# Creating Art Image Folders

> This for loop creates one folder per style found in `clean_df`, building the directory structure that will later be used for modeling.

In [None]:
# Create one sub-folder per art style under root_path. makedirs with exist_ok=True
# also creates root_path itself when it is missing and lets the cell be re-run
# without failing on folders that already exist.
root_path = '../images/styles'
list_styles = list(clean_df['Style'].unique())
for items in list_styles:
    path = os.path.join(root_path, items)
    os.makedirs(path, exist_ok=True)

# Putting Images in Folders

> The following for loop iterates through the `clean_df` dataframe, reads the image link, converts the image to RGB, resizes it, then saves it in the folder of its corresponding `Style`, named after its index.

In [None]:
# Download every artwork image, convert it to a 250x250 RGB JPEG and store it in
# the folder of its style, named after its row index.
for i, row in clean_df.iterrows():
    filepath = f'../images/styles/{row["Style"]}/image_{i}.jpg'
    try:
        # The timeout keeps one stalled connection from hanging the whole loop;
        # raise_for_status stops an HTTP error page from being parsed as an image.
        response = requests.get(row['Link'], timeout=30)
        response.raise_for_status()

        img = Image.open(BytesIO(response.content))
        img = img.convert("RGB")
        img = img.resize((250, 250))
        img.save(filepath)
    except (requests.RequestException, OSError) as err:
        # A single dead link or unreadable file should not abort the remaining
        # downloads: report the row and move on. (Pillow signals unreadable image
        # data and failed saves with OSError subclasses.)
        print(f'Skipping image {i} ({row["Link"]}): {err}')

---

## Grabbing dominant colors from filepaths

In [None]:
from __future__ import print_function
import binascii
import struct
from PIL import Image
import scipy
import scipy.misc
import scipy.cluster
from scipy.spatial import KDTree
from webcolors import CSS3_HEX_TO_NAMES, hex_to_rgb

def dom_color(image):
    """ dom_color finds the dominant colour of one image.

        The image is opened from '../images/' + image, converted to RGB, shrunk
        to 150x150 and its pixels are clustered with k-means into NUM_CLUSTERS
        groups. The centroid with the most pixels assigned to it is taken as
        the dominant colour.

        args:
            image : path of the image file, appended to '../images/'

        returns:
            a tuple (colour, name): colour is the centroid as a hex string
            without a leading '#', name is the CSS3 colour name whose RGB value
            is nearest to the centroid

        note:
            scipy's k-means picks its starting centroids at random, so two calls
            on the same image can return slightly different colours.
    """
    NUM_CLUSTERS = 5

    # reading image; the context manager closes the file once the pixels are loaded
    with Image.open('../images/' + image, mode='r') as source:
        # Forcing RGB guarantees three channels, so greyscale or RGBA files do
        # not break the reshape or the 3-d colour lookup further down.
        im = source.convert('RGB').resize((150, 150))   # resize is optional, to reduce time
    ar = np.asarray(im)
    # flatten (height, width, channels) into one row per pixel
    ar = ar.reshape(-1, ar.shape[2]).astype(float)

    # finding clusters
    codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)

    vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
    # bincount tallies each centroid index directly; a histogram with evenly
    # spaced bins can pair counts with the wrong centroid when the first or last
    # centroid has no pixels assigned.
    counts = np.bincount(vecs, minlength=len(codes))    # count occurrences

    index_max = np.argmax(counts)                       # find most frequent
    peak = codes[index_max]
    colour = binascii.hexlify(bytearray(int(c) for c in peak)).decode('ascii')

    peak = tuple(peak)

    # returns top color
    def convert_rgb_to_names(peak):
        """Return the CSS3 colour name whose RGB value is nearest to peak."""
        # parallel lists: names[k] is the name of the colour at rgb_values[k]
        names = []
        rgb_values = []
        for color_hex, color_name in CSS3_HEX_TO_NAMES.items():
            names.append(color_name)
            rgb_values.append(hex_to_rgb(color_hex))

        kdt_db = KDTree(rgb_values)
        distance, index = kdt_db.query(peak)

        return names[index]

    return colour, convert_rgb_to_names(peak)

# source: https://medium.com/codex/rgb-to-color-names-in-python-the-robust-way-ec4a9d97a01f


# Every entry of the image directory, first as a bare name and then with the
# directory prefix attached.
list_imgs = os.listdir('../images/')
new_list_imgs = ["../images/" + entry for entry in list_imgs]

# Cut the path list into consecutive batches of at most 4500 paths each.
chunks = [
    new_list_imgs[start:start + 4500]
    for start in range(0, len(new_list_imgs), 4500)
]

In [None]:
# Build the colour frame of each of the 20 image chunks and stack them.
# NOTE(review): iter_colors is not defined in any visible cell (only iter_files
# is). It is assumed to take a chunk number and return a dataframe that has an
# 'Image' column -- confirm before running.
chunk_frames = [iter_colors(chunk_number) for chunk_number in range(20)]

chunky_df = pd.concat(chunk_frames)

chunky_df = chunky_df.set_index('Image').sort_index().dropna()

# Assign the join back to clean_df so the colour columns are actually kept; a
# bare `clean_df - clean_df.join(...)` subtracts the frames and discards the result.
clean_df = clean_df.join(chunky_df, how='left')

In [None]:
# Empty columns that iter_files fills in row by row.
clean_df['hex'] = None
clean_df['color'] = None

# iterate over files in that directory
def iter_files(chunks):
    """ iter_files fills the 'hex' and 'color' columns of the global clean_df.

        For every path in the batch, the digits of the file name are read as the
        clean_df row label, dom_color is run once on the path, and its hex string
        and colour name are written to that row. Paths whose file name has no
        digits, or whose number is not a label of clean_df, are reported and
        skipped.

        args:
            chunks : iterable of image paths

        returns:
            the global clean_df
    """
    for images in chunks:
        print(images)
        # Read the row label from the file name only, so digits that happen to
        # appear in a folder name cannot end up in the label.
        digits = ''.join(c for c in os.path.basename(images) if c.isdigit())
        if not digits or int(digits) not in clean_df.index:
            print('no matching row in clean_df, skipping')
            continue
        num = int(digits)

        # applies dominant color function once: k-means starts from random
        # centroids, so separate calls could return a hex and a name that
        # describe different colours
        colour_hex, colour_name = dom_color(images)

        # assigns dominant color to the new columns based on index; .at writes
        # straight into clean_df instead of into a temporary column copy
        clean_df.at[num, 'hex'] = colour_hex
        clean_df.at[num, 'color'] = colour_name
        print((colour_hex, colour_name))
        print('---' *5)
    return clean_df

---

## Saving the Cleaned Dataframe

> We save the data as `clean_art.csv` so that it may be used later for EDA for visualizations of the available data in `03_EDA.ipynb`

In [None]:
# Write the cleaned dataframe to ../data/clean_art.csv, leaving the index out of the file.
clean_df.to_csv('../data/clean_art.csv', index = False)