In [1]:
%matplotlib inline
from typing import Dict, List, Tuple
import re
import numpy as np
from matplotlib.colors import to_rgb, rgb_to_hsv
from colorspacious import cspace_converter
import seaborn as sns
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pykakasi.kakasi as kakasi

In [2]:
# colors defined by TABLEAU
from matplotlib.colors import TABLEAU_COLORS as tableau_colors

In [3]:
re_brancket = re.compile(r'\(.*\)')

def normalize_name(name: str) -> str:
    return re_brancket.sub('', name).strip().replace("'", '').lower()

In [4]:
colors = list()

In [5]:
for name, rgb in tableau_colors.items():
    colors.append({
        'name': name,
        'normalized_name': normalize_name('tableau ' + name.split(':', 1)[1]),
        'resource': 'tableau',
        'html_color': rgb.upper(),
    })

In [6]:
# colors difined by CSS4
from matplotlib.colors import CSS4_COLORS as css4_colors

In [7]:
for name, rgb in css4_colors.items():
    colors.append({
        'name': name,
        'normalized_name': normalize_name(name),
        'resource': 'css4',
        'html_color': rgb.upper(),
    })

In [8]:
# colors difined by xkcd.com (XKCD color survey)
# https://blog.xkcd.com/2010/05/03/color-survey-results/
# https://xkcd.com/color/rgb/
def get_xkcd_colors() -> Dict:
    colors = dict()
    r = requests.get('https://xkcd.com/color/rgb.txt')
    for line in r.text.splitlines():
        if line.startswith('#'):
            continue
        color = line.strip().split('\t')
        colors[color[0]] = color[1]
    return colors

In [9]:
# from seaborn.colors import xkcd_rgb
xkcd_rgb = get_xkcd_colors()

In [10]:
for name, rgb in xkcd_rgb.items():
    colors.append({
        'name': name,
        'normalized_name': normalize_name(name),
        'resource': 'xkcd',
        'html_color': rgb.upper(),
    })

In [11]:
# colors from Crayola crayons
# https://en.wikipedia.org/wiki/List_of_Crayola_crayon_colors
# https://www.crayola.com/explore-colors.aspx
def get_crayon_colors() -> Dict:
    colors = dict()
    # r = requests.get('https://www.crayola.com/explore-colors.aspx')
    # download above url as 'explore.colors.crayola.com.html'
    class dummy_requests(object):
        def get(self, path: str) -> None:
            with open(path, 'rt') as rf:
                self.text = rf.read()
                return
    
    r = dummy_requests()
    r.get('explore.colors.crayola.com.html')
    soup = BeautifulSoup(r.text, 'html.parser')
    colorlists = soup.find_all('li', class_='color-box')
    for cl in colorlists:
        cstyles = cl.get('style').strip().split(';')
        rgb = None
        for cs in cstyles:
            k, v = cs.strip().split(':', 1)
            if k.strip() == 'background-color':
                rgb = v.strip()
                break
        name = cl.a.text.strip()
        if (
            rgb is not None
        ) and (
            'w/' not in name
        ):
            colors[name] = rgb
    return colors

In [12]:
# from seaborn.colors import crayons
crayons = get_crayon_colors()

In [13]:
for name, rgb in crayons.items():
    colors.append({
        'name': name,
        'normalized_name': normalize_name(name),
        'resource': 'crayon',
        'html_color': rgb.upper(),
    })

In [14]:
# japanese traditional colors from www.colordic.org
def get_japanese_colors() -> List:
    import pykakasi.kakasi as kakasi
    
    kakasi = kakasi()
    kakasi.setMode('H', 'a')
    kakasi.setMode('C', False)
    conv = kakasi.getConverter()
    jpcolors = list()
    r = requests.get('https://www.colordic.org/w')
    soup = BeautifulSoup(r.text, 'html.parser')
    colortable = soup.find_all('table', class_='colortable')[0]
    colordata = colortable.find_all('td')
    for cd in colordata:
        cinfo = cd.a.get('title').split()
        name = cinfo[0]
        normed = conv.do(cinfo[1])
        rgb = cinfo[2].upper()
        jpcolors.append((name, normed, rgb))
    return jpcolors


jpcolors = get_japanese_colors()

In [15]:
for jpci in jpcolors:
    name, normed_name, rgb = jpci
    colors.append({
        'name': name,
        'normalized_name': normalize_name(normed_name),
        'resource': 'japanese',
        'html_color': rgb.upper(),
    })

In [16]:
colors = pd.DataFrame(colors)

In [17]:
def rgb_to_cmyk(rgb: Tuple[float]) -> Tuple[float]:
    r, g, b = rgb
    if (r, g, b) == (0.0, 0.0, 0.0):
        return 0.0, 0.0, 0.0, 1.0
    c = 1 - r
    m = 1 - g
    y = 1 - b
    min_cmy = min(c, m, y)
    c = (c - min_cmy) / (1 - min_cmy)
    m = (m - min_cmy) / (1 - min_cmy)
    y = (y - min_cmy) / (1 - min_cmy)
    k = min_cmy
    return c, m, y, k

In [18]:
def rgb_to_yuv(rgb: Tuple[float]) -> Tuple[float]:
    mat = np.array([
        [0.29900, -0.14713, 0.61500],
        [0.58700, -0.28886, -0.41869],
        [0.11400, 0.43600, -0.10001]
    ])    
    yuv = np.dot(np.array(rgb), mat)
    return tuple(yuv)

In [19]:
def create_color_features(html_colors: List[str]) -> List[Dict]:
    features = list()
    conv = cspace_converter('sRGB1', 'CAM02-UCS')
    for html_color in html_colors:
        rgb = to_rgb(html_color)
        cmyk = rgb_to_cmyk(rgb)
        hsv = rgb_to_hsv(rgb)
        yuv = rgb_to_yuv(rgb)
        if rgb[0] == rgb[1] == rgb[2]:
            temp = 'none'
        elif yuv[1] < 0 and yuv[2] > 0:
            temp = 'warm'
        elif yuv[1] > 0 and yuv[2] < 0:
            temp = 'cool'
        else:
            temp = 'middle'
        if yuv[0] > 0.5:
            bright = 1
        else:
            bright = 0
        lab = conv(rgb)
        features.append({
            'red': rgb[0],
            'green': rgb[1],
            'blue': rgb[2],
            'cyan': cmyk[0],
            'magenta': cmyk[1],
            'yellow': cmyk[2],
            'key': cmyk[3],
            'hue': hsv[0],
            'saturation': hsv[1],
            'value': hsv[2],
            'temparature': temp,
            'lightness': lab[0],
            'a_star': lab[1],
            'b_star': lab[2],
            'temparature': temp,
            'brightness': bright,
        })
    return features

In [20]:
color_features = create_color_features(colors['html_color'])

In [21]:
colors = pd.concat([
    colors, pd.DataFrame(color_features)
], axis='columns')

In [22]:
from sklearn.cluster import KMeans

def create_cluster_ids(data: pd.DataFrame, target: str, n_clusters: int) -> List[int]:
    km = KMeans(n_clusters=n_clusters, init='k-means++', random_state=14)
    raw_cluster_ids = km.fit_predict(data[target].values.reshape(-1, 1))
    cluster_idmap = dict()
    for i, x in enumerate(sorted(
        {
            c: np.mean(data[target].values[raw_cluster_ids == c])
            for c in range(n_clusters)
        }.items(), key=lambda x: x[1]
    )):
        cluster_idmap[x[0]] = i
    return [cluster_idmap[x] for x in raw_cluster_ids]

In [23]:
hue_clusters = create_cluster_ids(data=colors, target='hue', n_clusters=10)

In [24]:
lightness_clusters = create_cluster_ids(data=colors, target='lightness', n_clusters=8)

In [25]:
colors = colors.assign(hue_cluster=hue_clusters)
colors = colors.assign(lightness_cluster=lightness_clusters)

In [26]:
colors.describe()

Unnamed: 0,red,green,blue,cyan,magenta,yellow,key,hue,saturation,value,lightness,a_star,b_star,brightness,hue_cluster,lightness_cluster
count,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0,1806.0
mean,0.590268,0.540382,0.453831,0.223892,0.281797,0.385696,0.251667,0.403865,0.601274,0.748333,63.31596,1.680784,5.764116,0.566445,4.147841,3.834994
std,0.309912,0.286069,0.294682,0.322738,0.304812,0.348815,0.222175,0.314605,0.303187,0.222175,21.47955,17.60369,17.594454,0.495703,2.95727,2.011715
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.540153e-22,-32.233348,-39.158902,0.0,0.0,0.0
25%,0.372549,0.333333,0.223529,0.0,0.0,0.0,0.035294,0.117689,0.365036,0.596078,48.83808,-11.50318,-6.177443,0.0,2.0,2.0
50%,0.623529,0.533333,0.443137,0.0,0.186102,0.339105,0.215686,0.333333,0.6,0.784314,64.10547,-0.255243,9.04876,1.0,4.0,4.0
75%,0.854902,0.760784,0.678431,0.371543,0.485714,0.668377,0.403922,0.653838,0.914385,0.964706,80.73206,14.893527,19.219591,1.0,7.0,5.75
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999048,1.0,1.0,99.99872,41.372432,35.607858,1.0,9.0,7.0


In [27]:
from collections import Counter

Counter(colors.resource)

Counter({'tableau': 10,
         'css4': 148,
         'xkcd': 949,
         'crayon': 234,
         'japanese': 465})

In [28]:
Counter(colors.temparature)

Counter({'cool': 304, 'warm': 901, 'middle': 573, 'none': 28})

In [29]:
Counter(colors.brightness)

Counter({0: 783, 1: 1023})

In [30]:
Counter(colors.hue_cluster)

Counter({6: 180,
         1: 249,
         4: 138,
         9: 202,
         7: 126,
         0: 201,
         8: 136,
         2: 238,
         5: 169,
         3: 167})

In [31]:
Counter(colors.lightness_cluster)

Counter({2: 225, 5: 246, 3: 281, 4: 323, 7: 204, 6: 248, 0: 84, 1: 195})

In [32]:
colors[['name', 'hue_cluster', 'lightness_cluster']].pivot_table(
    index='lightness_cluster', values='name',
    columns='hue_cluster', aggfunc='count'
)

hue_cluster,0,1,2,3,4,5,6,7,8,9
lightness_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,13,5,1,1,5,3,11,22,18,5
1,25,14,6,6,13,20,29,35,18,29
2,28,27,21,6,5,17,38,23,26,34
3,39,50,21,23,15,17,48,11,25,32
4,42,48,31,29,29,31,18,15,26,54
5,28,51,32,30,15,21,19,9,14,27
6,14,37,44,35,40,35,12,6,8,17
7,12,17,82,37,16,25,5,5,1,4


In [33]:
duplicated_normed_name = set(
    colors.loc[
        colors.normalized_name.duplicated(),
        'normalized_name'
    ].values
)
normed_name_duplicated = np.array([
    x in duplicated_normed_name for x in colors.normalized_name.values
])

duplicated_html_color = set(
    colors.loc[
        colors.html_color.duplicated(),
        'html_color'
    ].values
)
html_color_duplicated = np.array([
    x in duplicated_html_color for x in colors.html_color.values
])

duplicated_double = set(
    tuple(x) for x in colors.loc[
        colors[['normalized_name', 'html_color']].duplicated(),
        ['normalized_name', 'html_color']
    ].values
)
double_duplicated = list()
for i, row in colors[['normalized_name', 'html_color']].iterrows():
    double_duplicated.append(np.array([
        row['normalized_name'] == n and row['html_color'] == c
        for n, c in duplicated_double
    ]).any())
double_duplicated = np.array(double_duplicated)

In [34]:
colors.loc[
    double_duplicated,
    ['name', 'normalized_name', 'resource', 'html_color']
].sort_values(by=['normalized_name', 'html_color'])

Unnamed: 0,name,normalized_name,resource,html_color
12,aqua,aqua,css4,#00FFFF
1111,aqua,aqua,crayon,#00FFFF
14,azure,azure,css4,#F0FFFF
1116,azure,azure,crayon,#F0FFFF
17,black,black,css4,#000000
1071,black,black,xkcd,#000000
1127,black,black,crayon,#000000
30,cyan,cyan,css4,#00FFFF
1084,cyan,cyan,xkcd,#00FFFF
148,teal,teal,css4,#008080


In [35]:
colors.loc[
    normed_name_duplicated,
    ['name', 'normalized_name', 'resource', 'html_color']
].sort_values(by='normalized_name')

Unnamed: 0,name,normalized_name,resource,html_color
1458,亜麻色,amairo,japanese,#D6C6AF
1554,天色,amairo,japanese,#2CA9E1
804,apricot,apricot,xkcd,#FFB16D
1110,apricot,apricot,crayon,#FDD5B1
12,aqua,aqua,css4,#00FFFF
...,...,...,...,...
1339,yellow green,yellow green,crayon,#C5E17A
1340,yellow orange,yellow orange,crayon,#FFAE42
899,yellow orange,yellow orange,xkcd,#FCB001
470,yellowgreen,yellowgreen,xkcd,#BBF90F


In [36]:
colors.loc[
    np.logical_and(
        normed_name_duplicated,
        colors.resource == 'japanese'
    ),
    ['name', 'normalized_name', 'resource', 'html_color']
].sort_values(by='normalized_name')

Unnamed: 0,name,normalized_name,resource,html_color
1458,亜麻色,amairo,japanese,#D6C6AF
1554,天色,amairo,japanese,#2CA9E1
1567,海老茶,ebicha,japanese,#773C30
1612,葡萄茶,ebicha,japanese,#6C2C2F
1347,枯茶,karacha,japanese,#8D6449
1572,唐茶,karacha,japanese,#783C1D
1367,黄唐茶,kigaracha,japanese,#B98C46
1585,黄枯茶,kigaracha,japanese,#765C47
1577,栗色,kuriiro,japanese,#762F07
1610,涅色,kuriiro,japanese,#554738


In [37]:
colors.loc[
    np.logical_and(
        normed_name_duplicated,
        colors.resource != 'japanese'
    ),
    ['name', 'normalized_name', 'resource', 'html_color']
].sort_values(by='normalized_name')

Unnamed: 0,name,normalized_name,resource,html_color
1110,apricot,apricot,crayon,#FDD5B1
804,apricot,apricot,xkcd,#FFB16D
12,aqua,aqua,css4,#00FFFF
1111,aqua,aqua,crayon,#00FFFF
1083,aqua,aqua,xkcd,#13EAC9
...,...,...,...,...
1339,yellow green,yellow green,crayon,#C5E17A
899,yellow orange,yellow orange,xkcd,#FCB001
1340,yellow orange,yellow orange,crayon,#FFAE42
470,yellowgreen,yellowgreen,xkcd,#BBF90F


In [38]:
colors.loc[
    np.logical_and(
        html_color_duplicated,
        ~double_duplicated
    ),
    ['name', 'normalized_name', 'resource', 'html_color']
].sort_values(by=['html_color', 'normalized_name'])

Unnamed: 0,name,normalized_name,resource,html_color
1128,black stars,black stars,crayon,#000000
1155,christmas green,christmas green,crayon,#006400
35,darkgreen,darkgreen,css4,#006400
1132,blue green,blue green,crayon,#0095B6
1242,metallic seaweed,metallic seaweed,crayon,#0095B6
...,...,...,...,...
156,yellow,yellow,css4,#FFFF00
1222,laser lemon,laser lemon,crayon,#FFFF66
1325,unmellow yellow,unmellow yellow,crayon,#FFFF66
1194,gel fx white,gel fx white,crayon,#FFFFFF


In [39]:
colors.shape[0], colors['html_color'].drop_duplicates().shape[0]

(1806, 1755)

In [40]:
colors.to_json('colors.jsonl', orient='records', lines=True, force_ascii=False)

In [41]:
!wc colors.jsonl

    1805    3832  780297 colors.jsonl
