In [1]:
import re
import os
import json
import networkx as nx
from collections import Counter
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import random
import re
import unidecode

## Step 1. Get rid of things in parentheses

In [86]:
names  = [line.strip() for line in open('Ikon/extracted/all_artists.dat')]
cntr_o = Counter(names)  
print('Unique name variants: ', len(names))

Unique name variants:  108212


In [90]:
df_0 = pd.DataFrame([(a,b) for a, b in dict(cntr_o).items()])
df_0 = df_0.rename(columns = {0 : 'step0', 1: 'frequency_original'})
df_0.to_csv('Ikon/cleaning_steps/original_cnt.csv', sep = '\t')

In [3]:
fout = open('Ikon/cleaning_steps/step1.csv', 'w')

names1 = []
step0_step1 = []

for n in names:
        
    c = n.replace('\t', ' ')
    if ('(' in n and ')' in n) or ('[' in n and ']' in n):
        c = re.sub(r'\([^)]*\)', '', c)  
        c = re.sub(r'\[[^)]*\]', '', c)  

    if len(c) > 0:
        names1.append(n)
        fout.write(n + '\t' + c + '\n')
        step0_step1.append((n, c))
    else:
        step0_step1.append((n, 'DELETED'))

  


fout.close()
    
print('Total names:  ', len(names1)) 
print('Unique names: ', len(set(names)))

df_s12 = pd.DataFrame(step0_step1)
df_s12 = df_s12.rename(columns = {0 : 'step0', 1: 'step1'})
df_s12.to_csv('Ikon/cleaning_steps/step0_step1.csv', sep = '\t')

Total names:   108203
Unique names:  27498


## Step 2. Splitting lists

In [4]:
names  = [line.strip().split('\t')[1] for line in open('Ikon/cleaning_steps/step1.csv')]
names1 = []
sep    = ' / | &| •• | \*\*| \* | \| \& | and | • | – | és | - '
fout   = open('Ikon/cleaning_steps/step2.csv', 'w')

step1_step2 = []


for name in names:

    name2  = name.replace('|', 'and')
    splits = re.split(sep, name2)
    
    for s in splits:
        if len(s) > 0:
            fout.write(name + '\t' + s + '\n')
            names1.append(s)
            step1_step2.append((name, s))
        else:
            names1.append('DELETED')
            step1_step2.append((name, 'DELETED ' + str(random.random())))

fout.close()
            
len(names1), len(set(names1))


df_s12 = pd.DataFrame(step1_step2)
df_s12 = df_s12.rename(columns = {0 : 'step1', 1: 'step2'})
df_s12.to_csv('Ikon/cleaning_steps/step1_step2.csv', sep = '\t')

## Step 3. Cleaning strings

In [5]:
names  = [line.strip().split('\t')[1] for line in open('Ikon/cleaning_steps/step2.csv')]
names1 = []
fout   = open('Ikon/cleaning_steps/step3.csv', 'w')
names0 = []

step2_step3 = []


to_replace = [
    '..', ',',
    '-,',
    '-', ';', '.', '/', '?', '*', '!', '+', '••',
    ';.', ':', 'Dr', 'Prof', 'művészettörténész', 'plakátművész', 'kortárs művész', 'résztvevő', 'résztvevők', 
    'képzőművész', ' művész', ' alkotó', 'filmesztéta', 'építész', 'eszmetörténész', 'országos széchényi könyvtár',
    'az ', 'Budapest', '(', ')', '[', ']',
    '"']

for name in names:
    
    name1 = name
    names0.append(name)

    for tr in to_replace:
        name1 = name1.replace(tr, '')
        
    name1 = name1.strip().rstrip()
    
    if len(name1) > 1:
        fout.write(name + '\t' + name1 + '\n')  
        names1.append(name1)
        step2_step3.append((name, name1))
    else:
        names1.append('DELETED')
        step2_step3.append((name, 'DELETED'))

        
  
fout.close()
        
print(len(names1), len(set(names1)), len(names))

df_s23 = pd.DataFrame(step2_step3)
df_s23 = df_s23.rename(columns = {0 : 'step2', 1: 'step3'})
df_s23.to_csv('Ikon/cleaning_steps/step2_step3.csv', sep = '\t')

111634 24540 111634


## Step 4. Drop numbers

In [6]:
names  = [line.strip().split('\t')[1] for line in open('Ikon/cleaning_steps/step3.csv')]
names1 = []
fout   = open('Ikon/cleaning_steps/step4.csv', 'w')

step3_step4 = []

for name in names:
    
    nums = re.findall('\d', name)
    if len(nums) == 0:
        fout.write(name + '\t' + name + '\n')  
        names1.append(name)
        step3_step4.append((name, name))
    else:
        step3_step4.append((name, 'DELETED'))
        

fout.close()
       
print(len(names1), len(set(names1)))

df_s34 = pd.DataFrame(step3_step4)
df_s34 = df_s34.rename(columns = {0 : 'step3', 1: 'step4'})
df_s34.to_csv('Ikon/cleaning_steps/step3_step4.csv', sep = '\t')

105115 24206


## Step 5. Drop words

In [7]:
top_words  = [line.strip() for line in open('web2.2-freq-sorted.top100k.nofreqs.txt' , encoding='latin2')]
top_words += ['MKE', 'Magyar Képzőmìvészeti Egyetem', 'kiállítást', 'művészek', 'könyvet', 'bemutatja', 'művészettörténész','Évzáró party', 'Fiatal Képzőművészek Stúdiója']

top_words = set(top_words)

In [13]:
names  = [line.strip().split('\t')[1] for line in open('Ikon/cleaning_steps/step4.csv')]
names1 = []
fout   = open('Ikon/cleaning_steps/step5.csv', 'w')

step4_step5 = []

for name in names:
    
    words   = name.split(' ')
    profile = []
    name2 = []
    for w in words:
        
        if len(w) > 1:
            if w[0] == w[0].lower() and w in top_words:
                pass
              
            else:
                name2.append(w)

                    

    #print(words, profile, name2)
        
    if len(name2) > 1:
        name2 = ' '.join(name2).strip().rstrip()
        fout.write(name + '\t' + name2 + '\n')  
        names1.append(name2)
        step4_step5.append((name, name2))
        
    else:
        step4_step5.append((name, 'DELETED'))


fout.close()
       
        
len(names1), len(set(names1))


df_s45 = pd.DataFrame(step4_step5)
print(len(df_s45))
df_s45 = df_s45.rename(columns = {0 : 'step4', 1: 'step5'})
df_s45.to_csv('Ikon/cleaning_steps/step4_step5.csv', sep = '\t')

105115


## Step 6. Drop rare and unique variants

 Similarity - are items & Count variants

In [14]:
### save lower-case name variants without accents

names  = [line.strip().split('\t')[1] for line in open('Ikon/cleaning_steps/step5.csv')]
names1 = []
fout   = open('Ikon/cleaning_steps/step6.csv', 'w')

step5_step6 = []

for name in names:
    
    name1 = unidecode.unidecode(name).lower()

    if len(name1) > 0:
        fout.write(name + '\t' + name1 + '\n')  
        names1.append(name1)
        step5_step6.append((name, name1))
    else:     
        step5_step6.append((name, 'DELETED'))

fout.close()    
    
print(len(names1), len(set(names1)))

df_s56 = pd.DataFrame(step5_step6)
df_s56 = df_s56.rename(columns = {0 : 'step5', 1: 'step6'})
df_s56.to_csv('Ikon/cleaning_steps/step5_step6.csv', sep = '\t')

97612 18304


In [17]:
###  frequency distriution of the variants
cntr = dict(Counter(names1))

In [18]:
###  check on the similarity values --- 80 seems like a reasonable threshold

for fn in os.listdir('Ikon/name_matching')[15:16]:  
    with open('Ikon/name_matching/' + fn) as f:
        next(f)
        for line in f:
            n1, n2, score = line.strip().split('\t')
            score = float(score)
            if score <= 80 and score > 79:
                print(n1, '---', n2)
                
                
### --> 80 similarity is different enough

palman zsuzsanna --- palmann zsuzsi
palman zsuzsanna --- papp zsuzsanna
palmann zsuzsi --- palman zsuzsanna
pap kata --- papp katalin
papp gábor --- papp tibor
papp katalin --- pap kata
papp kinga --- papp réka kinga
papp réka kinga --- papp kinga
papp tibor --- papp gábor
papp zsuzsanna --- palman zsuzsanna
part --- partum
partum --- part
pataki szandra --- pataki zora
pataki zora --- pataki szandra
paul --- paul p
paul kessel --- paul klee
paul klee --- paul kessel
paul p --- paul
paál csaba --- pléh csaba
peer krisztián --- peer krisztián  költő
peer krisztián  költő --- peer krisztián
performance --- performer
performer --- performance
perényi tamás --- petrik tamás
peternák miklós --- petrányi miklós
petrik tamás --- perényi tamás
petrika --- petruska
petrina ildikó --- pető anna ildikó
petruska --- petrika
petrányi miklós --- peternák miklós
petöcz andrás --- petőcz andrás író
pető anna ildikó --- petrina ildikó
petőcz andrás költő --- petőcz andrás mv
petőcz andrás mv --- petőcz

In [29]:
###  drop unique and rare variants

step7_stats = []

alll   = []
drop   = []
unique = []
for line in open('Ikon/similarities/COMBINED_scores.dat'):
    name, scores = line.strip().split('\t\t')
    scores = [int(s) for s in scores.split('\t')]
    
    

    if name in cntr:
        step7_stats.append((name, max(scores), cntr[name]))
        alll.append(name)
        if max(scores) < 80 and cntr[name] < 3:
            drop.append(name)
        else:
            unique.append(name)

    


print(len(unique), len(drop))

df_s7 = pd.DataFrame(step7_stats)
print(len(df_s7))
df_s7 = df_s7.rename(columns = {0 : 'step6', 1 : 'version 6 - top sim', 2 : 'version 6 - cnt'})
df_s7.to_csv('Ikon/cleaning_steps/step7_stats.csv', sep = '\t')

6292 11857
18149


## Step 7. Similarity graphs

In [52]:
### counter
cntr = dict(Counter(names1))
len(names1)

97612

In [78]:
### read the similarity values between all pairs of name variants
pairs_similarities = []
nodes_all          = set()

files = os.listdir('Ikon/name_matching')
sims  = []

for fn in files:  
    with open('Ikon/name_matching/' + fn) as f:
        next(f)
        for line in f:
            n1, n2, score = line.strip().split('\t')
            pairs_similarities.append((n1, n2, float(score)))  
            sims.append(float(score))
            nodes_all.add(n1)
            nodes_all.add(n2)

In [54]:
### build the similarity w a given threshold
G = nx.Graph()

similarity_limit = 79

for n1, n2, s in pairs_similarities:
    if s > similarity_limit:
        G.add_edge(n1, n2, weight = s)
        
len(G.nodes), len(G.edges)

(5900, 4410)

In [79]:
### hasonlosagi komponensek listazasa

nodes_added    = set()
components     = sorted(nx.connected_components(G), key=len, reverse=True)
names_clusters = []

for ind, c in enumerate(components):  

    for cc in list(c):
        names_clusters.append((ind, cc))
        nodes_added.add(cc)


nodes_missed = list(nodes_all.difference(nodes_added))

print(len(nodes_missed))

for jnd, nm in enumerate(nodes_missed):
    names_clusters.append((jnd+ind, nm))
        

df_comp = pd.DataFrame(names_clusters)
df_comp = df_comp.rename(columns = {1 : 'step6', 0: 'similarity_component_id'})
df_comp.to_csv('Ikon/cleaning_steps/components.csv', sep = '\t')
print(len(df_comp))
df_comp.head()

16719
22619


Unnamed: 0,similarity_component_id,step6
0,0,szabó ákos
1,0,szabó netta
2,0,szabics
3,0,szabó iván
4,0,szabó tamara


## Step x --- output:
- grafkomponens szerint csoprtositva
- name variant cnt, verzio, index oszlopok