# tileSetter Testing & Process



In [1]:
import string
import pandas as pd
import datetime as dt

## Word Segment Loading &  Setup

In [14]:
#--! Import New Segments ------------------------------------------ #
def new_segments(filepath,old_df=''):
    # Imports a csv file containing a list of word/name new_segments
    # Integrates segments into existing segment data

    segments = pd.read_csv(filepath)
    print('New word segments loaded.')
    clean_seg = segment_cleaner(segments)
    print('New word segments cleaned + determined length + cvc format')
    labeled = label_pres(clean_seg)
    print('New prefixes labeled.')

    input1 = input('Would you like to merge new segments with existing segments? y/n\n')
    if input1 == 'y':
        output = merge_segments(labeled,old_df)
        print('Merge accepted. Returning merged segment dataframe.')
    elif input1 =='n':
        print('Merge declined. Returning new segments only.')
        output = labeled
    else:
        print('Invalid input. Defaulting to return new segments only.')
    return output

#--! Merge new segments into old segments -------------------------- #
def merge_segments(new,old):
    if old == '':
        error('No existing word segment data found.')
    else:
        merged = pd.concat([new,old],ignore_index=True).sort_values(by='bit')
        merged = merged.drop_duplicates().reset_index(drop=True)

        input = ('Word segment data merged. Would you like to save? y/n')
        if input == 'y':
            fname = 'WS_' + str(dt.date.today) + '.csv'
            merged.to_csv('./data/word-segments/' + fname,index=False)
            merged.to_csv('./data/word-segments.csv',index=False)

    return merged

#--! Word Segment Cleaner ------------------------------------------- #
def segment_cleaner(df):
    # Takes in a series of word/name segments
    # Returns dataframe with segments + length + cvc format

    df = df.drop_duplicates().sort_values() # Drop duplicates
    df = df[~df.str.contains(r'[_0-9]')].reset_index(drop=True) # Remove non-letters

    Ls = df.str.len() # get length

    # figure out cvc format of segments
    cvc = df.copy()
    cvc = cvc.str.replace('[^aeiouy]','c',regex=True) #label consonants
    cvc = cvc.str.replace('[aeiouy]','v',regex=True) # label vowels

    # Construct dataframe
    final = pd.DataFrame({})
    final.insert(loc=0,column='bit',value=df)
    final.insert(loc=1,column='L',value=Ls)
    final.insert(loc=2,column='cvc',value=cvc)
    return final

#--! Label Segments As Prefixes ------------------------------------- #
def label_pres(filepath, bit_df, abcs=string.ascii_lowercase):

    stucks = pd.read_csv(filepath)
    stucks = stucks.replace('\_','~',regex=True)
    stucks = stucks[~stucks.prefix.str.startswith('~')].reset_index(drop=True)
    markers = stucks.marker.str.lower()
    
    bit_df['prefix'] = False
    N_tagged = 1
    for n,i in enumerate(markers):
        if n>0 :
            if markers[n][0] != markers[n-1][0]:
                print(markers[n-1][0].upper()+' markers tagged: '+ str(N_tagged))
                N_tagged = 1
            elif n == len(markers)-1:
                N_tagged += 1
                print(markers[n-1][0].upper()+' markers tagged: '+ str(N_tagged))
            else:
                N_tagged += 1

        sliced = bit_df[bit_df.bit.str.startswith(i[0:3])].copy()
        # Get regex filter string
        F = get_regex(i)
        
        # Tag prefixes that match regex as 'True'
        sliced.loc[sliced.bit.str.match(F),'prefix'] = True
        
        bit_df.update(sliced)
    print('Active prefixes updated!')
    return bit_df

#--! Regex Generator for Prefix Filtering --------------------------- #
def get_regex(marker):
    # Takes a "marker" string and generates regex
    # To select all prefixes in that group that
    # come alphabetically after that marker

    m = marker
    # go through marker backwards and omit first 2 letters of marker
    # e.g., dragon -> noga
    for i,L6 in enumerate(m[::-1][:-2]) :
        L5 = m[-i-2]

        if i == 0:
            if m[-1] == '~': # last character is underscore -> do not include last letter in regex selection
                R6 =''
                L5 = abcs[abcs.find(L5)+1]
            else:
                R0 = abcs[abcs.find(L6)]
                R6 = '['+ R0 + '-z]' # range of last letter
            F = L5 + R6

        else:
            if L6 == 'z':
                R0 = abcs[abcs.find(L6)] # avoid bounds errors when L6 = z
            else:
                R0 = abcs[abcs.find(L6)+1] # range starts at letter after L6
            R = '['+ R0 + '-z]'

            F = L5 +'('+R+'|'+F+')' # Add to existing regex


    F = m[0]+F+'.*' # add start indicator and .* to select full segment
    return F

In [100]:
#--! Name Generator class ----------------------------------------- #
# Example formats:
# 
class nameGen:
    def __init__(self,name_format,N=300,spread='equal'):
        # takes in 'n_format' and creates a name constructor matrix
        name_format = name_format.lower()
        seglist = name_format.split(',')
        seg_df = pd.DataFrame({}, columns=['bit','Lmin','Lmax','range'])
        
        for seg in seglist:
            if seg.startswith(':')==True:  # segment types denoted by preceeding :
                if 'bit' in seg:
                    
                    if len(seg) > 4:
                        Lmin = int(seg[4])
                        Lmax = int(seg[5])   
                    else:
                        Lmin = 3
                        Lmax = 5
                    seg_df.loc[len(seg_df)] = ['bit',Lmin,Lmax,('a','z')] 
                    
                elif 'x' in seg or 'v' in seg or 'c' in seg:
                    for l in seg[1:]:
                        if l == 'x':
                            seg_df.loc[len(seg_df)] = ['letter',1,1,('a','z')]
                        elif l =='v':
                            seg_df.loc[len(seg_df)] = ['vowel',1,1,('a','z')]
                        elif l == 'c':
                            seg_df.loc[len(seg_df)] = ['consonant',1,1,('a','z')]
            else:
                seg_df.loc[len(seg_df)] = [seg,len(seg),len(seg),('a','z')]
        
        self.template = seg_df
        self.Lmin = seg_df.Lmin.sum()
        self.Lmax = seg_df.Lmax.sum()
        self.N = N
        self.spread = spread

    def stats(self):
        print(self.template)
        print('------------------------------')
        print('Generator:    ' + self.nf)
        print('Generates:    '+ str(self.N)+ ' names')
        print('Distribution: ' + self.spread)
        print('Name lengths: ' + str(self.Lmin) + 'L-' + str(self.Lmax) + 'L')
        
ns = nameGen('\':bit36,:xcv\'')
ns.stats()
                
        

         bit Lmin Lmax   range
0    ':bit36    7    7  (a, z)
1     letter    1    1  (a, z)
2  consonant    1    1  (a, z)
3      vowel    1    1  (a, z)
------------------------------
Generator:    ':bit36,:xcv'
Generates:    300 names
Distribution: equal
Name lengths: 10L-10L


In [85]:
def refresh(name_generator,stuck=True):
    ng = name_generator
    
    

Unnamed: 0,bit,L,cvc,prefix
654,aadhav,6,vvccvc,False
656,aadhya,6,vvccvv,False
677,aadith,6,vvcvcc,False
759,aadvik,6,vvccvc,False
1353,aahana,6,vvcvcv,False
...,...,...,...,...
1085223,zymere,6,cvcvcv,False
1085238,zymier,6,cvcvvc,False
1085629,zyonna,6,cvvccv,False
1085953,zyriah,6,cvcvvc,False


In [None]:
w_s = pd.read_csv('./data/word-segments.csv')