In [1]:
import syllables 
import pandas as pd
import random
import numpy as np
import re
import language_tool_python
from IPython.display import display
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import timeit
from decimal import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jennifergutman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import language_tool_python

In [3]:
# Shared LanguageTool checker for US English; poem_generation.Grammar_checker
# reads this module-level name and calls tool.check() on every poem line.
tool = language_tool_python.LanguageTool('en-US')

In [4]:
# word_tokenize (called in poem_generation.coherence_checker) needs the Punkt
# sentence-tokenizer data, which this notebook otherwise never fetches, so a
# fresh environment would fail with a LookupError. Older NLTK releases look for
# 'punkt', newer ones for 'punkt_tab'; requesting both keeps either working.
nltk.download('punkt')
nltk.download('punkt_tab')
# Tagger model for nltk.pos_tag (imported in the first cell); kept last so the
# cell's displayed value is unchanged.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jennifergutman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
# Read the poem corpus and collect its words: every character that is not an
# ASCII letter or a space is stripped, the rest is split on whitespace and
# lower-cased.
temp = []
with open("Dataset_poems.txt", "r",encoding="utf8") as file:
    for raw_line in file:
        letters_only = re.sub('[^a-zA-Z ]', '', raw_line)
        temp.extend(token.lower() for token in letters_only.split())

# Only the first 10000 words are used by the rest of the notebook
words_df = temp[:10000]
# Remove typos from dataset (first occurrence of each; raises ValueError if absent)
for typo in ('o', 'n'):
    words_df.remove(typo)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Create lines of neighboring words in dataset: one 7-word window (3 words on
# each side) centred on every interior word, joined with two spaces as before.
lines = ['  '.join(words_df[i-3:i+4]) for i in range(3, len(words_df)-3)]

# Calculate co-occurrence matrix for words that aren't stopwords
cv = CountVectorizer(ngram_range=(1,1), stop_words = stopwords.words('english'))
X = cv.fit_transform(lines)
Xc = (X.T @ X)  # explicit matrix product: term-by-term co-occurrence counts
Xc.setdiag(0) # a word co-occurring with itself carries no information

# Key words. get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it.
if hasattr(cv, "get_feature_names_out"):
    names = list(cv.get_feature_names_out())
else:
    names = cv.get_feature_names()
cooccur_df = pd.DataFrame(data = Xc.toarray(), columns = names, index = names)

# Convert occurrences to probabilities (each cell divided by the grand total)
total = cooccur_df.to_numpy().sum()
cooccur_df = cooccur_df.div(total)

# Save to csv file to view
cooccur_df.to_csv('co_occur_mat.csv', sep = ',')

In [7]:
class poem_generation:
    """Genetic-algorithm poem generator.

    Poems are four lines of four words each. Instantiating the class runs the
    whole algorithm (see __init__). The methods read several module-level
    names defined in earlier cells: words_df, cooccur_df, tool, stopwords,
    word_tokenize and syllables.
    """
    # These statements run once, when the class body is executed. They change
    # pandas' global display options (None = no limit) for the whole notebook,
    # not just for this class.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)

    # Select random words for each poem
    def select_random_words(self,words_df):
        """Draw 16 entries from ``words_df`` uniformly at random, with replacement.

        Entries equal to 0 are skipped and redrawn.

        Returns:
            list with exactly 16 drawn entries, in draw order.
        """
        chosen = []
        while len(chosen) < 16:
            candidate = words_df[random.randint(0, len(words_df)-1)]
            if candidate == 0:
                continue
            chosen.append(candidate)
        return chosen
    
    # Create poem lines from array of words
    def create_poems(self,arr):
        """Turn a flat word list into four poem lines.

        Words 0-3 form line 1, words 4-7 line 2, and so on; each group is
        joined with single spaces. Missing words simply produce shorter (or
        empty) lines.

        Returns:
            list of 4 strings.
        """
        return [' '.join(arr[start:start+4]) for start in (0, 4, 8, 12)]
    
    # Get the number of syllables in each line and how closely it follows poem structure
    def store_syllables(self,arr):
        """Count syllables per poem line and score the 5-6-6-5 structure.

        Only the first 16 words are considered. A line's total is recorded
        once its fourth word has been processed, so an incomplete line keeps
        the initial 0.

        Returns:
            list of 5 values: the syllable totals of lines 1-4 (as returned by
            syllables.estimate) followed by the structure score, which is
            0.5 * (number of lines matching 5-6-6-5) / 4.
        """
        counts = [0] * 5
        running = 0
        for position, word in zip(range(16), arr):
            running += syllables.estimate(word)
            if position % 4 == 3:  # last word of a line
                counts[position // 4] = running
                running = 0

        # Evaluated based on poem 5-6-6-5 structure
        wanted = (5, 6, 6, 5)
        hits = sum(1 for got, want in zip(counts, wanted) if got == want)
        # Get values down to determined weight - 50% of fitness score
        counts[4] = 0.5 * hits / 4
        return counts
    
    # Evaluate the grammar of each line, return number of errors found in poem
    def Grammar_checker(self,text):
        """Score the grammar of a poem with the module-level LanguageTool ``tool``.

        For every line that has at least one match, the ``errorLength`` of the
        FIRST match is added to the total.

        NOTE(review): the comment above this method speaks of the "number of
        errors", but the value returned is a sum of errorLength values, one
        per offending line - confirm which metric is intended.

        Returns:
            single-element list holding the accumulated total.
        """
        accumulated = 0
        for line in text:
            found = tool.check(line)
            if len(found) > 0:
                accumulated += found[0].errorLength
        return [accumulated]
    
    # Perform n-point non-uniform crossover between two poems
    def crossover(self,p1,p2,cross):
        part1 = []
        part2 = []
        part3 = []
        child1 = []
        child2 = []
        j = 0
        k = 0
        random_number1 = sorted([random.randint(1, 15) for i in range(cross)])
        random_number2 = sorted([random.randint(1, 15) for i in range(cross)])
        for i in random_number1:
            part1.append(p1[j:i])
            j = i
        for i in random_number2:
            part2.append(p2[k:i])
            k = i
        part1.append(p1[j:])
        part2.append(p2[k:])
        part2.extend(part1)
        random.shuffle(part2)  # Add a random mutation to one of the child poems
        for i in range(len(part2)):
            part3.extend(part2[i])
        for i in range(int(len(part3)/2)):
            child1.append(part3[i])
        for i in range(int(len(part3)/2),int(len(part3))):
            child2.append(part3[i])
        child1_poem = self.create_poems(child1)
        child2_poem = self.create_poems(child2)
        return child1,child2,child1_poem,child2_poem
    
    # Get coherence score for a poem
    def coherence_checker(self, text):
        """Average co-occurrence probability over all word pairs of a poem.

        The poem lines are concatenated, tokenized with word_tokenize, and
        stop words are dropped. For every unordered pair of remaining tokens
        the probability stored in the module-level ``cooccur_df`` is summed,
        then divided by the number of pairs.

        A token that is not part of the co-occurrence vocabulary (for example
        a one-letter word, or a fragment produced by the tokenizer) contributes
        0 for its pairs instead of raising KeyError; the pair still counts
        towards the normalisation.

        Args:
            text: iterable of poem lines (strings).

        Returns:
            single-element list with the coherence value (0 when the poem has
            fewer than two significant words).
        """
        clean_text = ''.join(' ' + line for line in text)

        # Get all significant words (not-stop words). The stop-word list is
        # built once as a set rather than once per token.
        stop_words = set(stopwords.words('english'))
        significant = [word for word in word_tokenize(clean_text) if word not in stop_words]

        coherence_val = 0
        pairs = 0
        # Get sum of probabilities from co-occurrence matrix of all pairs of words in poem
        for first_pos, first in enumerate(significant):
            first_known = first in cooccur_df.columns
            for second in significant[first_pos+1:]:
                if first_known and second in cooccur_df.index:
                    # column = first word, row = second word (same cell the
                    # original chained lookup addressed)
                    coherence_val += cooccur_df.at[second, first]
                pairs += 1

        # Divide by the number of pairs to normalize across all poems
        if pairs > 0:
            coherence_val /= pairs
        return [coherence_val]

    # Determine which poems become parents to the next generation
    def tournament(self,Population,m,best):
        """Choose up to ``m`` parents for the next generation.

        Selection happens in three steps:
          1. every poem whose row position is listed in ``best`` (the user's
             picks) becomes a parent automatically;
          2. the best remaining poems by 'Fitness' fill the elite share of the
             remaining slots - half of them (rounded down) when there are user
             picks, otherwise all but one third (rounded down);
          3. the slots still open are filled by sampling at random, without
             replacement, from the poems not chosen so far.

        Fixes relative to the previous version: the result keeps the
        population's own column names (it used "Gram Rank"/"Coh Rank", which
        produced extra NaN columns), user-picked poems are excluded by row
        position so they cannot be selected twice, and the "random" share is
        actually random instead of being the next rows of the sorted frame.

        Args:
            Population: DataFrame with at least 'Fitness' and 'words_array'.
            m: number of parents wanted.
            best: row positions of user-selected poems (may be empty).

        Returns:
            (selected_population, words_array_selected): the parents as a
            DataFrame with a fresh 0..k-1 index, and their word lists.

        Raises:
            IndexError: if a position in ``best`` is outside the population.
        """
        population_size = len(Population.index)
        # Normalise positions (IndexError when out of range), drop repeats, keep order
        picked = list(dict.fromkeys(range(population_size)[b] for b in best))
        picked_lookup = set(picked)

        # Everything the user did not pick, best fitness first
        rest_positions = [pos for pos in range(population_size) if pos not in picked_lookup]
        rest = Population.iloc[rest_positions].sort_values(by=['Fitness'], ascending = False)

        except_best = max(m - len(picked), 0)
        if picked:
            n_elite = int(except_best/2)
        else:
            n_elite = except_best - int(except_best/3)
        n_elite = min(n_elite, len(rest))

        elite = rest.iloc[:n_elite]
        leftovers = rest.iloc[n_elite:]
        n_random = min(except_best - n_elite, len(leftovers))
        lucky = leftovers.iloc[random.sample(range(len(leftovers)), n_random)]

        # Skip empty pieces so concat never has to reconcile dtypes with an empty frame
        parts = [part for part in (Population.iloc[picked], elite, lucky) if len(part.index) > 0]
        if parts:
            selected_polulation = pd.concat(parts, axis=0)
        else:
            selected_polulation = Population.iloc[[]]
        selected_polulation = selected_polulation.reset_index(drop=True)
        words_array_selected = list(selected_polulation["words_array"])
        return selected_polulation,words_array_selected

    # Create initial set of poems and get their data
    def create_set_of_poems(self,n):
            """Build the initial population of ``n`` random poems with their scores.

            Each poem is made from words drawn out of the module-level
            ``words_df`` and scored with store_syllables, Grammar_checker and
            coherence_checker.

            Returns:
                DataFrame with the columns Line1-4, Syllables1-4, Syll_Count,
                GrammarError, Coherence, Gram_Rank, Coh_Rank, Fitness and
                words_array.
            """
            poem_population = []
            syllables_count = []
            grammar_error = []
            words_array_population = []
            coherence_arr = []
            for i in range(n):
                words_array = self.select_random_words(words_df)
                poem = self.create_poems(words_array)
                sum_syllables = self.store_syllables(words_array)
                poem_error = self.Grammar_checker(poem)
                coherence = self.coherence_checker(poem)
                poem_population.append(poem)
                syllables_count.append(sum_syllables)
                grammar_error.append(poem_error)
                coherence_arr.append(coherence)
                words_array_population.append(words_array)
            # One-column frame holding each poem's word list, joined back on at the end
            words_array2 = pd.DataFrame(dict(zip(["words_array"],[words_array_population])))
            # Stack text lines and numeric scores side by side into one row per poem.
            # The numeric cells are parsed back with str.strip/str.split below, so
            # they are evidently held as text after this step.
            Final = np.hstack((poem_population,syllables_count,grammar_error,coherence_arr))
            Population_data = pd.DataFrame(Final,columns=["Line1","Line2","Line3","Line4","Syllables1","Syllables2","Syllables3","Syllables4","Syll_Count","GrammarError","Coherence"])

############# Calculate Fitness of Poems
            # Ranks are divided by the population size in the fitness formula
            max_rank = len(Population_data["Line1"])
            # Each score cell is converted to a one-element list; the ranks are
            # computed on these list-valued columns BEFORE they are exploded.
            Population_data["Syll_Count"] = Population_data["Syll_Count"].apply(lambda x: [float(el) for el in x.strip("[]").split(",")])
            Population_data["GrammarError"] = Population_data["GrammarError"].apply(lambda x: [int(el) for el in x.strip("[]").split(",")])
            # ascending=False: the largest grammar error gets rank 1 (the smallest fitness share)
            Population_data["Gram_Rank"] = Population_data["GrammarError"].rank(method = 'min', ascending = False)
            Population_data["Coherence"] = Population_data["Coherence"].apply(lambda x: [float(el) for el in x.strip("[]").split(",")])
            # default ascending order: the lowest coherence gets rank 1
            Population_data["Coh_Rank"] = Population_data["Coherence"].rank(method = 'min')
            # Syll_Count must be unpacked to a number before it is added in the formula
            Population_data = Population_data.explode("Syll_Count")
            # Fitness = 0.25 * grammar rank share + 0.25 * coherence rank share + syllable score
            Population_data["Fitness"] = Population_data.apply(lambda row: 0.25*row.Gram_Rank/max_rank + 0.25*row.Coh_Rank/max_rank + row.Syll_Count, axis = 1)
            Population_data = Population_data.explode("Coherence")
            Population_data = Population_data.explode("GrammarError")
###########            
            # Attach the word lists as the last column
            Pop_data = pd.concat([Population_data,words_array2],axis=1)
            return Pop_data
    
    # Get the child poems from parent population
    def child_poems(self,words_array,cross):
        """Breed children from the selected parents and score them.

        Parents are paired off in order (0 with 1, 2 with 3, ...); an unpaired
        last parent produces no children. Every pair yields two children via
        crossover, each scored with store_syllables, Grammar_checker and
        coherence_checker.

        The previous version also had a branch guarded by
        ``len(words_array)%2 == 2``. That condition can never be true, so the
        branch (a copy of the loop body with overlapping pairs) was dead code
        and has been removed; runtime behaviour is unchanged.

        Args:
            words_array: list of parent word lists.
            cross: number of crossover points, passed on to crossover.

        Returns:
            DataFrame with the columns Line1-4, Syllables1-4, Syll_Count,
            GrammarError, Coherence, Gram_Rank, Coh_Rank, Fitness and
            words_array, indexed 0..k-1.

        Raises:
            ValueError: if fewer than two parents are supplied.
        """
        if len(words_array) < 2:
            raise ValueError("child_poems needs at least two parent word arrays")

        child_population = []
        syllables_count = []
        grammar_error = []
        words_array_child = []
        coherence_arr = []
        for i in range(0,len(words_array)-1,2):
            child1,child2,child1_poem,child2_poem = self.crossover(words_array[i],words_array[i+1],cross)
            for child_words, child_poem in ((child1, child1_poem), (child2, child2_poem)):
                child_population.append(child_poem)
                syllables_count.append(self.store_syllables(child_words))
                grammar_error.append(self.Grammar_checker(child_poem))
                coherence_arr.append(self.coherence_checker(child_poem))
                words_array_child.append(child_words)

        words_array_child = pd.DataFrame(dict(zip(["words_array"],[words_array_child])))
        # Text lines and numeric scores side by side, one row per child; the
        # numeric cells are parsed back from text below.
        Final1 = np.hstack((child_population,syllables_count,grammar_error,coherence_arr))
        child_data = pd.DataFrame(Final1,columns=["Line1","Line2","Line3","Line4","Syllables1","Syllables2","Syllables3","Syllables4","Syll_Count","GrammarError","Coherence"])

        # ---- Calculate Fitness of Child Poems (ranks are relative to the children only)
        max_rank = len(child_data["Line1"])
        child_data["Syll_Count"] = child_data["Syll_Count"].apply(lambda x: [float(el) for el in x.strip("[]").split(",")])
        child_data["GrammarError"] = child_data["GrammarError"].apply(lambda x: [int(el) for el in x.strip("[]").split(",")])
        child_data["Gram_Rank"] = child_data["GrammarError"].rank(method = 'min', ascending = False)
        child_data["Coherence"] = child_data["Coherence"].apply(lambda x: [float(el) for el in x.strip("[]").split(",")])
        child_data["Coh_Rank"] = child_data["Coherence"].rank(method = 'min')
        child_data = child_data.explode("Syll_Count")
        child_data["Fitness"] = child_data.apply(lambda row: 0.25*row.Gram_Rank/max_rank + 0.25*row.Coh_Rank/max_rank + row.Syll_Count, axis = 1)
        child_data = child_data.explode("Coherence")
        child_data = child_data.explode("GrammarError")

        child_data = pd.concat([child_data,words_array_child],axis=1)
        child_data = child_data.reset_index(drop=True)
        return child_data
    
    # Perform selection for the final population to be evaluated
    def selection_from_child_parents(self,x,child_data,selected_Pop_data):
        """Cut the combined parent + child population down to ``x`` poems.

        Rows are removed at random: about one third of the surplus is taken
        from the children and the rest from the parents. If one group is too
        small to give up its share, the other group absorbs the difference.
        When the combined population already has ``x`` poems or fewer,
        nothing is removed.

        Fixes relative to the previous version: the clamp used
        ``p = reduce - p`` instead of ``reduce - n``; a surplus of 0 or 1
        still removed two poems, shrinking the population; and rows were
        sampled through the parents' index labels rather than their positions
        in the combined frame.

        Args:
            x: size of the population to keep.
            child_data: DataFrame of child poems.
            selected_Pop_data: DataFrame of parent poems.

        Returns:
            (d, r): the surviving population with a fresh 0..k-1 index, and
            its Line1-Line4 columns only.

        Raises:
            ValueError: if ``x`` is negative.
        """
        if x < 0:
            raise ValueError(f"Cannot keep a negative number of poems ({x})")

        n_parents = len(selected_Pop_data.index)
        n_children = len(child_data.index)
        total = n_parents + n_children
        reduce = max(total - x, 0)

        drop_children = int(reduce/3)
        if drop_children == 0 and reduce >= 2:
            # keep the earlier rule that a surplus of two costs one child and one parent
            drop_children = 1
        drop_children = min(drop_children, n_children)
        drop_parents = reduce - drop_children
        if drop_parents > n_parents:
            drop_parents = n_parents
            drop_children = reduce - drop_parents

        # Parents occupy positions 0..n_parents-1 of the combined frame, children the rest
        d = pd.concat([selected_Pop_data,child_data],axis=0)
        d = d.reset_index(drop=True)
        doomed = random.sample(range(n_parents), drop_parents)
        doomed.extend(random.sample(range(n_parents, total), drop_children))
        d = d.drop(index=doomed).reset_index(drop=True)
        r = d[["Line1","Line2","Line3","Line4"]]
        return d,r
    
    def __init__(self,words_df,n,cross,epoch,z):
        """Run the genetic algorithm from start to finish.

        An initial population of ``n`` poems is created, then for ``epoch``
        generations parents are selected (tournament), bred (child_poems) and
        the combined population is cut back to ``n`` poems
        (selection_from_child_parents). After every 10th generation, except
        the last one, the current poems are displayed and the user is asked
        for the row numbers of the best poems. Per-generation statistics are
        written to 'results.csv' and the final population is displayed.

        Changes relative to the previous version: ``cross == 1`` is accepted,
        as the assertion message always stated; user picks are applied to the
        one generation they were made for instead of being reused on later
        populations where the same row numbers mean different poems; and row
        numbers outside the displayed table are ignored.

        Args:
            words_df: word list. This method does not read it; the initial
                population is built by create_set_of_poems.
            n: population size (must be > 1).
            cross: number of crossover points (must be >= 1).
            epoch: number of generations.
            z: maximum number of user-picked poems kept per prompt.
        """
        assert n>1 ,f"Population {n} cannot be less than or equal to 1"
        assert cross>=1, f"Minimum number for crossover should be 1"

        # Per-generation statistics: slot 0 is the initial population
        avg_gram = [0]*(epoch+1)
        avg_coh = [0]*(epoch+1)
        best_gram = [0]*(epoch+1)
        best_coh = [0]*(epoch+1)
        perc_four_syll = [0]*(epoch+1)
        perc_three_syll = [0]*(epoch+1)
        perc_two_syll = [0]*(epoch+1)
        perc_one_syll = [0]*(epoch+1)

        def record(slot, frame):
            # Append the scores to array for results
            avg_gram[slot] = frame["GrammarError"].mean()
            avg_coh[slot] = frame["Coherence"].mean()
            best_gram[slot] = frame["GrammarError"].min()
            best_coh[slot] = frame["Coherence"].max()
            # Syll_Count is 0.5 * matches / 4, so multiplying by 8 recovers
            # the number of lines that matched the syllable pattern
            matched_lines = frame.Syll_Count*2*4
            perc_four_syll[slot] = len(frame[matched_lines == 4])/n
            perc_three_syll[slot] = len(frame[matched_lines == 3])/n
            perc_two_syll[slot] = len(frame[matched_lines == 2])/n
            perc_one_syll[slot] = len(frame[matched_lines == 1])/n

        Selection_number = int(round(0.85*n))
        best = []

        Population_data = self.create_set_of_poems(n)
        record(0, Population_data)

        for i in range(epoch):
            Selected_Population_data,selected_polulation_word_array = self.tournament(Population_data,Selection_number,best)
            # The picks referred to rows of the population just used; they
            # would point at different poems from the next generation on.
            best = []
            child_population = self.child_poems(selected_polulation_word_array,cross)
            final_population,final_poems = self.selection_from_child_parents(n,child_population,Population_data)
            record(i+1, final_population)

            if i != epoch-1:
                if (i+1)%10 == 0: # User feedback every 10 epochs
                    print("Select best poems")
                    display(final_poems)
                    entered = list(map(int,input("\n Enter best poems numbers: ").strip().split()))
                    # Keep each valid row number once, in the order entered
                    in_range = [b for b in dict.fromkeys(entered) if 0 <= b < len(final_poems)]
                    best = in_range[:z]
                Population_data = final_population
            else:
                print("###################################################")
                print("Last Poems Standing")
                print("###################################################")
                display(final_population)

        cols = ['MeanGrammarError', 'BestGrammer', 'MeanCoherence', 'BestCoherence', '4_syll', '3_syll', '2_syll', '1_syll']
        results_df = pd.DataFrame(list(zip(avg_gram, best_gram, avg_coh, best_coh, perc_four_syll, perc_three_syll, perc_two_syll, perc_one_syll)), columns = cols)
        results_df.to_csv('results.csv', sep = ',')

In [8]:
#4 (words_df,initial_population,crossover,epochs,user_selection_poems_number):
# With epochs=10 the only "every 10 epochs" prompt would fall on the final
# epoch, where __init__ skips it, so this run never asks for user input.
poem = poem_generation(words_df,20,3,10,0)

###################################################
Last Poems Standing
###################################################


Unnamed: 0,Line1,Line2,Line3,Line4,Syllables1,Syllables2,Syllables3,Syllables4,Syll_Count,GrammarError,Coherence,Gram_Rank,Coh_Rank,Fitness,words_array
0,the go her on,with appear rain cry,any with lived only,the an gas or,4.0,5.0,7.0,4.0,0.0,16,0.0,7.0,1.0,0.125,"[the, go, her, on, with, appear, rain, cry, any, with, lived, only, the, an, gas, or]"
1,light mistily two west,lived life only the,last certain of it,last certain our ants,6.0,6.0,5.0,5.0,0.25,18,5e-06,6.0,14.0,0.5625,"[light, mistily, two, west, lived, life, only, the, last, certain, of, it, last, certain, our, ants]"
2,life only the amplified,is until toppling any,with lived his that,escapes an or on,7.0,7.0,5.0,6.0,0.0,17,1e-05,7.0,16.0,0.359375,"[life, only, the, amplified, is, until, toppling, any, with, lived, his, that, escapes, an, or, on]"
3,last certain our ants,mistily two west lived,life only the last,certain of it toppling,5.0,7.0,5.0,6.0,0.125,22,3e-06,2.0,9.0,0.296875,"[last, certain, our, ants, mistily, two, west, lived, life, only, the, last, certain, of, it, toppling]"
4,any at we together,that an or light,from lived concerned gulf,toppling borrowed time or,7.0,4.0,7.0,7.0,0.0,19,5e-06,6.0,9.0,0.234375,"[any, at, we, together, that, an, or, light, from, lived, concerned, gulf, toppling, borrowed, time, or]"
5,sliced then that with,appear rain cry life,or go stopped rain,life on that cry,5.0,5.0,5.0,4.0,0.125,18,6e-06,8.0,12.0,0.4375,"[sliced, then, that, with, appear, rain, cry, life, or, go, stopped, rain, life, on, that, cry]"
6,certain last the on,sun dust the walk,with time a by,obliquely ichor natural from,5.0,4.0,4.0,10.0,0.125,23,3e-06,4.0,6.0,0.28125,"[certain, last, the, on, sun, dust, the, walk, with, time, a, by, obliquely, ichor, natural, from]"
7,puzzlement mistily two west,lived life only the,last certain of it,toppling we our ants,8.0,6.0,5.0,5.0,0.25,27,6e-06,1.0,12.0,0.453125,"[puzzlement, mistily, two, west, lived, life, only, the, last, certain, of, it, toppling, we, our, ants]"
8,our or puzzlement light,and mistily stopped poetry,a is or our,borrowed time an the,6.0,8.0,4.0,6.0,0.0,15,6e-06,11.0,14.0,0.390625,"[our, or, puzzlement, light, and, mistily, stopped, poetry, a, is, or, our, borrowed, time, an, the]"
9,life only the amplified,is until again on,on with lived go,sliced then that toppling,7.0,6.0,5.0,6.0,0.125,14,8e-06,8.0,14.0,0.46875,"[life, only, the, amplified, is, until, again, on, on, with, lived, go, sliced, then, that, toppling]"
