In [69]:
import pandas as pd
import os
import codecs

In [16]:
l = pd.read_csv('../genius_lyrics.csv')

## Text Preprocessing

#### Using gensim simple_preprocess()

In [17]:
import gensim
from gensim import corpora
from pprint import pprint
texts = [[text for text in doc.split()] for doc in l] #get a list of individual words, creates a list of lists

In [18]:
flat_list = [item for sublist in texts for item in sublist] #convert our lists of lists to a single list

In [19]:
from gensim.utils import simple_preprocess
ll = [simple_preprocess(text, deacc=True) for text in flat_list]
# simple pre process cleaning

In [20]:
ll

[['track'], ['artist'], ['lyrics']]

#### Using spacy

In [61]:
import spacy # Generate list of tokens

string = l[1]
nlp = spacy.load('en_core_web_sm')
doc = nlp(string)
lemmas = [token.lemma_ for token in doc]


# Remove tokens that are not alphabetic
a_lemmas = [lemma for lemma in lemmas if lemma.isalpha() or lemma == '-PRON-']
# Print string after text cleaning
print(' '.join(a_lemmas))

In [39]:
#Getting rid of stop words
stopwords = spacy.lang.en.stop_words.STOP_WORDS
a_lemmas = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in stopwords]
corpus = pd.Series(' '.join(a_lemmas))

In [63]:
corpus

0    gyal dem Schillaci Sean da Paul I I I girl mil...
dtype: object

# NLP tutorial from stackoverflow
https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

In [28]:
import spacy
import pandas as pd
import itertools as it

nlp = spacy.load("en_core_web_sm")

In [29]:
test_dataset = l['lyrics'][0] #testing out the beginning of the tutorial
# test_dataset

In [30]:
test_lyrics = nlp(test_dataset)
# test_lyrics

Looking at individual sentences


In [36]:
for num, sentence in enumerate(test_lyrics.sents):
    print( 'Sentence {}:'.format(num + 1))
    print (sentence)
    print ('')

named entities (not expecting any but it can occasionally appear in songs)


In [38]:
for num, entity in enumerate(test_lyrics.ents):
    print ('Entity {}:'.format(num + 1), entity, '-', entity.label_)
    print ('')

What about part of speech tagging?

In [44]:
token_text = [token.orth_ for token in test_lyrics]
token_pos = [token.pos_ for token in test_lyrics]

pd.DataFrame(zip(token_text, token_pos),
             columns=['token_text', 'part_of_speech'])

What about text normalization, like stemming/lemmatization and shape analysis?

In [45]:
token_lemma = [token.lemma_ for token in test_lyrics]
token_shape = [token.shape_ for token in test_lyrics]

pd.DataFrame(zip(token_text, token_lemma, token_shape),
             columns=['token_text', 'token_lemma', 'token_shape'])

What about token-level entity analysis?

In [49]:
token_entity_type = [token.ent_type_ for token in test_lyrics]
token_entity_iob = [token.ent_iob_ for token in test_lyrics]

pd.DataFrame(zip(token_text, token_entity_type, token_entity_iob),
             columns = ['token_text','entity_type','inside_outside_begin'])


What about a variety of other token-level attributes, such as the relative frequency of tokens, and whether or not a token matches any of these categories?

* stopword
* punctuation
* whitespace
* represents a number
* whether or not the token is included in spaCy's default vocabulary?

In [52]:
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in test_lyrics]

df = pd.DataFrame(token_attributes,
                 columns = [
                     'text',
                     'log_probability',
                     'stop?',
                     'punctuation?',
                     'whitespace?',
                     'number?',
                     'out of vocab?']
                 )

df.loc[:,'stop?':'out of vocab?'] = (df.loc[:,'stop?':'out of vocab?'].applymap(lambda x: u'Yes' if x else u''))
df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab?
0,Where,-20.0,Yes,,,,Yes
1,is,-20.0,Yes,,,,Yes
2,the,-20.0,Yes,,,,Yes
3,moment,-20.0,,,,,Yes
4,we,-20.0,Yes,,,,Yes
...,...,...,...,...,...,...,...
505,bad,-20.0,,,,,Yes
506,day,-20.0,,,,,Yes
507,(,-20.0,,Yes,,,Yes
508,Ah,-20.0,,,,,Yes


# Phrase Modeling

In [77]:
#get the filepath

with codecs.open('../lyrics.txt', encoding='utf_8') as f:
    first_song_lyrics = f.readline()
    
print(first_song_lyrics)




In [78]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [79]:
l['lyrics'][0:3].to_csv('../testlyrics.txt', sep = ' ',index = False) #test lyrics to loop over

In [82]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_lyrics(filename):
    """
    generator function to read in lyrics from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for lyrics in f:
            yield lyrics
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse lyrics,
    lemmatize the text, and yield sentences
    """
    
    for parsed_lyrics in nlp.pipe(line_lyrics(filename),
                                  batch_size=10000):
        
        for sent in parsed_lyrics.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [84]:
%%time

if 1 == 1:
    with codecs.open('../lyrics.txt', 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus('../lyrics.txt'):
            f.write(sentence + '/n')

Wall time: 963 µs


In [85]:
unigram_sentences = LineSentence('../lyrics.txt')

In [88]:
for unigram_sentence in it.islice(unigram_sentences, 2, 3):
    print(u''.join(unigram_sentence))
    print(u'')