## Imports

In [None]:
import requests
from bs4 import BeautifulSoup 
import re
import pandas as pd 
import os 
import time
import datetime
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize
import csv
from pyquery import PyQuery as pq
from lxml import etree
import requests

## Web Scraping MetroLyrics

In [None]:
# One single-entry dict per artist index page (pages 1 through 10), keyed by
# the show name and pointing at that page's URL.
sesamesongs = [
    {'Sesame Street': 'http://www.metrolyrics.com/sesame-street-alpage-' + str(page) + '.html'}
    for page in range(1, 11)
]

In [None]:
# Crawl each index page, follow every link whose title mentions "Sesame",
# and collect one record per song: show name, lyric text and song title.
Allsongs = []

# Seconds to wait for a response; without a timeout requests.get can block forever.
REQUEST_TIMEOUT = 30

for page in sesamesongs:
    for show, page_url in page.items():

        print(page_url) ## To keep track of progress

        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # Skip index pages that failed to load rather than parsing an error page.
            continue

        titles = pq(response.content)('.title')

        for title in titles:
            # .get() tolerates elements that lack a title or href attribute,
            # where direct indexing would raise KeyError.
            header = title.attrib.get('title')
            song_url = title.attrib.get('href')
            if not header or not song_url or 'Sesame' not in header:
                continue

            response_title = requests.get(song_url, timeout=REQUEST_TIMEOUT)
            if response_title.ok:
                lyrics = pq(response_title.content)('.verse').text()

                print(header) ## To keep track of progress

                Allsongs.append({'Show': show, 'Lyrics': lyrics, 'Song Name': header})

            # Pause between song requests so the site is not hammered.
            time.sleep(4.5)
        

In [None]:
# Allsongs is a plain list of dicts and has no to_csv method; build a
# DataFrame from the records first, then write it out.
pd.DataFrame(Allsongs).to_csv('SesameSongs.csv')

## NLP - Analysis

### Initial Cleaning

In [None]:
# Load the scraped songs and discard the saved index column. The column is
# named explicitly via `columns=` because the positional axis argument of
# DataFrame.drop is no longer accepted by current pandas.
SesameSongs = pd.read_csv('SesameSongs.csv').drop(columns='Unnamed: 0')

In [None]:
# Display the raw scraped table.
SesameSongs

In [None]:
# Trim surrounding whitespace in every column, then turn cells that are
# left empty into NaN.
stripped_songs = SesameSongs.apply(lambda column: column.str.strip())
SesameSongs_clean = stripped_songs.replace('', np.nan)

In [None]:
# Drop rows with missing values, then delete (in this order):
#   1. anything enclosed in parentheses,
#   2. "#word:" style labels,
#   3. any remaining "word:" labels.
SesameSongs_clean = SesameSongs_clean.dropna()
for label_pattern in (r'\([^)]*\)', r'\#\w*:', r'\w*:'):
    SesameSongs_clean = SesameSongs_clean.replace(label_pattern, '', regex=True)

In [None]:
# Spot-check one cleaned lyric (the entry at index 1).
SesameSongs_clean['Lyrics'][1]

### New Column: Word count

In [None]:
# Split each lyric on whitespace and store how many pieces it yields.
lyric_tokens = SesameSongs_clean['Lyrics'].str.split()
SesameSongs_clean['Word Count'] = lyric_tokens.str.len()

### New Column: Non-English words removed

In [None]:
import nltk

# Vocabulary from the NLTK "words" corpus, as a set for fast lookups.
words = set(nltk.corpus.words.words())

# For every song keep the tokens that are either non-alphabetic (numbers,
# punctuation) or whose lower-cased form is in the vocabulary, re-joined
# with single spaces.
OnlyEng = [
    " ".join(
        token
        for token in nltk.wordpunct_tokenize(song)
        if not token.isalpha() or token.lower() in words
    )
    for song in SesameSongs_clean['Lyrics']
]
        

In [None]:
# Store the vocabulary-filtered lyrics next to the originals.
SesameSongs_clean['Only English Words'] = OnlyEng

### New Column: Number of unique English words used

In [None]:
# Number of distinct tokens in each song's filtered lyrics.
Uniquecount = [
    len(set(nltk.wordpunct_tokenize(song)))
    for song in SesameSongs_clean['Only English Words']
]

SesameSongs_clean['Unique Eng Words'] = Uniquecount

### New Column: Proportion of total words that are unique.

In [None]:
# Share of each song's tokens that are distinct, rounded to 3 decimals.
Uniqueprop = []

for song in SesameSongs_clean['Only English Words']:
    # Tokenize once and reuse the result for both counts.
    tokens = nltk.wordpunct_tokenize(song)
    if tokens:
        prop = round(len(set(tokens)) / len(tokens), 3)
    else:
        # A song with no tokens left would otherwise raise ZeroDivisionError.
        prop = 0.0

    Uniqueprop.append(prop)

In [None]:
# Attach the unique-token proportions as a new column.
SesameSongs_clean['Prop of unique words'] = Uniqueprop

### New Column: Count of how many numbers are mentioned

In [None]:
# Digits 1-20 plus the spelled-out numbers one to ten.
numlist = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','one','two','three','four','five','six','seven','eight','nine','ten']

# Set copy for fast membership tests; numlist itself stays a list because
# later cells reference it.
number_tokens = set(numlist)

Numcount = []
counter = 0

for song in SesameSongs_clean['Lyrics']:
    # Compare lower-cased tokens so capitalised number words such as "One"
    # are counted too (the list only holds lower-case spellings).
    counter = sum(
        1
        for w in nltk.wordpunct_tokenize(song)
        if w.lower() in number_tokens
    )

    Numcount.append(counter)


SesameSongs_clean['How many numbers?'] = Numcount

### New Column: Proportion of song that starts with the letter that appears most often

In [None]:
from collections import Counter
from string import ascii_lowercase

# Tokens excluded from the first-letter tally: numerals 1-20 and common
# punctuation marks.
nonnumeric = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','.',',','!','?','...','#','"',"'",')','-']

listodic = []
letterprop = []

d = {}

for song in SesameSongs_clean['Lyrics']:
    # Tokenize once; the same token list is the denominator below.
    x = nltk.wordpunct_tokenize(song)

    # Tally the first character of every lower-cased token that is not excluded.
    d = Counter(
        word.lower()[0]
        for word in x
        if word.lower() not in nonnumeric
    )

    # default=0 covers songs where no token survives the filter
    # (max() on an empty sequence raises ValueError).
    most = max(d.values(), default=0)

    # Guard the division for songs that produce no tokens at all.
    letterprop.append(round(most / len(x), 3) if x else 0.0)


In [None]:
# Attach the most-common-first-letter proportions as a new column.
SesameSongs_clean['Letter Proportion'] = letterprop

### New Column: Proportion of pronouns that are interactive ("you", "your", "we", "our")

In [None]:
# Pronouns that address or include the listener, versus all other pronouns.
interactive = ['you','your',"y'all",'you all','our','we']
rest = ['I','he','she','their','they','them','hers','his','theirs']

# Lower-cased lookup sets: tokens are compared case-insensitively so that
# sentence-initial "You"/"We" and lower-case "i" are not missed.
# NOTE(review): "y'all" and 'you all' presumably never match because the
# tokenizer emits them as several tokens -- confirm and handle if needed.
interactive_lookup = {pronoun.lower() for pronoun in interactive}
rest_lookup = {pronoun.lower() for pronoun in rest}

subjects = []


youcounter = 0
restcounter = 0


for song in SesameSongs_clean['Only English Words']:
    tokens = [w.lower() for w in nltk.wordpunct_tokenize(song)]
    youcounter = sum(1 for w in tokens if w in interactive_lookup)
    restcounter = sum(1 for w in tokens if w in rest_lookup)

    # Share of pronouns that are interactive; 0 when the song has no
    # pronouns from either list.
    pronoun_total = youcounter + restcounter
    youprop = (youcounter / pronoun_total) if pronoun_total != 0 else 0

    subjects.append(youprop)

In [None]:
# Attach the interactive-pronoun proportions as a new column.
SesameSongs_clean['Interactive language prop'] = subjects

### New Columns: Polarity and Subjectivity

In [None]:
from textblob import TextBlob

In [None]:
# Renumber the rows 0..n-1 and discard the old index; the sentiment loop
# below looks lyrics up by those integer positions.
SesameSongs_clean = SesameSongs_clean.reset_index(drop = True)

In [None]:
# TextBlob sentiment for every lyric: first component is polarity, second
# is subjectivity.
polarity = []
subjectivity = []

for lyric in SesameSongs_clean['Lyrics']:
    sentiment = TextBlob(str(lyric)).sentiment
    polarity.append(sentiment[0])
    subjectivity.append(sentiment[1])

In [None]:
# Attach the polarity scores as a new column.
SesameSongs_clean['Polarity'] = polarity

In [None]:
# Attach the subjectivity scores as a new column.
SesameSongs_clean['Subjectivity'] = subjectivity

### New Column: Number of words outside Baladin word list

**Creating columns to identify how many "fringe" words are used (i.e. words that are less common in adult speech).** <br>
The 347 words on the Baladin list account for 75% of adult speech. 

##### **Creating Baladin List**

In [None]:
# Load the Baladin word list CSV as a DataFrame.
baladinpd = pd.read_csv('Baladin_WordList.csv')

In [None]:
import csv

# Read every row of the word-list CSV.
with open('Baladin_WordList.csv', 'r') as f:
    baladinlist = list(csv.reader(f))

# Flatten the rows into one list of words, skipping empty cells.
baladin = [word for listy in baladinlist for word in listy if word != '']

In [None]:
# Number of entries read from the Baladin list.
len(baladin)

##### **Counting number of words outside of the list**

In [None]:
# Per song: how many distinct tokens fall outside the Baladin list, and
# what share of the song's distinct tokens that is.
fringewords = []

fringecounts = []
fringeprop = []

# Set copies give constant-time membership tests inside the token loop.
baladin_lookup = set(baladin)
ignored_tokens = set(nonnumeric)

for song in SesameSongs_clean['Only English Words']:
    tokens = nltk.wordpunct_tokenize(song)

    # Lower-cased tokens that are neither on the list nor numerals/punctuation.
    fringewords = [
        w
        for w in (token.lower() for token in tokens)
        if w not in baladin_lookup and w not in ignored_tokens
    ]

    fringey = len(set(fringewords))
    # The denominator counts distinct raw tokens (case-sensitive,
    # punctuation included).
    uniques = len(set(tokens))

    fringecounts.append(fringey)
    # A song with no tokens would otherwise raise ZeroDivisionError.
    fringeprop.append(round((fringey / uniques), 3) if uniques else 0.0)

In [None]:
# Attach the count of distinct off-list (Baladin) words as a new column.
SesameSongs_clean['Adult fringe count'] = fringecounts

In [None]:
# Attach the proportion of distinct off-list (Baladin) words as a new column.
SesameSongs_clean['Adult fringe prop'] = fringeprop

### New Column: Number of words outside Marvin word list

**Creating columns to identify how many "fringe" words are used (i.e. words that are less common in toddler speech).** <br>
The 333 words on the Marvin list account for the majority of pre-schooler core speech.

##### **Creating Marvin list**

In [None]:
# Load the Marvin word list CSV as a DataFrame.
marvinpd = pd.read_csv('Marvin_WordList.csv')

In [None]:
marvin = []

import csv
# newline='' is how the csv module expects files to be opened.
with open('Marvin_WordList.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    marvinlist = list(reader)

# Flatten the rows into one word list, skipping empty cells and cells that
# start with a digit. str.isdigit is used because the earlier comparison of
# the first character against numlist could never match a leading '0'
# (numlist holds '1'..'20' and number words, but no '0').
for listy in marvinlist:
    for word in listy:
        if word and not word[0].isdigit():
            marvin.append(word)

In [None]:
# Collapse duplicate entries; a set also makes membership tests fast.
marvin = set(marvin)

##### **Counting number of words outside of the list**

In [None]:
# Per song: how many distinct tokens fall outside the Marvin list, and
# what share of the song's distinct tokens that is.
mfringewords = []

mfringecounts = []
mfringeprop = []

# Set copies give constant-time membership tests inside the token loop.
marvin_lookup = set(marvin)
ignored_tokens = set(nonnumeric)

for song in SesameSongs_clean['Only English Words']:
    tokens = nltk.wordpunct_tokenize(song)

    # Lower-cased tokens that are neither on the list nor numerals/punctuation.
    mfringewords = [
        w
        for w in (token.lower() for token in tokens)
        if w not in marvin_lookup and w not in ignored_tokens
    ]

    fringey = len(set(mfringewords))
    # The denominator counts distinct raw tokens (case-sensitive,
    # punctuation included).
    uniques = len(set(tokens))

    mfringecounts.append(fringey)
    # A song with no tokens would otherwise raise ZeroDivisionError.
    mfringeprop.append(round((fringey / uniques), 3) if uniques else 0.0)

In [None]:
# Attach the count of distinct off-list (Marvin) words as a new column.
SesameSongs_clean['PreK fringe count'] = mfringecounts

In [None]:
# Attach the proportion of distinct off-list (Marvin) words as a new column.
SesameSongs_clean['PreK fringe prop'] = mfringeprop

## Cluster Modeling

#### Preprocessing

In [None]:
# Display the table with all engineered columns.
SesameSongs_clean

** Selecting inputs for the model **

In [None]:
# Columns fed to the clustering model. Candidates currently left out:
# 'Word Count', 'Unique Eng Words', 'How many numbers?',
# 'Letter Proportion', 'PreK fringe count', 'Adult fringe count'.
model_columns = [
    'Prop of unique words',
    'Interactive language prop',
    'Adult fringe prop',
    'PreK fringe prop',
]
S_inputs = SesameSongs_clean.filter(items=model_columns)

In [None]:
# Display the selected model inputs.
S_inputs

** Scaling **

In [None]:
from sklearn.preprocessing import scale

In [None]:
# 'Word Count' exists in S_inputs only when it is selected as a model input;
# standardise it when present instead of raising KeyError.
if 'Word Count' in S_inputs.columns:
    S_inputs['Word Count'] = scale(S_inputs['Word Count'])

In [None]:
# 'How many numbers?' exists in S_inputs only when it is selected as a model
# input; standardise it when present instead of raising KeyError.
if 'How many numbers?' in S_inputs.columns:
    S_inputs['How many numbers?'] = scale(S_inputs['How many numbers?'])

In [None]:
# 'PreK fringe count' exists in S_inputs only when it is selected as a model
# input; standardise it when present instead of raising KeyError.
if 'PreK fringe count' in S_inputs.columns:
    S_inputs['PreK fringe count'] = scale(S_inputs['PreK fringe count'])

In [None]:
# 'Adult fringe count' exists in S_inputs only when it is selected as a model
# input; standardise it when present instead of raising KeyError.
if 'Adult fringe count' in S_inputs.columns:
    S_inputs['Adult fringe count'] = scale(S_inputs['Adult fringe count'])

In [None]:
# Display the model inputs after scaling.
S_inputs

#### Running Hierarchical Agglomerative Clustering model

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Ward-linkage agglomerative clustering into three groups, fitted on the
# selected inputs.
clustering_options = {'n_clusters': 3, 'linkage': 'ward'}
Aclus = AgglomerativeClustering(**clustering_options)
Aclus.fit(S_inputs)

In [None]:
# Display the inputs the model was fitted on.
S_inputs

In [None]:
# Fit the model again on the same inputs and keep the cluster label
# assigned to each row.
p = Aclus.fit_predict(S_inputs)

In [None]:
# newsong = np.array([0.28,0.8,0.34,0.5])
# newsong2 = np.reshape(newsong,(1,-1))

# Aclus.predict(S_inputs.iloc[0:2])

In [None]:
# Show the estimator's parameter settings.
Aclus.get_params()

In [None]:
# Work on a copy: plain assignment would only alias SesameSongs_clean, so
# adding the 'Cluster' column would silently modify the cleaned table too.
SesameSongs_clustered = SesameSongs_clean.copy()

In [None]:
# Attach each song's cluster label.
SesameSongs_clustered['Cluster'] = p

In [None]:
# Display the table including cluster labels.
SesameSongs_clustered

#### EDA on each cluster

In [None]:
# Number of songs in each cluster.
SesameSongs_clustered['Cluster'].value_counts()

In [None]:
# Mean of every engineered feature per cluster. The columns are selected
# with a list (double brackets): selecting several columns from a groupby
# with a bare tuple is rejected by current pandas.
cluster_analysis = SesameSongs_clustered.groupby(['Cluster'])[[
    'Word Count',
    'Unique Eng Words',
    'Prop of unique words',
    'How many numbers?',
    'Letter Proportion',
    'Interactive language prop',
    'Adult fringe prop',
    'PreK fringe prop',
    'PreK fringe count',
    'Adult fringe count',
]].mean()

In [None]:
# Display the per-cluster feature means.
cluster_analysis

**It seems like, generally, each cluster:** <br>
0. Has the highest proportion of new words.
1. Has the most words.
2. Has the highest proportion of words that break the fourth wall.



## LDA on each cluster

**Separating each cluster into its own dataframe**

In [None]:
# Rows whose cluster label is 0.
in_cluster0 = SesameSongs_clustered['Cluster'] == 0
cluster0 = SesameSongs_clustered.loc[in_cluster0]

In [None]:
# Rows whose cluster label is 1.
in_cluster1 = SesameSongs_clustered['Cluster'] == 1
cluster1 = SesameSongs_clustered.loc[in_cluster1]

In [None]:
# Rows whose cluster label is 2.
in_cluster2 = SesameSongs_clustered['Cluster'] == 2
cluster2 = SesameSongs_clustered.loc[in_cluster2]

** LDA on each cluster **

#### Cluster 0

In [None]:
from __future__ import print_function
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 


In [None]:
# Filtered lyrics of cluster 0 as a plain list of strings.
Songs0 = cluster0['Only English Words'].tolist()

In [None]:


# Unigram bag-of-words counts with English stop words removed.
vectorizer = CountVectorizer(ngram_range=(1,1), stop_words="english")

# fit_transform learns the vocabulary and builds the document-term matrix
# in one pass; the separate fit() and transform() calls that used to
# surround it repeated the same work on the same data.
x = vectorizer.fit_transform(Songs0)

# The same counts with terms as rows and songs as columns.
counts = x.transpose()


In [None]:
# Dimensions of the transposed count matrix.
counts.shape

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Three-topic LDA, batch learning, 15 iterations at most, fixed seed,
# n_jobs=-1.
lda_options = dict(
    n_components=3,
    max_iter=15,
    random_state=0,
    n_jobs=-1,
    learning_method='batch',
)
lda = LatentDirichletAllocation(**lda_options)

In [None]:
# Fit the topic model on the count matrix and keep the transformed output.
topics = lda.fit_transform(x)

In [None]:
# Dimensions of the fitted components matrix.
lda.components_.shape

In [None]:
# Load pyLDAvis' scikit-learn adapter and enable notebook output.
# NOTE(review): newer pyLDAvis releases appear to ship this adapter as
# pyLDAvis.lda_model -- confirm against the installed version.
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [None]:
# Build the pyLDAvis view from the fitted model, count matrix and vectorizer.
pyLDAvis.sklearn.prepare(lda, x, vectorizer)

In [None]:
from nltk import FreqDist

# Flatten cluster 0 into one list of lower-cased alphabetic word tokens.
# Iterating a song string directly yields single characters, so each song
# has to be tokenized first.
Songs0all = [
    word.lower()
    for song in Songs0
    for word in nltk.wordpunct_tokenize(song)
    if word.isalpha()
]

# Count words across the whole cluster. Passing the list of songs here
# would count whole songs, giving each a frequency of 1.
pop0words = FreqDist(Songs0all)

print(pop0words)

# Word frequencies in ascending order.
v=list(pop0words.values())
sort = (sorted(v))

from operator import itemgetter

# listy

#### Cluster 1

In [None]:
# The frame is named `cluster1` (lower case); `Cluster1` was never defined
# and raised NameError.
# NOTE(review): without regex=True this only blanks cells whose entire value
# is '.'; confirm whether stripping periods inside lyrics was intended.
cluster1 = cluster1.replace('.', '')

In [None]:
# Filtered lyrics of cluster 1 as a plain list.
Songs1 = cluster1['Only English Words'].tolist()

In [None]:
# Keep only string entries (e.g. drop NaN floats). The previous predicate
# was inverted and kept the non-strings, and filter() returned a one-shot
# iterator that was empty by the second time the vectorizer read it.
Songs1 = [song for song in Songs1 if isinstance(song, str)]

In [None]:
# Unigram bag-of-words counts with English stop words removed.
vectorizer = CountVectorizer(ngram_range=(1,1), stop_words="english")

# Songs1 may be a one-shot iterator (such as a filter object); materialise
# it so the documents are not lost after a single pass.
Songs1 = list(Songs1)

# fit_transform learns the vocabulary and builds the document-term matrix
# in one pass; the separate fit() and transform() calls repeated that work.
x = vectorizer.fit_transform(Songs1)

# The same counts with terms as rows and songs as columns.
counts = x.transpose()

#### Cluster 2

In [None]:
# Filtered lyrics of cluster 2 as a plain list of strings.
Songs2 = cluster2['Only English Words'].tolist()

In [None]:
# Unigram bag-of-words counts with English stop words removed.
vectorizer = CountVectorizer(ngram_range=(1,1), stop_words="english")

# fit_transform learns the vocabulary and builds the document-term matrix
# in one pass; the separate fit() and transform() calls that used to
# surround it repeated the same work on the same data.
x = vectorizer.fit_transform(Songs2)

# The same counts with terms as rows and songs as columns.
counts = x.transpose()

In [None]:
# Dimensions of the transposed count matrix for cluster 2.
counts.shape

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fresh three-topic LDA for cluster 2: batch learning, 15 iterations at
# most, fixed seed, n_jobs=-1.
lda_options = dict(
    n_components=3,
    max_iter=15,
    random_state=0,
    n_jobs=-1,
    learning_method='batch',
)
lda = LatentDirichletAllocation(**lda_options)

In [None]:
# Fit the topic model on cluster 2's count matrix and keep the transformed output.
topics = lda.fit_transform(x)

In [None]:
# Build the pyLDAvis view for cluster 2 from the fitted model, count matrix and vectorizer.
pyLDAvis.sklearn.prepare(lda, x, vectorizer)

## Song writing

In [None]:
# Newly written lyrics, scored against the Baladin and Marvin word lists below.
mysong = 'When Im feeling mad or sad I Follow my imagination till things arent so bad Try these things I like to do, And before you know it youll be joyful too. I Listen to the sounds of a baby bird Or Grab a book and read every word I Bop my head to a song’s little rhymes And sing along to pass the time I think of a place Id like to visit, And in my mind, its quite exquisite! I Honk the horn of an blue car I’m nearly there, I’m not so far! I Follow my dog to his favorite park, And befriend a furry monster when it gets dark, I Count the Hairs on the head of a dear pied piper Or write happy words on a typewriter! These are the things I like to do, I think youll love to do them too!'

** Baladin score **

In [None]:
# Start with empty result lists for the new song.
fringecounts = []
fringeprop = []

# Lower-cased tokens of the song that are neither on the Baladin list nor
# in the numeral/punctuation exclusion list.
lowered_tokens = (token.lower() for token in nltk.wordpunct_tokenize(mysong))
fringewords = [
    token
    for token in lowered_tokens
    if not (token in baladin or token in nonnumeric)
]
    

In [None]:

# Drop the apostrophe-less contractions from the fringe words.
# list.remove() returns None, deletes only the first occurrence ('youll'
# occurs twice in the song, so it survived) and raises ValueError when the
# word is absent; filtering removes every occurrence safely.
contractions = {'youll', 'arent'}
fringewords = [w for w in fringewords if w not in contractions]
wordlist = fringewords

fringey = (len(set(fringewords)))
uniques = len(set(nltk.wordpunct_tokenize(mysong)))

fringecounts.append(fringey)
fringeprop.append(round((fringey/uniques), 3))

In [None]:
# Proportion of the song's distinct tokens that are off the Baladin list.
fringeprop

In [None]:
# Count of distinct off-list (Baladin) words in the song.
fringecounts

In [None]:
# Distinct off-list (Baladin) words in the song.
balwordlist = set(fringewords)

In [None]:
# Display the distinct off-list (Baladin) words.
balwordlist

** Marvin score **

In [None]:
# Start with empty result lists for the new song.
mfringecounts = []
mfringeprop = []

# Lower-cased tokens of the song that are neither on the Marvin list nor
# in the numeral/punctuation exclusion list.
lowered_tokens = (token.lower() for token in nltk.wordpunct_tokenize(mysong))
mfringewords = [
    token
    for token in lowered_tokens
    if not (token in marvin or token in nonnumeric)
]
    
    

In [None]:

# Distinct off-list (Marvin) words versus all distinct tokens in the song.
marvin_fringe_vocab = set(mfringewords)
song_vocab = set(nltk.wordpunct_tokenize(mysong))

fringey = len(marvin_fringe_vocab)
uniques = len(song_vocab)

mfringecounts.append(fringey)
mfringeprop.append(round(fringey / uniques, 3))

In [None]:
# Display the distinct off-list (Marvin) words.
set(mfringewords)

In [None]:
# Count of distinct off-list (Marvin) words in the song.
mfringecounts

In [None]:
# Proportion of the song's distinct tokens that are off the Marvin list.
mfringeprop