# Data Processing

## 1. Load Data

In [1]:
#import libraries
import os

import numpy as np
import pandas as pd

In [2]:
# read the raw lyrics CSV into a pandas DataFrame
lyrics = pd.read_csv(filepath_or_buffer='./lyrics.csv')

## 2. Preview Data

In [3]:
# show the first five rows of the full data set
lyrics.head(n=5)

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [4]:
# count the distinct values in the artist column
len(lyrics['artist'].unique())

18231

In [5]:
# keep only the rows whose artist value is 'childish-gambino'
is_childish = lyrics['artist'] == 'childish-gambino'
lyrics_childish = lyrics[is_childish]
lyrics_childish.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
346741,346741,because-the-internet-screenplay-part-3,2013,childish-gambino,Hip-Hop,*******[PLAY SECRET TRACK 9 1/2 [CODED]Â NOW]...
346742,346742,the-worst-guys,2013,childish-gambino,Hip-Hop,All she needed was some\nAt a Clippers' game o...
346743,346743,regular-show,2013,childish-gambino,Hip-Hop,[Big Trouble (Tyler the Creator)]\nY'all bette...
346744,346744,think-of-me,2013,childish-gambino,Hip-Hop,
346745,346745,i-the-crawl,2013,childish-gambino,Hip-Hop,"Where we were, kinda thing, betcha crawl, all ..."


## 3. Clean Data

In [6]:
# boolean mask of missing cells; True marks a null value
lyrics_childish.isna()

Unnamed: 0,index,song,year,artist,genre,lyrics
346741,False,False,False,False,False,False
346742,False,False,False,False,False,False
346743,False,False,False,False,False,False
346744,False,False,False,False,False,True
346745,False,False,False,False,False,False
346746,False,False,False,False,False,False
346747,False,False,False,False,False,True
346748,False,False,False,False,False,False
346749,False,False,False,False,False,False
346750,False,False,False,False,False,False


In [7]:
# drop every song whose lyrics cell is null
lyrics_childish = lyrics_childish.dropna(subset=['lyrics'])
lyrics_childish.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
346741,346741,because-the-internet-screenplay-part-3,2013,childish-gambino,Hip-Hop,*******[PLAY SECRET TRACK 9 1/2 [CODED]Â NOW]...
346742,346742,the-worst-guys,2013,childish-gambino,Hip-Hop,All she needed was some\nAt a Clippers' game o...
346743,346743,regular-show,2013,childish-gambino,Hip-Hop,[Big Trouble (Tyler the Creator)]\nY'all bette...
346745,346745,i-the-crawl,2013,childish-gambino,Hip-Hop,"Where we were, kinda thing, betcha crawl, all ..."
346746,346746,iii-life-the-biggest-troll-andrew-auernheimer,2013,childish-gambino,Hip-Hop,"Man made the web, you don't need a name\nMan m..."


## 4. Preview Sample

In [8]:
# lyrics text of the first remaining song, shown as a raw string
demo1 = lyrics_childish['lyrics'].iloc[0]
demo1

'*******[PLAY SECRET TRACK 9 1/2 [CODED]Â\x9d NOW]*******\n(do not read on while "Secret track" plays. Wait until it is finished, then continue reading)\nCUT TO:\n****[PLAY "PLAYING AROUND BEFORE THE PARTY STARTS" NOW]****\nINT. MANSION - NIGHT\n[VISUAL]\nThe Boy sits at his piano; playing randomly, trying to make sense of everything/anything. Steve and Swank talk to Emily and MISLA in the kitchen. They\'re arguing about driving, or cooking with coconut oil instead of olive oil, or something else they themselves will not care about or remember in a month, year, ten years, 100 years, the age of the universe\nThe house is starting to look pretty bad. The cleaners that used to come stopped receiving their payment and stopped coming. Plastic cups are starting to spread like a blue rash across the living room and coffee tables. Every surface has a sticky spot. You used to be able to slide into the kitchen from the living room with a two second run and quick stop. Now, dried alcohol stops yo

## 5.a. Save Data (Childish Gambino)

In [9]:
# save the sample lyrics string as a text file
# - the context manager closes the file even if write() raises
# - explicit UTF-8: the lyrics contain non-ASCII characters, which the
#   platform default codec may be unable to encode
with open("./demo1.txt", "w", encoding="utf-8") as f:
    f.write(demo1)

In [10]:
# write each Childish Gambino song to ./artists/childish_gambino/<song>.txt
childish_dir = "./artists/childish_gambino"
# create the output folder up front so open() cannot fail on a missing directory
os.makedirs(childish_dir, exist_ok=True)

# walk the song and lyrics columns in lockstep instead of indexing row by row
for song, text in zip(lyrics_childish['song'], lyrics_childish['lyrics']):
    # the context manager closes each file even if write() raises;
    # explicit UTF-8 because the lyrics contain non-ASCII characters
    with open(os.path.join(childish_dir, song + ".txt"), "w", encoding="utf-8") as f:
        f.write(text)

## 5.b. Save Data (Drake)

In [11]:
# select Drake's songs and drop the rows that have no lyrics
lyrics_drake = lyrics.loc[lyrics['artist'] == 'drake']
lyrics_drake = lyrics_drake.dropna(subset=['lyrics'])

# write each song to ./artists/drake/<song>.txt
drake_dir = "./artists/drake"
# create the output folder up front so open() cannot fail on a missing directory
os.makedirs(drake_dir, exist_ok=True)

# walk the song and lyrics columns in lockstep instead of indexing row by row
for song, text in zip(lyrics_drake['song'], lyrics_drake['lyrics']):
    # the context manager closes each file even if write() raises;
    # explicit UTF-8 so the result does not depend on the platform default codec
    with open(os.path.join(drake_dir, song + ".txt"), "w", encoding="utf-8") as f:
        f.write(text)

## 5.c. Save Data (Frank Ocean)

In [12]:
# select Frank Ocean's songs and drop the rows that have no lyrics
lyrics_frank = lyrics.loc[lyrics['artist'] == 'frank-ocean']
lyrics_frank = lyrics_frank.dropna(subset=['lyrics'])

# write each song to ./artists/frank_ocean/<song>.txt
frank_dir = "./artists/frank_ocean"
# create the output folder up front so open() cannot fail on a missing directory
os.makedirs(frank_dir, exist_ok=True)

# walk the song and lyrics columns in lockstep instead of indexing row by row
for song, text in zip(lyrics_frank['song'], lyrics_frank['lyrics']):
    # the context manager closes each file even if write() raises;
    # explicit UTF-8 so the result does not depend on the platform default codec
    with open(os.path.join(frank_dir, song + ".txt"), "w", encoding="utf-8") as f:
        f.write(text)