# Data Processing

## 1. Load Data

In [1]:
#import libraries
import os

import numpy as np
import pandas as pd

In [2]:
# read the lyrics CSV from the working directory into a data frame
lyrics_csv_path = './lyrics_metrolyrics.csv'
lyrics = pd.read_csv(lyrics_csv_path)

## 2. Preview Data

In [3]:
# show the first five rows of the full data frame
lyrics.head(5)

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [4]:
# count the distinct values in the artist column
# (unique() keeps NaN as a value, so this matches the original count)
len(lyrics['artist'].unique())

18231

In [5]:
# select the rows whose artist value is 'game' (the rapper) and preview them
is_game = lyrics['artist'] == 'game'
lyrics_game = lyrics[is_game]
lyrics_game.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
42632,42632,beautiful-day,2009,game,Hip-Hop,I'm in love it's a beautiful day\nI'm in love ...
42633,42633,better-on-the-other-side,2009,game,Hip-Hop,[Diddy talking]\nI remember the first time I s...
42634,42634,better-days,2009,game,Hip-Hop,[Chorus]\nBeen holdin' this pain inside for so...
42635,42635,flash-back-memories,2009,game,Hip-Hop,"Play With My Cards\nhere we go,\nbait and reel..."
42636,42636,i-m-so-wavy,2009,game,Hip-Hop,"N-gga pop tags, I pop the fo-five\nN-gga got s..."


## 3. Clean Data

In [6]:
# boolean mask of the game subset: True wherever a cell is null
lyrics_game.isna()

Unnamed: 0,index,song,year,artist,genre,lyrics
42632,False,False,False,False,False,False
42633,False,False,False,False,False,False
42634,False,False,False,False,False,False
42635,False,False,False,False,False,False
42636,False,False,False,False,False,False
42637,False,False,False,False,False,False
42638,False,False,False,False,False,False
42639,False,False,False,False,False,False
42640,False,False,False,False,False,False
42641,False,False,False,False,False,False


In [7]:
# keep only the rows that actually have lyrics, then preview the result
has_lyrics = lyrics_game['lyrics'].notna()
lyrics_game = lyrics_game[has_lyrics]
lyrics_game.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
42632,42632,beautiful-day,2009,game,Hip-Hop,I'm in love it's a beautiful day\nI'm in love ...
42633,42633,better-on-the-other-side,2009,game,Hip-Hop,[Diddy talking]\nI remember the first time I s...
42634,42634,better-days,2009,game,Hip-Hop,[Chorus]\nBeen holdin' this pain inside for so...
42635,42635,flash-back-memories,2009,game,Hip-Hop,"Play With My Cards\nhere we go,\nbait and reel..."
42636,42636,i-m-so-wavy,2009,game,Hip-Hop,"N-gga pop tags, I pop the fo-five\nN-gga got s..."


## 4. Preview Sample

In [8]:
# take the lyrics string of the first song in the game subset and display it
demo = lyrics_game['lyrics'].iloc[0]
demo

"I'm in love it's a beautiful day\nI'm in love it's a beautiful way\nI don't care what the weathermen say\nIt's a beautiful day (West side)\nI'm in love it's a beautiful day (We the future)\nI'm in love it's a beautiful way (We the future)\nI dont care what the weathermen say (We the future)\nIt's a beautiful day\nI'm free as a mother fucking bird I swear\nDissappearantly in air\nThere go game nigga where\nHosted on the block\nIn the black ass\nIn that all black phantom hug a block like a bear\nYeah That V12 is roaring\nFlying through the city with the pedal to the floor and\nI'm putting 26 inches on the curve\nTell the hood I'm back\nGive me the Quarter and let me serve,\nSwab, I'm still dope that's my word\nAll I did was wished the kitches and some track all on a bird\nEerb, Goddy u know I'm a murderer\nHalf these niggas beefing with me\nI never heard of them\nIf I was the old me I would murdered them\nIn matter of face if I was the old me I would hurt him still\nCourtesy, um a sleep

In [9]:
# save the sample lyrics string as a txt file
# - the context manager closes the file even if the write raises
# - "w" is enough because the file is only written, never read back
# - explicit UTF-8 keeps non-ASCII characters in lyrics from failing on
#   platforms whose default encoding is not UTF-8
with open("./demo.txt", "w", encoding="utf-8") as demo_file:
    demo_file.write(demo)

## 5.a. Save Data (Game)

In [10]:
# write the lyrics of every game song to one text file,
# each song followed by a newline
n = lyrics_game.shape[0]
print("Number of songs: " + str(n))

# create the output folder if it is missing so the cell runs on a fresh checkout
os.makedirs("./game", exist_ok=True)

# the context manager closes the file even if a write raises; explicit UTF-8
# avoids platform-dependent encoding errors on non-ASCII lyrics
with open("./game/input.txt", "w", encoding="utf-8") as f:
    # iterate over the column directly instead of indexing with iloc[i]
    for song_lyrics in lyrics_game['lyrics']:
        f.write(song_lyrics)
        f.write('\n')

Number of songs: 341


## 5.b. Save Data (Drake)

In [11]:
# data for drake: rows whose artist value is 'drake'
lyrics_drake = lyrics.loc[lyrics['artist'] == 'drake']
# remove rows with null lyrics (they cannot be written as text)
lyrics_drake = lyrics_drake[pd.notnull(lyrics_drake['lyrics'])]

# write the lyrics of every song to one text file, each followed by a newline
n = lyrics_drake.shape[0]
print("Number of songs: " + str(n))

# create the output folder if it is missing so the cell runs on a fresh checkout
os.makedirs("./drake", exist_ok=True)

# the context manager closes the file even if a write raises; explicit UTF-8
# avoids platform-dependent encoding errors on non-ASCII lyrics
with open("./drake/input.txt", "w", encoding="utf-8") as f:
    # iterate over the column directly instead of indexing with iloc[i]
    for song_lyrics in lyrics_drake['lyrics']:
        f.write(song_lyrics)
        f.write('\n')

Number of songs: 373


## 5.c. Save Data (Childish Gambino)

In [12]:
# data for childish gambino: rows whose artist value is 'childish-gambino'
lyrics_childish = lyrics.loc[lyrics['artist'] == 'childish-gambino']
# remove rows with null lyrics (they cannot be written as text)
lyrics_childish = lyrics_childish[pd.notnull(lyrics_childish['lyrics'])]

# write the lyrics of every song to one text file, each followed by a newline
n = lyrics_childish.shape[0]
print("Number of songs: " + str(n))

# create the output folder if it is missing so the cell runs on a fresh checkout
os.makedirs("./childish_gambino", exist_ok=True)

# the context manager closes the file even if a write raises; explicit UTF-8
# avoids platform-dependent encoding errors on non-ASCII lyrics
with open("./childish_gambino/input.txt", "w", encoding="utf-8") as f:
    # iterate over the column directly instead of indexing with iloc[i]
    for song_lyrics in lyrics_childish['lyrics']:
        f.write(song_lyrics)
        f.write('\n')

Number of songs: 130


## 5.d. Save Data (Frank Ocean)

In [13]:
# data for frank ocean: rows whose artist value is 'frank-ocean'
lyrics_frank = lyrics.loc[lyrics['artist'] == 'frank-ocean']
# remove rows with null lyrics (they cannot be written as text)
lyrics_frank = lyrics_frank[pd.notnull(lyrics_frank['lyrics'])]

# write the lyrics of every song to one text file, each followed by a newline
n = lyrics_frank.shape[0]
print("Number of songs: " + str(n))

# create the output folder if it is missing so the cell runs on a fresh checkout
os.makedirs("./frank_ocean", exist_ok=True)

# the context manager closes the file even if a write raises; explicit UTF-8
# avoids platform-dependent encoding errors on non-ASCII lyrics
with open("./frank_ocean/input.txt", "w", encoding="utf-8") as f:
    # iterate over the column directly instead of indexing with iloc[i]
    for song_lyrics in lyrics_frank['lyrics']:
        f.write(song_lyrics)
        f.write('\n')

Number of songs: 129


## 5.e. Save Data (Rap Genre)

In [15]:
# data for all rap songs: rows whose genre value is 'Hip-Hop'
lyrics_rap = lyrics.loc[lyrics['genre'] == 'Hip-Hop']
# remove rows with null lyrics (they cannot be written as text)
lyrics_rap = lyrics_rap[pd.notnull(lyrics_rap['lyrics'])]

# report how many songs are available before sampling
n = lyrics_rap.shape[0]
print("Number of songs: " + str(n))

# grab a random sample of (at most) 10,000 songs
# - sample(n=...) asks for an exact row count instead of relying on the
#   rounding of a float fraction (frac=10000/n)
# - min() keeps the cell working when fewer than 10,000 songs are available
#   (a fraction above 1 is rejected when sampling without replacement)
# - a fixed random_state makes the sample, and so the output file, reproducible
sample_size = min(10000, n)
lyrics_rap = lyrics_rap.sample(n=sample_size, random_state=42)

# create the output folder if it is missing so the cell runs on a fresh checkout
os.makedirs("./rap_genre", exist_ok=True)

# the context manager closes the file even if a write raises; explicit UTF-8
# avoids platform-dependent encoding errors on non-ASCII lyrics
with open("./rap_genre/input.txt", "w", encoding="utf-8") as f:
    # iterate over the sampled rows themselves rather than a hard-coded range
    for song_lyrics in lyrics_rap['lyrics']:
        f.write(song_lyrics)
        f.write('\n')

Number of songs: 24850
