Skip to content

Commit

Permalink
cleaning titles
Browse files Browse the repository at this point in the history
  • Loading branch information
suzan baert committed May 17, 2018
1 parent ce35200 commit 3986889
Show file tree
Hide file tree
Showing 13 changed files with 8,044 additions and 742,525 deletions.
22 changes: 13 additions & 9 deletions playlist_project/1_playlist_scrapedata.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ library(tidyverse)
source("PlaylistScrape.R")





#------------------
# SCRAPING THE DATA
#------------------
Expand All @@ -31,15 +34,6 @@ dates_mar <- paste0(1:31, "-03-2018")
date_selection <- c(dates_jan, dates_feb, dates_mar)


#test stubru only
# input_df_stubru <- data.frame(date = date_selection)
# input_df_stubru$radio <- "studiobrussel"
#
# stubru <- map2_df(input_df_stubru$radio, input_df_stubru$date, read_playlist_and_sleep)
# saveRDS(stubru, "playlist_project/data/4weeks_stubru.RDS")



# Get them all: build one row per (date, radio) combination.
# merge() on two plain vectors finds no shared columns, so it returns their
# Cartesian product; the two resulting columns are then renamed.
# (radio_selection is defined outside the lines shown here.)
input_df <- merge(date_selection, radio_selection)
names(input_df) <- c("date", "radio")
Expand All @@ -49,6 +43,10 @@ saveRDS(all_radios, "playlist_project/data/10weeks_allradios.RDS")





# --------------

# Adding new dates: "1-04-2018" ... "30-04-2018" (day is not zero-padded),
# crossed with every radio station (merge() without shared columns yields
# the Cartesian product).
dates_apr <- paste0(1:30, "-04-2018")
input_df <- merge(dates_apr, radio_selection)
Expand All @@ -61,6 +59,12 @@ saveRDS(all_radios_combined, "playlist_project/data/allradios_30apr.RDS")

write.csv2(all_radios_combined, "playlist_project/data/all_radios_apr_uncleaned2.csv")





#-------------

# Quick check: do I have all the dates for all radios?
# Cross-tabulates row counts per radio and date; a zero cell means that
# station/day combination is absent from the combined data.
table(all_radios_combined$radio, all_radios_combined$date)
30 changes: 17 additions & 13 deletions playlist_project/2_playlist_cleaning.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
library(tidyverse)
# all_radios <- readRDS("playlist_project/data/10weeks_allradios.RDS")
# all_radios <- readRDS("playlist_project/data/allradios_30apr.RDS")





#------------------
# CHECKING THE DATA
#------------------

# cleaning title - removing items between brackets () and [], removing multiple punctuation at end of sentence
# cleaning artist - removing anything between brackets, removing punctuation
# all: removing double spaces, to lower

#basic cleaning
radios_clean1 <- all_radios %>%
# Title: drop every "(...)" and "[...]" group, then strip trailing punctuation.
# The bracket patterns match up to the *nearest* closing bracket; a greedy
# ".+" would delete everything between the first "(" and the last ")", e.g.
# "(a) song (live)" would end up empty instead of " song ".
mutate(title = str_replace_all(title, "\\([^)]*\\)", ""),
       title = str_replace_all(title, "\\[[^\\]]*\\]", ""),
       title = str_replace(title, "(\\w+)\\s?[[:punct:]]+$", "\\1")) %>%
# Artist: drop "(...)" groups the same way, then turn . * , - into spaces.
mutate(artist = str_replace_all(artist, "\\([^)]*\\)", ""),
       artist = str_replace_all(artist, "[.*,-]", " ")) %>%
mutate_all(tolower) %>%
Expand All @@ -17,18 +27,11 @@ radios_clean1 <- all_radios %>%
mutate(day = lubridate::ymd_hm(day))


#### Cleaning up artist list ####

# Artist frequency table (most frequent first), then list every artist name
# that contains " and ", for instance.
artist_list <- count(radios_clean1, artist, sort=TRUE)
unlist(str_extract_all(artist_list$artist, ".+ and .+"))

# Same inspection on radios_clean2.
# NOTE(review): radios_clean2 is only assigned further down in the lines
# visible here -- presumably this is run interactively afterwards; confirm.
artist_list2 <- count(radios_clean2, artist, sort=TRUE)
unlist(str_extract_all(artist_list2$artist, ".+ and .+"))



#changing all feat. ft. + & etc into "and"
#copy original artist
radios_clean1$artist_original <- radios_clean1$artist


Expand Down Expand Up @@ -67,13 +70,14 @@ radios_clean2 <- radios_clean1 %>%



saveRDS(radios_clean2, "playlist_project/data/3m_radios_clean.RDS")
write.csv(radios_clean2, "playlist_project/data/3m_radios_clean.csv")
saveRDS(radios_clean2, "playlist_project/data/radios_clean.RDS")
write.csv(radios_clean2, "playlist_project/data/radios_clean.csv")






write.csv(radios_clean2, "playlist_project/data/all_radios_apr_cleaned.csv")


#### checked up to here ####
Expand Down
34 changes: 31 additions & 3 deletions playlist_project/2b_typos_cleaning.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@


## umlauts (o, a, u) and accented a
radios_clean1$artist <- str_replace_all(radios_clean1$artist, "ö", "o")




radios_clean1$artist <- str_replace(radios_clean1$artist, "blof", "bløf")
radios_clean1$artist <- str_replace(radios_clean1$artist, "bløf", "bløf")

# Anchored fixes: each pattern only matches at the start of the artist name.
# "0cc" or "10cc" -> "10 cc"
radios_clean1$artist <- str_replace(radios_clean1$artist, "^1?0cc", "10 cc")
# "jimi hendri" plus whatever follows (to the end of the string) -> "jimi hendrix"
radios_clean1$artist <- str_replace(radios_clean1$artist, "^jimi hendri.+", "jimi hendrix")
# "p!nk" -> "pink" (the "!" is escaped, so it is matched literally)
radios_clean1$artist <- str_replace(radios_clean1$artist, "^p\\!nk", "pink")
Expand All @@ -9,6 +18,7 @@ radios_clean1$artist <- str_replace(radios_clean1$artist, "and victoria bergsman
# Unanchored fixes: each replaces the first match anywhere in the artist name.
radios_clean1$artist <- str_replace(radios_clean1$artist, "keala and", "keala settle and")
radios_clean1$artist <- str_replace(radios_clean1$artist, "gorky", "gorki")
# "mc fioti"/"mc fiotti" followed by more text: drop everything after the name ...
radios_clean1$artist <- str_replace(radios_clean1$artist, "mc fiott?i .+", "mc fioti")
# ... and normalise the spelling when nothing follows it.
radios_clean1$artist <- str_replace(radios_clean1$artist, "mc fiott?i", "mc fioti")
# Drop everything after "todiefor ".
radios_clean1$artist <- str_replace(radios_clean1$artist, "todiefor .+", "todiefor")
radios_clean1$artist <- str_replace(radios_clean1$artist, "omd", "orchestral manoeuvres in the dark")
radios_clean1$artist <- str_replace(radios_clean1$artist, "jayz", "jay z")
Expand All @@ -25,18 +35,36 @@ radios_clean1$artist <- str_replace(radios_clean1$artist, "rob.+n.+raz", "rob 'n
radios_clean1$artist <- str_replace(radios_clean1$artist, "reel 2 real", "reel to real")


# NOTE(review): "." matches any character and "5?" is optional, so this also
# rewrites e.g. "the jacksons" to "the jackson five" -- confirm that is wanted.
radios_clean1$artist <- str_replace(radios_clean1$artist, "the jackson.5?", "the jackson five")
# "jackson 5" with or without a leading "the ". The space belongs to the
# optional group, so a bare "jackson 5" at the start of the name now matches,
# and the word boundaries keep the replacement from being glued onto a
# preceding word.
radios_clean1$artist <- str_replace(radios_clean1$artist, "\\b(the )?jackson 5\\b", "the jackson five")
radios_clean1$artist <- str_replace(radios_clean1$artist, "ph d", "phd")



# Treat a standalone "with" as a collaboration marker and rewrite it to "and" ...
radios_clean1$artist <- str_replace(radios_clean1$artist, "\\bwith\\b", "and")
# ... then restore the two acts whose own name contains "with".
radios_clean1$artist <- str_replace(radios_clean1$artist, "mark and a k", "mark with a k")
radios_clean1$artist <- str_replace(radios_clean1$artist, "sin and sebastian", "sin with sebastian")
radios_clean1$artist <- str_replace(radios_clean1$artist, "tom robinson band", "tom robinson")
radios_clean1$artist <- str_replace(radios_clean1$artist, "popgun", "pop gun")
radios_clean1$artist <- str_replace(radios_clean1$artist, "wizstars", "the wiz stars")
# The pattern consumes the trailing space, so the replacement must put it
# back; otherwise the following word is glued on ("a cause desgarcons").
radios_clean1$artist <- str_replace(radios_clean1$artist, "a caus' des ", "a cause des ")
radios_clean1$artist <- str_replace(radios_clean1$artist, "rafaella carra", "raffaella carra")
# FIXME(review): disabled. This rewrote "elvis" inside any artist name to
# "raffaella carra" (e.g. "elvis presley" -> "raffaella carra presley"), which
# looks like a copy-paste slip from the line above. Restore it with the
# intended replacement once that is known.
# radios_clean1$artist <- str_replace(radios_clean1$artist, "elvis", "raffaella carra")

# Spelling normalisation; each call replaces the first match in the name.
radios_clean1$artist <- str_replace(radios_clean1$artist, "témé tan", "teme tan")
# Word boundaries are required here: the bare pattern "tatu" also matches
# inside "status quo" and turned it into "st a t us quo".
radios_clean1$artist <- str_replace(radios_clean1$artist, "\\btatu\\b", "t a t u")
radios_clean1$artist <- str_replace(radios_clean1$artist, "de booy", "de booij")
radios_clean1$artist <- str_replace(radios_clean1$artist, "polarkreiss", "polarkreis")
radios_clean1$artist <- str_replace(radios_clean1$artist, "rené froger", "rene froger")
radios_clean1$artist <- str_replace(radios_clean1$artist, "don mcclean", "don mclean")
radios_clean1$artist <- str_replace(radios_clean1$artist, "greenday", "green day")
radios_clean1$artist <- str_replace(radios_clean1$artist, "patti smith group", "patti smith")
radios_clean1$artist <- str_replace(radios_clean1$artist, "jasper stervelinck", "jasper steverlinck")
radios_clean1$artist <- str_replace(radios_clean1$artist, "2pac", "2 pac")




radios_clean1$artist <- str_replace(radios_clean1$artist, "tom robinson band", "tom robinson")

radios_clean1$artist <- str_replace(radios_clean1$artist, "popgun", "pop gun")

# Title fixes.
# Remove the embargo notice that appears inside a title (the pattern includes
# the space that follows the notice).
radios_clean1$title <- str_replace(radios_clean1$title, "!!!embargo tot vrijdag 02/02!!! ", "")
# The pattern consumes the trailing space, so the replacement must restore it
# to keep the next word separate.
radios_clean1$title <- str_replace(radios_clean1$title, "a caus' des ", "a cause des ")
Binary file removed playlist_project/data/4weeks_allradios.RDS
Binary file not shown.
Binary file removed playlist_project/data/4weeks_radios_clean.RDS
Binary file not shown.
Binary file removed playlist_project/data/4weeks_stubru.RDS
Binary file not shown.
Loading

0 comments on commit 3986889

Please sign in to comment.