In [116]:
import pandas as pd
import numpy as np

In [117]:
df = pd.read_csv("artist_concert_data.csv")
clean_df = df.copy()
df.head()

Unnamed: 0,Date,Concert,Venue,Location,Unnamed: 4,Artist
0,26-Nov-22,Faithless,Concorde 2,"Brighton, England, United Kingdom",,Faithless
1,24-Sep-22,Faithless / Crazy P / Don Letts / Dat Brass,Dreamland Margate,"Margate, United Kingdom",,Faithless
2,15-Sep-22,faithless,Sportpaleis Antwerpen,"Antwerp, Flanders, Belgium",,Faithless
3,21-Aug-22,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Weston Park,"Staffordshire, UK",,Faithless
4,29-Jul-22,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Lulworth Castle,"Dorset, United Kingdom",,Faithless


In [118]:
# Remove bad dates from 'Date' column - Show Duplicate, Upcoming, Rescheduled, Cancelled
mask = clean_df["Date"].str.contains("(Show Duplicate|Upcoming|Rescheduled|Cancelled|No concerts found)")
clean_df = clean_df[~mask]

clean_df.head()

  mask = clean_df["Date"].str.contains("(Show Duplicate|Upcoming|Rescheduled|Cancelled|No concerts found)")


Unnamed: 0,Date,Concert,Venue,Location,Unnamed: 4,Artist
0,26-Nov-22,Faithless,Concorde 2,"Brighton, England, United Kingdom",,Faithless
1,24-Sep-22,Faithless / Crazy P / Don Letts / Dat Brass,Dreamland Margate,"Margate, United Kingdom",,Faithless
2,15-Sep-22,faithless,Sportpaleis Antwerpen,"Antwerp, Flanders, Belgium",,Faithless
3,21-Aug-22,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Weston Park,"Staffordshire, UK",,Faithless
4,29-Jul-22,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Lulworth Castle,"Dorset, United Kingdom",,Faithless


In [119]:
# Format dates not in the format DD-MMM-YY
test_date_regex = "^\d{1,2}-[a-zA-Z]{3}-\d{1,2}"
date_format_mask = clean_df["Date"].str.match(test_date_regex)

# Grab dates that are in range, split, select earliest date
unformatted_date_df = clean_df[~date_format_mask]["Date"]
unformatted_date_df = unformatted_date_df.str.split("–", expand=True)[0]
unformatted_date_df

# Replace based on indices
clean_df["Date"].loc[unformatted_date_df.index] = unformatted_date_df

# Format MMM DD, YYYY to DD-MM-YY
clean_df["Date"] = clean_df["Date"].astype("datetime64[ns]")
clean_df

Unnamed: 0,Date,Concert,Venue,Location,Unnamed: 4,Artist
0,2022-11-26,Faithless,Concorde 2,"Brighton, England, United Kingdom",,Faithless
1,2022-09-24,Faithless / Crazy P / Don Letts / Dat Brass,Dreamland Margate,"Margate, United Kingdom",,Faithless
2,2022-09-15,faithless,Sportpaleis Antwerpen,"Antwerp, Flanders, Belgium",,Faithless
3,2022-08-21,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Weston Park,"Staffordshire, UK",,Faithless
4,2022-07-29,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Lulworth Castle,"Dorset, United Kingdom",,Faithless
...,...,...,...,...,...,...
200733,2022-06-06,Dave Matthews Band Setlists,Daily's Place,"Jacksonville, Florida, United States",,Dave Matthews Band
200734,2022-06-06,Dave Matthews Band,Blossom Music Center,"Cuyahoga Falls, OH, US",,Dave Matthews Band
200735,2022-06-04,Dave Matthews Band Setlists,Credit One Stadium,,,Dave Matthews Band
200736,2022-06-03,Dave Matthews Band Setlists,Credit One Stadium,"Charleston, South Carolina, United States",,Dave Matthews Band


In [120]:
# Remove unknown venue
na_venue_mask = clean_df["Venue"].isna()
clean_df = clean_df[~na_venue_mask]

# Get venue names where location unknown
na_location_mask = clean_df["Location"].isna()
venue_name = set(clean_df[na_location_mask]["Venue"])

venue_filter_mask = clean_df["Venue"].isin(venue_name)
venue_loc_infer = clean_df[~na_location_mask & venue_filter_mask][["Venue", "Location"]]

In [121]:
# Rank venue location map and get the first value
resolved_venue_loc = venue_loc_infer.groupby("Venue").nth(0)
type(resolved_venue_loc)

# Map function to replace NA
def remove_loc_na(row):
    if not pd.isnull(row["Location"]):
        return row["Location"]
    if pd.isnull(row["Location"]) and row["Venue"] in set(resolved_venue_loc.index):
        return resolved_venue_loc.loc[row["Venue"]]
    else:
        return np.nan
        
# Loop through DF and resolve missing locations
clean_df["Location"] = clean_df.apply(remove_loc_na, axis = 1)
clean_df[na_location_mask]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["Location"] = clean_df.apply(remove_loc_na, axis = 1)


Unnamed: 0,Date,Concert,Venue,Location,Unnamed: 4,Artist
227,2022-10-07,Alice Cooper Setlists,Yaamava' Theater,"Location Highland, California, United State...",,Alice Cooper
240,2022-09-20,Alice Cooper Photos Setlists,"Emens Auditorium, Ball State University",,,Alice Cooper
540,2019-08-06,P.O.D.,Starland Ballroom,"Location Newark, New Jersey, United States ...",,P.O.D.
1269,2012-10-05,Gotye / Missy Higgins Setlists,Chastain Park Amphitheatre,"Location Atlanta, Georgia, United States Na...",,Gotye
1505,2023-01-30,70000 Tons of Metal Hei'An / Vicious Rumors / ...,Royal Caribbean - Freedom of the Seas,,,Nightwish
...,...,...,...,...,...,...
200210,2017-12-01,Grizzly Bear Photos Setlists,Palace Theatre,"Location Albany, New York, United States Na...",,Grizzly Bear
200538,2020-08-30,Matchbox Twenty,Budweiser Stage,"Location Toronto, Ontario, Canada Name: Bud...",,Matchbox Twenty
200539,2020-07-31,Matchbox Twenty,Veterans United Home Loans Amphitheater,"Location Virginia Beach, Virginia, United S...",,Matchbox Twenty
200735,2022-06-04,Dave Matthews Band Setlists,Credit One Stadium,"Location Charleston, South Carolina, United...",,Dave Matthews Band


In [122]:
# Remove rows where location is still null
na_loc_still_mask = clean_df["Location"].isna()
clean_df = clean_df[~na_loc_still_mask]
clean_df

Unnamed: 0,Date,Concert,Venue,Location,Unnamed: 4,Artist
0,2022-11-26,Faithless,Concorde 2,"Brighton, England, United Kingdom",,Faithless
1,2022-09-24,Faithless / Crazy P / Don Letts / Dat Brass,Dreamland Margate,"Margate, United Kingdom",,Faithless
2,2022-09-15,faithless,Sportpaleis Antwerpen,"Antwerp, Flanders, Belgium",,Faithless
3,2022-08-21,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Weston Park,"Staffordshire, UK",,Faithless
4,2022-07-29,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Lulworth Castle,"Dorset, United Kingdom",,Faithless
...,...,...,...,...,...,...
200733,2022-06-06,Dave Matthews Band Setlists,Daily's Place,"Jacksonville, Florida, United States",,Dave Matthews Band
200734,2022-06-06,Dave Matthews Band,Blossom Music Center,"Cuyahoga Falls, OH, US",,Dave Matthews Band
200735,2022-06-04,Dave Matthews Band Setlists,Credit One Stadium,"Location Charleston, South Carolina, United...",,Dave Matthews Band
200736,2022-06-03,Dave Matthews Band Setlists,Credit One Stadium,"Charleston, South Carolina, United States",,Dave Matthews Band


In [123]:
# Drop column Unnamed: 4
clean_df = clean_df.drop(["Unnamed: 4"], axis = 1)
clean_df

Unnamed: 0,Date,Concert,Venue,Location,Artist
0,2022-11-26,Faithless,Concorde 2,"Brighton, England, United Kingdom",Faithless
1,2022-09-24,Faithless / Crazy P / Don Letts / Dat Brass,Dreamland Margate,"Margate, United Kingdom",Faithless
2,2022-09-15,faithless,Sportpaleis Antwerpen,"Antwerp, Flanders, Belgium",Faithless
3,2022-08-21,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Weston Park,"Staffordshire, UK",Faithless
4,2022-07-29,"Camp Bestival ""Camp Bestival"" / Rag N Bone Man...",Lulworth Castle,"Dorset, United Kingdom",Faithless
...,...,...,...,...,...
200733,2022-06-06,Dave Matthews Band Setlists,Daily's Place,"Jacksonville, Florida, United States",Dave Matthews Band
200734,2022-06-06,Dave Matthews Band,Blossom Music Center,"Cuyahoga Falls, OH, US",Dave Matthews Band
200735,2022-06-04,Dave Matthews Band Setlists,Credit One Stadium,"Location Charleston, South Carolina, United...",Dave Matthews Band
200736,2022-06-03,Dave Matthews Band Setlists,Credit One Stadium,"Charleston, South Carolina, United States",Dave Matthews Band


In [124]:
# Correct future dates by substracting a century
def replace_future_date(row):
    if not row["Date"] >= pd.Timestamp.today():
        return row["Date"]
    else:
        return row["Date"] - pd.DateOffset(years=100)
    
clean_df["Date"] = clean_df.apply(replace_future_date, axis = 1)
clean_df[clean_df["Date"] > pd.Timestamp.today()]

Unnamed: 0,Date,Concert,Venue,Location,Artist


In [125]:
# Check if there are any artists that play concert on the same date
group_df = clean_df.groupby(["Date", "Artist"]).nth(0)
group_df.reset_index()
clean_df = group_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Concert,Venue,Location
Date,Artist,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1951-01-26,Édith Piaf,Édith Piaf,Cinéma Acora,"Brussels, Brussels Capital, Belgium"
1951-01-27,Édith Piaf,Édith Piaf,Opéra de Lille,"Lille, Hauts-de-France, France"
1951-02-20,Édith Piaf,Édith Piaf,Alhambra,"Bordeaux, France"
1952-09-26,Sarah Vaughan,Nat King Cole / Stan Kenton / Sarah Vaughan / ...,William B. Bell Auditorium,"Augusta, Georgia, United States"
1956-02-23,Brenda Lee,Brenda Lee / Red Foley,William B. Bell Auditorium,"Augusta, Georgia, United States"
...,...,...,...,...
2023-02-24,Theory of a Deadman,Theory of a Deadman / Skillet / Saint Asonia,Allentown Fairgrounds Grandstand,"Allentown, Pennsylvania, United States"
2023-02-24,Vance Joy,Vance Joy,Jack Singer Concert Hall,"Calgary, Alberta, Canada"
2023-02-24,Willie Nelson,Willie Nelson,The St. Augustine Amphitheatre,"St. Augustine, Florida, United States"
2023-02-24,Yo La Tengo,Yo La Tengo,The Fillmore,"San Francisco, California, United States"


In [131]:
# Remove non-ASCII character records
artist_mask = clean_df["Artist"].str.contains(r'[^\x00-\x7F]+')
clean_df["Artist"][artist_mask].drop_duplicates()

28318             Blue Öyster Cult
37719                  Mötley Crüe
45038             Jane’s Addiction
49956                 Télépopmusik
70378             Eagle‐Eye Cherry
76324                      Hard‐Fi
84871                    Anti‐Flag
85394         Bone Thugs‐n‐Harmony
95200                    The B‐52s
97692                     CHVRCHΞS
99976                Róisín Murphy
107089         Sophie Ellis‐Bextor
109974              Antonín Dvořák
109980                  Édith Piaf
111954             At the Drive‐In
113042         “Weird Al” Yankovic
122461               Janelle Monáe
129966                     Beyoncé
134896                   blink‐182
164823    The All‐American Rejects
171098               Michael Bublé
172113             Destiny’s Child
181642                      Tiësto
187669                   Motörhead
Name: Artist, dtype: object

In [128]:
clean_df.to_csv("cleaned_concert_data.csv")