In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import re
from sqlalchemy import create_engine

In [2]:
os.chdir("..")
print(os.getcwd())

c:\Users\valen\Desktop\etl_workshop002


In [3]:
with open ("credentials.json", "r", encoding="utf-8") as file:
    credentials = json.load(file)

db_host = credentials["db_host"]
db_name = credentials["db_name"]
db_user = credentials["db_user"]
db_password = credentials["db_password"]

conn = create_engine(f"postgresql://{db_user}:{db_password}@{db_host}:5432/{db_name}?client_encoding=utf8")


In [4]:
query = "SELECT * FROM grammys_raw_data;"
grammy_df = pd.read_sql(query, conn)

grammy_df.head()

Unnamed: 0,year,title,published_at,updated_at,category,nominee,artist,workers,img,winner
0,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Bad Guy,Billie Eilish,"Finneas O'Connell, producer; Rob Kinelski & Fi...",https://www.grammy.com/sites/com/files/styles/...,True
1,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,"Hey, Ma",Bon Iver,"BJ Burton, Brad Cook, Chris Messina & Justin V...",https://www.grammy.com/sites/com/files/styles/...,True
2,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,7 rings,Ariana Grande,"Charles Anderson, Tommy Brown, Michael Foster ...",https://www.grammy.com/sites/com/files/styles/...,True
3,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Hard Place,H.E.R.,"Rodney “Darkchild” Jerkins, producer; Joseph H...",https://www.grammy.com/sites/com/files/styles/...,True
4,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Talk,Khalid,"Disclosure & Denis Kosiak, producers; Ingmar C...",https://www.grammy.com/sites/com/files/styles/...,True


In [5]:
grammy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4810 entries, 0 to 4809
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   year          4810 non-null   int64 
 1   title         4810 non-null   object
 2   published_at  4810 non-null   object
 3   updated_at    4810 non-null   object
 4   category      4810 non-null   object
 5   nominee       4804 non-null   object
 6   artist        2970 non-null   object
 7   workers       2620 non-null   object
 8   img           3443 non-null   object
 9   winner        4810 non-null   bool  
dtypes: bool(1), int64(1), object(8)
memory usage: 343.0+ KB


In [6]:
grammy_df.describe()

Unnamed: 0,year
count,4810.0
mean,1995.566944
std,17.14972
min,1958.0
25%,1983.0
50%,1998.0
75%,2010.0
max,2019.0


In [7]:
grammy_df.describe(include="object")

Unnamed: 0,title,published_at,updated_at,category,nominee,artist,workers,img
count,4810,4810,4810,4810,4804,2970,2620,3443
unique,62,4,10,638,4131,1658,2366,1463
top,62nd Annual GRAMMY Awards (2019),2017-11-28T00:03:45-08:00,2019-09-10T01:08:19-07:00,Song Of The Year,Robert Woods,(Various Artists),"John Williams, composer (John Williams)",https://www.grammy.com/sites/com/files/styles/...
freq,433,4205,778,70,7,66,20,26


In [8]:
grammy_df.describe(include="boolean")

Unnamed: 0,winner
count,4810
unique,1
top,True
freq,4810


In [9]:
print(f"Number of duplicates: {grammy_df.duplicated().sum()}")

Number of duplicates: 0


In [10]:
print(f"Dataset Shape: {grammy_df.shape}")
print("\nMissing Values per Column:")
print(grammy_df.isnull().sum().sort_values())

Dataset Shape: (4810, 10)

Missing Values per Column:
year               0
title              0
published_at       0
updated_at         0
category           0
winner             0
nominee            6
img             1367
artist          1840
workers         2190
dtype: int64


In [11]:
cols = ['nominee', 'img', 'artist']
coverage = grammy_df[cols].notna().mean().sort_values(ascending=True) * 100

print("Data coverage by columns with null values (% of present values):")
print(coverage)

Data coverage by columns with null values (% of present values):
artist     61.746362
img        71.580042
nominee    99.875260
dtype: float64


In [12]:
grammy_df.drop(columns=['title', 'published_at', 'updated_at', 'img'], inplace=True)

In [13]:
grammy_df[grammy_df["nominee"].isnull()]

Unnamed: 0,year,category,nominee,artist,workers,winner
2274,2000,"Remixer of the Year, Non-Classical",,,,True
2372,1999,"Remixer Of The Year, Non-Classical",,,,True
2464,1998,"Remixer Of The Year, Non-classical",,,,True
2560,1997,"Remixer Of The Year, Non-Classical",,,,True
4527,1965,Best New Country & Western Artist,,,,True
4574,1964,Best New Country & Western Artist Of 1964,,,,True


In [14]:
grammy_df = grammy_df[grammy_df['nominee'].notna()]

In [15]:
print("\nMissing Values per Column:")
print(grammy_df.isnull().sum().sort_values())


Missing Values per Column:
year           0
category       0
nominee        0
winner         0
artist      1834
workers     2184
dtype: int64


In [16]:
def normalize_grammy_text(text):
    if pd.isna(text):
        return text
    text = str(text)
    # Remover paréntesis y contenido dentro (ej. "feat. (uncredited)")
    text = re.sub(r'\([^)]*\)', '', text)
    # Remover "feat.", "ft.", etc. y normalizar
    text = re.sub(r'(feat\.|ft\.|featuring|with)', ';', text, flags=re.IGNORECASE)
    return (
        text.strip()
        .lower()
        .replace('&', ';')
        .replace(' x ', ';')
    )

grammy_df['artist'] = grammy_df['artist'].apply(normalize_grammy_text)

In [17]:
print(grammy_df.duplicated(subset=['year', 'category', 'nominee']).sum())  

0


In [18]:
both_null_values = grammy_df.loc[grammy_df["artist"].isna() & grammy_df["workers"].isna()]
both_null_values.head()

Unnamed: 0,year,category,nominee,artist,workers,winner
24,2019,Best New Artist,Billie Eilish,,,True
25,2019,Best New Artist,Black Pumas,,,True
26,2019,Best New Artist,Lil Nas X,,,True
27,2019,Best New Artist,Lizzo,,,True
28,2019,Best New Artist,Maggie Rogers,,,True


In [19]:
both_null_values["category"].value_counts()

category
Best New Artist                                                                              50
Producer Of The Year, Non-Classical                                                          22
Producer Of The Year, Classical                                                              22
Classical Producer Of The Year                                                               18
Producer Of The Year (Non-Classical)                                                         10
Producer Of The Year                                                                         10
Best New Artist Of The Year                                                                   9
Best Classical Vocal Soloist Performance                                                      7
Best Classical Vocal Performance                                                              4
Best Small Ensemble Performance (With Or Without Conductor)                                   4
Best Classical Performance - In

In [20]:
categories = [
    "Best Classical Vocal Soloist Performance",
    "Best Classical Vocal Performance",
    "Best Small Ensemble Performance (With Or Without Conductor)",
    "Best Classical Performance - Instrumental Soloist Or Soloists (With Or Without Orchestra)",
    "Most Promising New Classical Recording Artist",
    "Best Classical Performance - Vocal Soloist (With Or Without Orchestra)",
    "Best New Classical Artist",
    "Best Classical Vocal Soloist",
    "Best Performance - Instrumental Soloist Or Soloists (With Or Without Orchestra)",
    "Best Classical Performance - Vocal Soloist"
]

both_filtered = both_null_values[both_null_values["category"].isin(categories)]
both_filtered.head()

Unnamed: 0,year,category,nominee,artist,workers,winner
2382,1999,Best Small Ensemble Performance (With Or Witho...,"Colors Of Love - Works Of Thomas, Stucky, Tave...",,,True
2475,1998,Best Small Ensemble Performance (With Or Witho...,Reich: Music For 18 Musicians,,,True
2570,1997,Best Small Ensemble Performance (With Or Witho...,"Hindemith: Kammermusik No. 1 With Finale 1921,...",,,True
2571,1997,Best Classical Vocal Performance,"An Italian Songbook - Works Of Bellini, Donize...",,,True
2658,1996,Best Small Ensemble Performance (With Or Witho...,Boulez: ...Explosante-Fixe...,,,True


In [21]:
both_null_values = both_null_values.drop(both_filtered.index, axis=0)
grammy_df = grammy_df.drop(both_filtered.index, axis=0)

In [22]:
grammy_df.loc[both_null_values.index, "artist"] = both_null_values["nominee"]

In [23]:
grammy_df.loc[grammy_df["artist"].isna() & grammy_df["workers"].isna()]

Unnamed: 0,year,category,nominee,artist,workers,winner


In [24]:
artist_null = grammy_df.loc[grammy_df["artist"].isna()]
artist_null.shape

(1654, 6)

In [25]:
artist_null_sample = artist_null.head()
artist_null_sample

Unnamed: 0,year,category,nominee,artist,workers,winner
16,2019,Song Of The Year,Bad Guy,,"Billie Eilish O'Connell & Finneas O'Connell, s...",True
17,2019,Song Of The Year,Always Remember Us This Way,,"Natalie Hemby, Lady Gaga, Hillary Lindsey & Lo...",True
18,2019,Song Of The Year,Bring My Flowers Now,,"Brandi Carlile, Phil Hanseroth, Tim Hanseroth ...",True
19,2019,Song Of The Year,Hard Place,,"Ruby Amanfu, Sam Ashworth, D. Arcelious Harris...",True
20,2019,Song Of The Year,Lover,,"Taylor Swift, songwriter (Taylor Swift)",True


In [26]:
for i in artist_null_sample.index:
    print(artist_null_sample.loc[i, "workers"])

Billie Eilish O'Connell & Finneas O'Connell, songwriters (Billie Eilish)
Natalie Hemby, Lady Gaga, Hillary Lindsey & Lori McKenna, songwriters (Lady Gaga)
Brandi Carlile, Phil Hanseroth, Tim Hanseroth & Tanya Tucker, songwriters (Tanya Tucker)
Ruby Amanfu, Sam Ashworth, D. Arcelious Harris, H.E.R. & Rodney Jerkins, songwriters (H.E.R.)
Taylor Swift, songwriter (Taylor Swift)


In [27]:
import re

def extract_artist(workers):
    match = re.search(r'\((.*?)\)', workers)
    if match:
        return match.group(1)
    return None

grammy_df["artist"] = (grammy_df.apply
    (lambda row:
        extract_artist(row["workers"])
        if pd.isna(row["artist"])
            else row["artist"], axis=1))

In [28]:
grammy_df.iloc[16:20]

Unnamed: 0,year,category,nominee,artist,workers,winner
16,2019,Song Of The Year,Bad Guy,Billie Eilish,"Billie Eilish O'Connell & Finneas O'Connell, s...",True
17,2019,Song Of The Year,Always Remember Us This Way,Lady Gaga,"Natalie Hemby, Lady Gaga, Hillary Lindsey & Lo...",True
18,2019,Song Of The Year,Bring My Flowers Now,Tanya Tucker,"Brandi Carlile, Phil Hanseroth, Tim Hanseroth ...",True
19,2019,Song Of The Year,Hard Place,H.E.R.,"Ruby Amanfu, Sam Ashworth, D. Arcelious Harris...",True


In [29]:

grammy_df.isna().sum()

year           0
category       0
nominee        0
artist       288
workers     2156
winner         0
dtype: int64

In [30]:
grammy_df = grammy_df.dropna(subset=["artist"])

In [31]:

grammy_df = grammy_df.drop(columns=["workers"])

In [32]:
grammy_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4488 entries, 0 to 4807
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   year      4488 non-null   int64 
 1   category  4488 non-null   object
 2   nominee   4488 non-null   object
 3   artist    4488 non-null   object
 4   winner    4488 non-null   bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 179.7+ KB


In [33]:
grammy_df["artist"].value_counts()

artist
                                                                                                          93
Various Artists                                                                                           43
Chicago Symphony Orchestra                                                                                29
John Williams                                                                                             23
u2                                                                                                        18
                                                                                                          ..
Lindsey Lang, Matthew Gladden, Pamela Williamson, Rebecca Lloyd & Sarah Tannehill; Kansas City Chorale     1
stan freberg                                                                                               1
Robert Preston, Barbara Cook, David Burns, Eddie Hodges, Pert Kelton, Helen Raymond                        1
Nelson Riddl

In [34]:
grammy_df['primary_artist'] = grammy_df['artist'].str.split(';').str[0].str.strip()
grammy_df['featured_artists'] = grammy_df['artist'].str.split(';').str[1:].apply(
    lambda x: [a.strip() for a in x if a.strip()] if isinstance(x, list) else None
)

In [35]:
print(grammy_df["category"].unique())

['Record Of The Year' 'Album Of The Year' 'Song Of The Year'
 'Best New Artist' 'Best Pop Solo Performance'
 'Best Pop Duo/Group Performance' 'Best Traditional Pop Vocal Album'
 'Best Pop Vocal Album' 'Best Dance Recording'
 'Best Dance/Electronic Album' 'Best Contemporary Instrumental Album'
 'Best Rock Performance' 'Best Metal Performance' 'Best Rock Song'
 'Best Rock Album' 'Best Alternative Music Album' 'Best R&B Performance'
 'Best Traditional R&B Performance' 'Best R&B Song'
 'Best Urban Contemporary Album' 'Best R&B Album' 'Best Rap Performance'
 'Best Rap/Sung Performance' 'Best Rap Song' 'Best Rap Album'
 'Best Country Solo Performance' 'Best Country Duo/Group Performance'
 'Best Country Song' 'Best Country Album' 'Best New Age Album'
 'Best Jazz Vocal Album' 'Best Jazz Instrumental Album'
 'Best Large Jazz Ensemble Album' 'Best Latin Jazz Album'
 'Best Engineered Album, Classical' 'Best Gospel Album'
 'Best Contemporary Christian Music Album' 'Best Roots Gospel Album'
 'Best 

In [36]:
genre_mapping = {
    r'pop|dance|electronic': 'pop',
    r'rock|metal|alternative': 'rock',
    r'rap|hip.?hop': 'hiphop',
    r'r&b|soul': 'r&b',
    r'jazz': 'jazz',
    r'country': 'country',
    r'latin': 'latin'
}

def map_grammy_genre(category):
    for pattern, genre in genre_mapping.items():
        if re.search(pattern, category, re.IGNORECASE):
            return genre
    return 'other'

grammy_df['award_genre'] = grammy_df['category'].apply(map_grammy_genre)

In [38]:
def classify_category(category):
    category = category.lower()
    if 'album' in category:
        return 'album'
    elif any(kw in category for kw in ['song', 'record', 'performance']):
        return 'track'
    elif 'artist' in category:
        return 'artist'
    return 'other'

grammy_df['award_type'] = grammy_df['category'].apply(classify_category)

In [39]:
grammy_df['clean_track_name'] = grammy_df['nominee'].where(
    grammy_df['award_type'] == 'track',
    None
).str.replace(r'[^\w\s]', '', regex=True)

In [43]:
grammy_df.head()

Unnamed: 0,year,category,nominee,artist,winner,primary_artist,featured_artists,award_genre,award_type,clean_track_name
0,2019,Record Of The Year,Bad Guy,billie eilish,True,billie eilish,[],other,track,Bad Guy
1,2019,Record Of The Year,"Hey, Ma",bon iver,True,bon iver,[],other,track,Hey Ma
2,2019,Record Of The Year,7 rings,ariana grande,True,ariana grande,[],other,track,7 rings
3,2019,Record Of The Year,Hard Place,h.e.r.,True,h.e.r.,[],other,track,Hard Place
4,2019,Record Of The Year,Talk,khalid,True,khalid,[],other,track,Talk


In [40]:
grammy_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4488 entries, 0 to 4807
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              4488 non-null   int64 
 1   category          4488 non-null   object
 2   nominee           4488 non-null   object
 3   artist            4488 non-null   object
 4   winner            4488 non-null   bool  
 5   primary_artist    4488 non-null   object
 6   featured_artists  4488 non-null   object
 7   award_genre       4488 non-null   object
 8   award_type        4488 non-null   object
 9   clean_track_name  2332 non-null   object
dtypes: bool(1), int64(1), object(8)
memory usage: 355.0+ KB


In [45]:
duplicates = grammy_df.duplicated(
    subset=['year', 'primary_artist', 'clean_track_name','category'], 
    keep=False
)
print(f"Entradas duplicadas potenciales: {duplicates.sum()}")

Entradas duplicadas potenciales: 12


aqui termina lo que copie :(

In [20]:
null_df = grammy_df[grammy_df["artist"].isnull()]
null_df['category'].unique()


array(['Song Of The Year', 'Best New Artist', 'Best Rock Song',
       'Best R&B Song', 'Best Rap Song', 'Best Country Song',
       'Best Contemporary Classical Composition',
       'Best Improvised Jazz Solo', 'Best Engineered Album, Classical',
       'Best Gospel Performance/Song',
       'Best Contemporary Christian Music Performance/Song',
       'Best American Roots Song', 'Best Musical Theater Album',
       'Best Classical Compendium',
       'Best Score Soundtrack For Visual Media',
       'Best Song Written For Visual Media',
       'Best Instrumental Composition',
       'Best Arrangement, Instrumental or A Cappella',
       'Best Arrangement, Instruments and Vocals',
       'Best Recording Package',
       'Best Boxed Or Special Limited Edition Package',
       'Best Album Notes', 'Best Historical Album',
       'Best Engineered Album, Non-Classical',
       'Producer Of The Year, Non-Classical', 'Best Remixed Recording',
       'Best Immersive Audio Album', 'Producer Of T

In [None]:
palabras_clave = ['artist', 'performance', 'vocal', 'singer', 'duo', 'group']
referidas_a_artistas = [cat for cat in null_df['category'].unique() 
                        if any(palabra in cat.lower() for palabra in palabras_clave)]

print(referidas_a_artistas)


['Best New Artist', 'Best Arrangement, Instruments and Vocals', 'Best Classical Solo Vocal Album', 'Best Instrumental Arrangement Accompanying Vocalist(s)', 'Best Classical Vocal Solo', 'Best Instrumental Arrangement Accompanying Vocal(s)', 'Best Classical Vocal Performance', 'Best Classical Vocal Soloist Performance', 'Best Instrumental Arrangement Accompanying A Vocalist(s)', 'Best Instrumental Arrangement With Accompanying Vocal(s)', 'Best Instrumental Arrangement With Accompanying Vocals', 'Best Classical Vocal Soloist', 'Best Instrumental Arrangement Accompanying Vocals', 'Best Vocal Arrangement For Two Or More Voices', 'Best New Classical Artist', 'Best Arrangement Accompanying Vocalist(s)', 'Best Arrangement Accompanying Vocals', 'Best Arrangement Accompanying Vocal(s)', 'Best New Artist Of The Year', 'Best Arrangement For Voices (Duo, Group Or Chorus)', 'Best Arrangement Accompanying Vocalists', 'Best Classical Performance - Vocal Soloist', 'Best Vocal Soloist Performance, Clas

In [34]:
categorias_artisticas = [
    "Best New Artist", "Best Classical Solo Vocal Album", "Best Classical Vocal Solo",
    "Best Classical Vocal Performance", "Best Classical Vocal Soloist Performance",
    "Best Classical Vocal Soloist", "Best New Classical Artist", "Best New Artist Of The Year",
    "Best Classical Performance - Vocal Soloist", "Best Vocal Soloist Performance, Classical",
    "Best Vocal Soloist Performance", "Best New Artist Of 1964", 
    "Best Classical Vocal Soloist Performance (With Or Without Orchestra)",
    "Best Vocal Soloist Performance (With Or Without Orchestra)",
    "Most Promising New Classical Recording Artist", "Best New Artist Of 1963",
    "Best New Artist Of 1962", "Best New Artist Of 1961", 
    "Best Classical Performance - Vocal Soloist (With Or Without Orchestra)",
    "Best New Artist Of 1960", "Best New Artist Of 1959"
]

#grammy_df['artist'] = grammy_df.apply(lambda row: row['nominee'] if pd.isna(row['artist']) and row['category'] in categorias_artisticas else row['artist'], axis=1)

In [39]:
df = grammy_df[grammy_df["category"].isin(categorias_artisticas)]
df["category"].value_counts()

category
Best New Artist                                                           51
Best Classical Vocal Soloist Performance                                  22
Best Classical Vocal Performance                                          20
Best Classical Solo Vocal Album                                           11
Best New Artist Of The Year                                                9
Best Classical Performance - Vocal Soloist (With Or Without Orchestra)     5
Best Classical Vocal Solo                                                  3
Most Promising New Classical Recording Artist                              3
Best New Classical Artist                                                  1
Best Classical Performance - Vocal Soloist                                 1
Best Vocal Soloist Performance, Classical                                  1
Best Vocal Soloist Performance                                             1
Best Classical Vocal Soloist                                       

In [41]:
categories = [
    "Best Classical Vocal Soloist Performance",
    "Best Classical Vocal Performance",
    "Best Small Ensemble Performance (With Or Without Conductor)",
    "Best Classical Performance - Instrumental Soloist Or Soloists (With Or Without Orchestra)",
    "Most Promising New Classical Recording Artist",
    "Best Classical Performance - Vocal Soloist (With Or Without Orchestra)",
    "Best New Classical Artist",
    "Best Classical Vocal Soloist",
    "Best Performance - Instrumental Soloist Or Soloists (With Or Without Orchestra)",
    "Best Classical Performance - Vocal Soloist"
]

In [42]:
df_check = df[df["category"].isin(categories)]
df_check.head()

Unnamed: 0,year,category,nominee,artist,workers,winner
1213,2010,Best Classical Vocal Performance,sacrificium,,"Cecilia Bartoli, soloist; Arend Prohmann, prod...",True
1322,2009,Best Classical Vocal Performance,verismo arias,,"Renée Fleming, soloist; David Frost, producer;...",True
1433,2008,Best Classical Vocal Performance,corigliano: mr. tambourine man: seven poems of...,,"Hila Plitmann, soloist; John Corigliano & Tim ...",True
1543,2007,Best Classical Vocal Performance,lorraine hunt lieberson sings peter lieberson:...,,"Lorraine Hunt Lieberson, soloist; Dirk Sobotka...",True
1654,2006,Best Classical Vocal Performance,rilke songs,,"Lorraine Hunt Lieberson, soloist",True


In [30]:
def imputar_artist(row):
    if pd.isna(row['artist']) and row['category'] in categorias_artisticas:
        return row['nominee']
    return row['artist']

grammy_df['artist'] = grammy_df.apply(imputar_artist, axis=1)


NameError: name 'categorias_artisticas' is not defined

In [34]:
print("\nMissing Values per Column:")
print(grammy_df.isnull().sum().sort_values())


Missing Values per Column:
year        0
category    0
nominee     0
artist      0
winner      0
dtype: int64
