In [None]:
import json
import os
import re
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine

In [2]:
# Step up one directory so the relative paths used later (credentials.json,
# data/clean/...) resolve against the parent folder.
# NOTE: not idempotent - each re-run of this cell climbs one more level.
os.chdir(os.pardir)
print(os.getcwd())

c:\Users\valen\Desktop\etl_workshop002


In [3]:
# Read the database connection settings from the JSON file in the working directory.
with open("credentials.json", "r", encoding="utf-8") as file:
    credentials = json.load(file)

db_host = credentials["db_host"]
db_name = credentials["db_name"]
db_user = credentials["db_user"]
db_password = credentials["db_password"]

# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot be misread as URL delimiters when SQLAlchemy parses the string.
# safe='' also encodes '/', and spaces become %20 (never '+').
conn = create_engine(
    f"postgresql://{quote(str(db_user), safe='')}:{quote(str(db_password), safe='')}"
    f"@{db_host}:5432/{db_name}?client_encoding=utf8"
)


In [4]:
query = "SELECT * FROM grammys_raw_data;"
grammy_df = pd.read_sql(query, conn)

grammy_df.head()

Unnamed: 0,year,title,published_at,updated_at,category,nominee,artist,workers,img,winner
0,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Bad Guy,Billie Eilish,"Finneas O'Connell, producer; Rob Kinelski & Fi...",https://www.grammy.com/sites/com/files/styles/...,True
1,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,"Hey, Ma",Bon Iver,"BJ Burton, Brad Cook, Chris Messina & Justin V...",https://www.grammy.com/sites/com/files/styles/...,True
2,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,7 rings,Ariana Grande,"Charles Anderson, Tommy Brown, Michael Foster ...",https://www.grammy.com/sites/com/files/styles/...,True
3,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Hard Place,H.E.R.,"Rodney “Darkchild” Jerkins, producer; Joseph H...",https://www.grammy.com/sites/com/files/styles/...,True
4,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Talk,Khalid,"Disclosure & Denis Kosiak, producers; Ingmar C...",https://www.grammy.com/sites/com/files/styles/...,True


In [5]:
grammy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4810 entries, 0 to 4809
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   year          4810 non-null   int64 
 1   title         4810 non-null   object
 2   published_at  4810 non-null   object
 3   updated_at    4810 non-null   object
 4   category      4810 non-null   object
 5   nominee       4804 non-null   object
 6   artist        2970 non-null   object
 7   workers       2620 non-null   object
 8   img           3443 non-null   object
 9   winner        4810 non-null   bool  
dtypes: bool(1), int64(1), object(8)
memory usage: 343.0+ KB


In [6]:
grammy_df.describe()

Unnamed: 0,year
count,4810.0
mean,1995.566944
std,17.14972
min,1958.0
25%,1983.0
50%,1998.0
75%,2010.0
max,2019.0


In [7]:
grammy_df.describe(include="object")

Unnamed: 0,title,published_at,updated_at,category,nominee,artist,workers,img
count,4810,4810,4810,4810,4804,2970,2620,3443
unique,62,4,10,638,4131,1658,2366,1463
top,62nd Annual GRAMMY Awards (2019),2017-11-28T00:03:45-08:00,2019-09-10T01:08:19-07:00,Song Of The Year,Robert Woods,(Various Artists),"John Williams, composer (John Williams)",https://www.grammy.com/sites/com/files/styles/...
freq,433,4205,778,70,7,66,20,26


In [8]:
grammy_df.describe(include="boolean")

Unnamed: 0,winner
count,4810
unique,1
top,True
freq,4810


In [9]:
print(f"Dataset Shape: {grammy_df.shape}")
print("\nMissing Values per Column:")
print(grammy_df.isnull().sum().sort_values())

Dataset Shape: (4810, 10)

Missing Values per Column:
year               0
title              0
published_at       0
updated_at         0
category           0
winner             0
nominee            6
img             1367
artist          1840
workers         2190
dtype: int64


drop columns

In [10]:
# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
grammy_df = grammy_df.drop(
    columns=['title', 'published_at', 'updated_at', 'img'],
    errors="ignore",
)

Rename columns

In [11]:
# Rename 'winner' to 'grammy_nominated' in one step. Unlike copying the column
# and dropping the original in place, rename() is a no-op when the cell is run
# again, and the column keeps its position (it is already the last one).
grammy_df = grammy_df.rename(columns={"winner": "grammy_nominated"})

In [12]:
grammy_df[grammy_df["nominee"].isnull()]

Unnamed: 0,year,category,nominee,artist,workers,grammy_nominated
2274,2000,"Remixer of the Year, Non-Classical",,,,True
2372,1999,"Remixer Of The Year, Non-Classical",,,,True
2464,1998,"Remixer Of The Year, Non-classical",,,,True
2560,1997,"Remixer Of The Year, Non-Classical",,,,True
4527,1965,Best New Country & Western Artist,,,,True
4574,1964,Best New Country & Western Artist Of 1964,,,,True


Drop the rows with a null nominee: they cannot be imputed due to lack of data

Drop nominee nulls

In [13]:
# Keep only the rows that actually name a nominee.
grammy_df = grammy_df.dropna(subset=['nominee'])

In [14]:
print(f"Number of duplicates: {grammy_df.duplicated().sum()}")

Number of duplicates: 0


In [15]:
print(grammy_df.duplicated(subset=['year', 'category', 'nominee']).sum())  

0


In [16]:
print("\nMissing Values per Column:")
print(grammy_df.isnull().sum().sort_values())


Missing Values per Column:
year                   0
category               0
nominee                0
grammy_nominated       0
artist              1834
workers             2184
dtype: int64


Se buscan nulos en artista y workers al mismo tiempo para ver si se pueden imputar datos mediante la columna nominee

In [17]:
# Rows where neither 'artist' nor 'workers' is filled in.
both_null = grammy_df[grammy_df[["artist", "workers"]].isna().all(axis=1)]
both_null.head()

Unnamed: 0,year,category,nominee,artist,workers,grammy_nominated
24,2019,Best New Artist,Billie Eilish,,,True
25,2019,Best New Artist,Black Pumas,,,True
26,2019,Best New Artist,Lil Nas X,,,True
27,2019,Best New Artist,Lizzo,,,True
28,2019,Best New Artist,Maggie Rogers,,,True


In [18]:
both_null["category"].value_counts()

category
Best New Artist                                                                              50
Producer Of The Year, Non-Classical                                                          22
Producer Of The Year, Classical                                                              22
Classical Producer Of The Year                                                               18
Producer Of The Year (Non-Classical)                                                         10
Producer Of The Year                                                                         10
Best New Artist Of The Year                                                                   9
Best Classical Vocal Soloist Performance                                                      7
Best Classical Vocal Performance                                                              4
Best Small Ensemble Performance (With Or Without Conductor)                                   4
Best Classical Performance - In

Categorias que podrian no referirse a artistas

In [19]:
# Categories, taken from the rows with neither artist nor workers, that are
# singled out for separate handling. In the preview of the filtered rows the
# 'nominee' looks like a work title rather than a performer name -
# TODO confirm this holds for every category listed here.
categories = [
    "Best Classical Vocal Soloist Performance",
    "Best Classical Vocal Performance",
    "Best Small Ensemble Performance (With Or Without Conductor)",
    "Best Classical Performance - Instrumental Soloist Or Soloists (With Or Without Orchestra)",
    "Most Promising New Classical Recording Artist",
    "Best Classical Performance - Vocal Soloist (With Or Without Orchestra)",
    "Best New Classical Artist",
    "Best Classical Vocal Soloist",
    "Best Performance - Instrumental Soloist Or Soloists (With Or Without Orchestra)",
    "Best Classical Performance - Vocal Soloist"
]

# Subset of both_null whose category is one of the names listed above.
both_null_filtered = both_null[both_null["category"].isin(categories)]
both_null_filtered.head()

Unnamed: 0,year,category,nominee,artist,workers,grammy_nominated
2382,1999,Best Small Ensemble Performance (With Or Witho...,"Colors Of Love - Works Of Thomas, Stucky, Tave...",,,True
2475,1998,Best Small Ensemble Performance (With Or Witho...,Reich: Music For 18 Musicians,,,True
2570,1997,Best Small Ensemble Performance (With Or Witho...,"Hindemith: Kammermusik No. 1 With Finale 1921,...",,,True
2571,1997,Best Classical Vocal Performance,"An Italian Songbook - Works Of Bellini, Donize...",,,True
2658,1996,Best Small Ensemble Performance (With Or Witho...,Boulez: ...Explosante-Fixe...,,,True


Se eliminan los nulos que no podemos imputar mediante nominee

In [20]:
both_null = both_null.drop(both_null_filtered.index, axis=0)
grammy_df = grammy_df.drop(both_null_filtered.index, axis=0)

Imputamos artistas mediante nominee

In [21]:
grammy_df.loc[both_null.index, "artist"] = both_null["nominee"]

se valida la info y se verifica que ya no hay nulos para estas dos columnas al mismo tiempo

In [22]:
grammy_df.loc[grammy_df["artist"].isna() & grammy_df["workers"].isna()]

Unnamed: 0,year,category,nominee,artist,workers,grammy_nominated


Se valida el numero de nulos por artista

In [23]:
artist_null = grammy_df.loc[grammy_df["artist"].isna()]
artist_null.shape

(1654, 6)

In [24]:
artist_null_sample = artist_null.head()
artist_null_sample

Unnamed: 0,year,category,nominee,artist,workers,grammy_nominated
16,2019,Song Of The Year,Bad Guy,,"Billie Eilish O'Connell & Finneas O'Connell, s...",True
17,2019,Song Of The Year,Always Remember Us This Way,,"Natalie Hemby, Lady Gaga, Hillary Lindsey & Lo...",True
18,2019,Song Of The Year,Bring My Flowers Now,,"Brandi Carlile, Phil Hanseroth, Tim Hanseroth ...",True
19,2019,Song Of The Year,Hard Place,,"Ruby Amanfu, Sam Ashworth, D. Arcelious Harris...",True
20,2019,Song Of The Year,Lover,,"Taylor Swift, songwriter (Taylor Swift)",True


Se observa que en los workers a veces tambien esta el artista

In [25]:
for i in artist_null_sample.index:
    print(artist_null_sample.loc[i, "workers"])

Billie Eilish O'Connell & Finneas O'Connell, songwriters (Billie Eilish)
Natalie Hemby, Lady Gaga, Hillary Lindsey & Lori McKenna, songwriters (Lady Gaga)
Brandi Carlile, Phil Hanseroth, Tim Hanseroth & Tanya Tucker, songwriters (Tanya Tucker)
Ruby Amanfu, Sam Ashworth, D. Arcelious Harris, H.E.R. & Rodney Jerkins, songwriters (H.E.R.)
Taylor Swift, songwriter (Taylor Swift)


In [None]:
def extract_artist(workers):
    """Return the text inside the first parenthesised group of ``workers``.

    The sample credits printed earlier in the notebook often carry the
    performing artist in parentheses, e.g.
    ``"Taylor Swift, songwriter (Taylor Swift)"``.

    Args:
        workers: Credits string; may be NaN/None when credits are missing.

    Returns:
        The stripped text of the first ``(...)`` group, or ``None`` when
        ``workers`` is not a string, contains no parenthesised group, or the
        group is empty.
    """
    # Missing credits arrive as float NaN (or None); re.search would raise
    # TypeError on them, so treat anything that is not a string as "no artist".
    if not isinstance(workers, str):
        return None
    match = re.search(r'\((.*?)\)', workers)
    if match is None:
        return None
    # "()" / "( )" carries no name: return None so the row is handled like
    # every other missing artist instead of ending up with an empty string.
    return match.group(1).strip() or None

# Fill only the rows whose artist is still missing, using the name found in
# the parenthesised part of the credits; rows that already have an artist are
# left untouched.
grammy_df.loc[grammy_df["artist"].isna(), "artist"] = (
    grammy_df.loc[grammy_df["artist"].isna(), "workers"].apply(extract_artist)
)

Se verifica que se hayan imputado los datos 

In [27]:
grammy_df.iloc[16:20]

Unnamed: 0,year,category,nominee,artist,workers,grammy_nominated
16,2019,Song Of The Year,Bad Guy,Billie Eilish,"Billie Eilish O'Connell & Finneas O'Connell, s...",True
17,2019,Song Of The Year,Always Remember Us This Way,Lady Gaga,"Natalie Hemby, Lady Gaga, Hillary Lindsey & Lo...",True
18,2019,Song Of The Year,Bring My Flowers Now,Tanya Tucker,"Brandi Carlile, Phil Hanseroth, Tim Hanseroth ...",True
19,2019,Song Of The Year,Hard Place,H.E.R.,"Ruby Amanfu, Sam Ashworth, D. Arcelious Harris...",True


In [28]:

grammy_df.isna().sum()

year                   0
category               0
nominee                0
artist               288
workers             2156
grammy_nominated       0
dtype: int64

In [29]:
grammy_df = grammy_df.dropna(subset=["artist"])

función de limpieza de texto para normalizar los nombres de artistas en el DataFrame de los Grammy, especialmente en campos donde puede haber artistas principales y featured

In [30]:
def normalize_grammy_text(text):
    """Normalise an artist credit into a ';'-separated list of names.

    Steps, in order: unwrap parentheses (keeping their content), replace
    collaboration markers (feat / ft / featuring / with / w/), `` & `` and
    ``, `` with ``;``, remove double quotes, and trim both ends.

    Args:
        text: Raw artist value; missing values (NaN/None) are returned as-is.

    Returns:
        The normalised string, or the original missing value.
    """
    if pd.isna(text):
        return text

    text = str(text)
    # "(X)" -> "X": keep the parenthesised names, drop only the brackets.
    text = re.sub(r'\(([^)]*)\)', r'\1', text)
    # The optional '.' is placed after the word boundary so "feat." / "ft."
    # are consumed whole. With the boundary after the dot, the pattern could
    # not match before a space and left a stray '.' behind ("A ;. B").
    # "w/" gets its own alternative for the same reason: '/' followed by a
    # space is not a word boundary either.
    text = re.sub(r'\b(?:featuring|feat|ft|with)\b\.?|\bw/', ';', text, flags=re.IGNORECASE)
    text = re.sub(r'\s&\s', ';', text)
    text = re.sub(r',\s', ';', text)
    text = text.replace('"', '')
    return text.strip()

grammy_df['artist'] = grammy_df['artist'].apply(normalize_grammy_text)

Estando completamente satisfecha con los datos que se rescataron, se eliminan los nulos restantes

Una vez manejados los nulos de artista se elimina la columna trabajadores ya que no sera utilizada para mi analisis

In [31]:

grammy_df = grammy_df.drop(columns=["workers"])

In [32]:
grammy_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4488 entries, 0 to 4807
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              4488 non-null   int64 
 1   category          4488 non-null   object
 2   nominee           4488 non-null   object
 3   artist            4488 non-null   object
 4   grammy_nominated  4488 non-null   bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 179.7+ KB


In [33]:
grammy_df["artist"].value_counts()

artist
Various Artists                                        109
Chicago Symphony Orchestra                              29
John Williams                                           24
Henry Mancini                                           22
Bruce Springsteen                                       19
                                                      ... 
Burna Boy                                                1
Bokanté;Metropole Orkest Conducted By Jules Buckley      1
Percy Faith;His Orchestra                                1
Charles Munch;conductor                                  1
Robert Russell Bennett;conductor                         1
Name: count, Length: 2302, dtype: int64

Se dividen los artistas en artista principal y artistas de colaboracion

In [34]:
# Make sure every value in 'artist' is a string (just in case)
grammy_df['artist'] = grammy_df['artist'].astype(str)

# Primary artist: the entry before the first ';', trimmed
grammy_df['primary_artist'] = (
    grammy_df['artist']
    .str.split(';', n=1)
    .str.get(0)
    .str.strip()
)

# Extract the featured (non-primary) artists
def extract_featured(artist_str):
    """Return every artist after the first in a ';'-separated credit string.

    Args:
        artist_str: Normalised artist string such as ``"A;B;C"``.

    Returns:
        List of stripped, non-empty names that follow the first entry; an
        empty list when there are none or when ``artist_str`` is not a string.
    """
    # An explicit type check replaces the former bare ``except``, which also
    # swallowed unrelated errors (including KeyboardInterrupt).
    if not isinstance(artist_str, str):
        return []
    trailing = (part.strip() for part in artist_str.split(';')[1:])
    return [name for name in trailing if name]

grammy_df['featured_artists'] = grammy_df['artist'].apply(extract_featured)

grammy_df = grammy_df.drop(columns=["artist"])

In [35]:
value = "A Lot"
grammy_df[grammy_df["nominee"]== value]

Unnamed: 0,year,category,nominee,grammy_nominated,primary_artist,featured_artists
128,2019,Best Rap Song,A Lot,True,21 Savage,[J. Cole]


In [36]:
print(grammy_df["category"].unique())

['Record Of The Year' 'Album Of The Year' 'Song Of The Year'
 'Best New Artist' 'Best Pop Solo Performance'
 'Best Pop Duo/Group Performance' 'Best Traditional Pop Vocal Album'
 'Best Pop Vocal Album' 'Best Dance Recording'
 'Best Dance/Electronic Album' 'Best Contemporary Instrumental Album'
 'Best Rock Performance' 'Best Metal Performance' 'Best Rock Song'
 'Best Rock Album' 'Best Alternative Music Album' 'Best R&B Performance'
 'Best Traditional R&B Performance' 'Best R&B Song'
 'Best Urban Contemporary Album' 'Best R&B Album' 'Best Rap Performance'
 'Best Rap/Sung Performance' 'Best Rap Song' 'Best Rap Album'
 'Best Country Solo Performance' 'Best Country Duo/Group Performance'
 'Best Country Song' 'Best Country Album' 'Best New Age Album'
 'Best Jazz Vocal Album' 'Best Jazz Instrumental Album'
 'Best Large Jazz Ensemble Album' 'Best Latin Jazz Album'
 'Best Engineered Album, Classical' 'Best Gospel Album'
 'Best Contemporary Christian Music Album' 'Best Roots Gospel Album'
 'Best 

Se clasifican las categorías en una columna en donde se muestra el tipo de premio

In [37]:
def classify_category(category):
    """Map a Grammy category name to a coarse award class.

    Matching is case-insensitive and ordered: a name containing "album" is
    'album'; otherwise one containing "song", "record" or "performance" is
    'track'; otherwise one containing "artist" is 'artist'; anything else is
    'other'.
    """
    lowered = category.lower()
    # Order matters: the first rule with a keyword present in the name wins.
    rules = (
        ('album', ('album',)),
        ('track', ('song', 'record', 'performance')),
        ('artist', ('artist',)),
    )
    for award_class, keywords in rules:
        for keyword in keywords:
            if keyword in lowered:
                return award_class
    return 'other'

grammy_df['award_class'] = grammy_df['category'].apply(classify_category)

In [38]:
grammy_df.head()

Unnamed: 0,year,category,nominee,grammy_nominated,primary_artist,featured_artists,award_class
0,2019,Record Of The Year,Bad Guy,True,Billie Eilish,[],track
1,2019,Record Of The Year,"Hey, Ma",True,Bon Iver,[],track
2,2019,Record Of The Year,7 rings,True,Ariana Grande,[],track
3,2019,Record Of The Year,Hard Place,True,H.E.R.,[],track
4,2019,Record Of The Year,Talk,True,Khalid,[],track


In [41]:
grammy_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4488 entries, 0 to 4807
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              4488 non-null   int64 
 1   category          4488 non-null   object
 2   nominee           4488 non-null   object
 3   grammy_nominated  4488 non-null   bool  
 4   primary_artist    4488 non-null   object
 5   featured_artists  4488 non-null   object
 6   award_class       4488 non-null   object
dtypes: bool(1), int64(1), object(5)
memory usage: 249.8+ KB


In [42]:
duplicates = grammy_df.duplicated(
    subset=['year', 'nominee', 'primary_artist','category'], 
    keep=False
)
print(f"Potential duplicate entries: {duplicates.sum()}")

Potential duplicate entries: 0


In [44]:
def get_all_artists(row):
    """Return the row's primary artist followed by its featured artists.

    ``row`` must expose 'primary_artist' (a single name) and
    'featured_artists' (a list of names); a new list is returned and the
    row's own list is left unchanged.
    """
    lead = row['primary_artist']
    guests = row['featured_artists']
    return [lead] + guests

grammy_df['all_artists'] = grammy_df.apply(get_all_artists, axis=1)

# Step 3: explode so that every credited artist gets a row of its own
exploded = (
    grammy_df
    .explode('all_artists')
    .rename(columns={'all_artists': 'artist'})
    .astype({'artist': str})
)

# Step 4: one row per artist with the number of nominations and, for year,
# category and award class, the first value returned by mode()
artist_summary = (
    exploded
    .groupby('artist')
    .agg(
        nominations=pd.NamedAgg(column='artist', aggfunc='count'),
        year_with_most_nominations=pd.NamedAgg(
            column='year', aggfunc=lambda years: years.mode().iloc[0]
        ),
        most_common_category=pd.NamedAgg(
            column='category', aggfunc=lambda names: names.mode().iloc[0]
        ),
        most_common_award_class=pd.NamedAgg(
            column='award_class', aggfunc=lambda classes: classes.mode().iloc[0]
        ),
    )
    .reset_index()
)



In [45]:
# 5. Mostrar resumen
artist_summary.head(10)

Unnamed: 0,artist,nominations,year_with_most_nominations,most_common_category,most_common_award_class
0,112,1,1997,Best Rap Performance By A Duo Or Group,track
1,2 Chainz,1,2016,Best Rap Performance,track
2,2+2 Plus,1,1986,"Best Jazz Vocal Performance, Duo Or Group",track
3,21 Savage,2,2019,Best Rap Album,album
4,50 Cent,1,2009,Best Rap Performance By A Duo Or Group,track
5,5th Dimension,7,1967,Record Of The Year,track
6,A .L. Hines,1,1969,Best Score From An Original Cast Show Album,album
7,A Bad Think,1,2019,Best Immersive Audio Album,album
8,A Flock Of Seagulls,1,1982,Best Rock Instrumental Performance,track
9,A Great Big World,1,2014,Best Pop Duo/Group Performance,track


In [49]:
artist_summary.shape

(2792, 5)

In [50]:
artist_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2792 entries, 0 to 2791
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   artist                      2792 non-null   object
 1   nominations                 2792 non-null   int64 
 2   year_with_most_nominations  2792 non-null   int64 
 3   most_common_category        2792 non-null   object
 4   most_common_award_class     2792 non-null   object
dtypes: int64(2), object(3)
memory usage: 109.2+ KB


In [51]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing the summary.
os.makedirs("data/clean", exist_ok=True)
artist_summary.to_csv("data/clean/artist_grammy_nomination.csv", index=False)