# TD Transformation

## 1. Enrichissement

Import des données

In [1]:
import pandas as pd

# Columns dropped right after loading (flagged as unnecessary for this analysis)
UNUSED_COLUMNS = [
    "Unnamed: 0", "streamingEvent", "pinIconSrc", "watchLiveText", "isPlus",
    "callToActionText", "timezone", "displayRule", "locale", "date_range",
]

# Read data_NY.csv a single time and drop the unused columns in the same step
# (the file was previously parsed twice, the first result being discarded).
df = pd.read_csv("data_NY.csv").drop(columns=UNUSED_COLUMNS)

# Convert the event boundaries to datetime; values that cannot be parsed become NaT
df['startsAt'] = pd.to_datetime(df['startsAt'], errors='coerce')
df['endsAt'] = pd.to_datetime(df['endsAt'], errors='coerce')

df.head()

Unnamed: 0,artistImageSrc,properlySizedImageURL,callToActionRedirectUrl,fallbackImageUrl,artistName,venueName,title,locationText,eventUrl,artistUrl,rsvpCount,rsvpCountInt,startsAt,endsAt
0,https://photos.bandsintown.com/thumb/13985339....,https://media.bandsintown.com/110x110/13985339...,https://www.bandsintown.com/e/105307658-emily-...,https://assets.prod.bandsintown.com/images/hom...,Emily Beck,John Jay Homestead Farm Market,Emily Beck at the John Jay Homestead,"Katonah, NY",https://www.bandsintown.com/e/105307658-emily-...,https://www.bandsintown.com/a/14545332-emily-b...,0,0,2024-10-26 11:00:00,2024-10-26 13:00:00
1,https://photos.bandsintown.com/thumb/12163906....,https://media.bandsintown.com/110x110/12163906...,https://www.bandsintown.com/e/105660087-john-f...,https://assets.prod.bandsintown.com/images/hom...,John Finch,St Mary's Winfield,St. Mary's Winfield High School Retreat,"Queens, NY",https://www.bandsintown.com/e/105660087-john-f...,https://www.bandsintown.com/a/2115895-john-fin...,0,0,2024-10-26 12:00:00,NaT
2,https://photos.bandsintown.com/thumb/17578563....,https://media.bandsintown.com/110x110/17578563...,https://www.bandsintown.com/e/105917670-songbi...,https://assets.prod.bandsintown.com/images/hom...,Songbird Official Music,The Grove at Shrewsbury,The Grove at Shrewsbury West,"Shrewsbury, NJ",https://www.bandsintown.com/e/105917670-songbi...,https://www.bandsintown.com/a/15507503-songbir...,0,0,2024-10-26 12:00:00,2024-10-26 15:00:00
3,https://assets.prod.bandsintown.com/images/hom...,https://assets.prod.bandsintown.com/images/hom...,https://www.bandsintown.com/e/1032435913-stere...,https://assets.prod.bandsintown.com/images/hom...,StereoPhonic,John Golden Theatre,,"New York City, NY",https://www.bandsintown.com/e/1032435913-stere...,https://www.bandsintown.com/a/2449338-stereoph...,0,0,2024-10-26 13:00:00,NaT
4,https://assets.prod.bandsintown.com/images/hom...,https://assets.prod.bandsintown.com/images/hom...,https://www.bandsintown.com/e/1032519750-il-tr...,https://assets.prod.bandsintown.com/images/hom...,Il Trovatore,Metropolitan Opera,,"New York, NY",https://www.bandsintown.com/e/1032519750-il-tr...,https://www.bandsintown.com/a/29288-il-trovato...,0,0,2024-10-26 13:00:00,NaT


Durée de l'évènement

In [2]:
# Event duration in hours, rounded to one decimal place.
# Rows whose end time is NaT produce NaN.
df['event_duration'] = (
    df['endsAt']
    .sub(df['startsAt'])
    .dt.total_seconds()
    .div(3600)
    .round(1)
)

df.loc[:, ['startsAt', 'endsAt', 'event_duration']]

Unnamed: 0,startsAt,endsAt,event_duration
0,2024-10-26 11:00:00,2024-10-26 13:00:00,2.0
1,2024-10-26 12:00:00,NaT,
2,2024-10-26 12:00:00,2024-10-26 15:00:00,3.0
3,2024-10-26 13:00:00,NaT,
4,2024-10-26 13:00:00,NaT,
...,...,...,...
7064,2024-10-13 22:15:00,NaT,
7065,2024-10-13 22:30:00,2024-10-13 23:59:00,1.5
7066,2024-10-13 23:00:00,2024-10-13 23:30:00,0.5
7067,2024-10-13 23:00:00,NaT,


Week-end ou non

In [3]:
# Flag events that start on a Saturday (day 5) or a Sunday (day 6).
# A missing start date (NaT) compares as False.
df['is_weekend'] = df['startsAt'].dt.dayofweek.ge(5)

df.loc[:, ['startsAt', 'is_weekend']]

Unnamed: 0,startsAt,is_weekend
0,2024-10-26 11:00:00,True
1,2024-10-26 12:00:00,True
2,2024-10-26 12:00:00,True
3,2024-10-26 13:00:00,True
4,2024-10-26 13:00:00,True
...,...,...
7064,2024-10-13 22:15:00,True
7065,2024-10-13 22:30:00,True
7066,2024-10-13 23:00:00,True
7067,2024-10-13 23:00:00,True


Segmentation - popularité

In [4]:
# Quartile thresholds of the RSVP counts: first quartile, median, third quartile
Q1, Q2, Q3 = df['rsvpCountInt'].quantile([0.25, 0.50, 0.75])

# Quartile-based segmentation function
def segment_rsvp(rsvp_count, q1=None, q2=None, q3=None):
    """Map an RSVP count to a popularity label using quartile thresholds.

    Parameters
    ----------
    rsvp_count : number
        RSVP count of one event.
    q1, q2, q3 : number, optional
        Upper bounds (inclusive) of the 'Low', 'Medium' and 'High' segments.
        Any threshold left as None falls back to the module-level Q1, Q2
        or Q3 computed from the DataFrame.

    Returns
    -------
    str or pd.NA
        'Low', 'Medium', 'High' or 'Very High'; pd.NA when the count is missing.
    """
    # A missing count fails every comparison below and would otherwise be
    # reported as 'Very High'.
    if pd.isna(rsvp_count):
        return pd.NA

    if q1 is None:
        q1 = Q1
    if q2 is None:
        q2 = Q2
    if q3 is None:
        q3 = Q3

    # Each test is only reached when the previous (lower) bound was exceeded.
    if rsvp_count <= q1:
        return 'Low'
    if rsvp_count <= q2:
        return 'Medium'
    if rsvp_count <= q3:
        return 'High'
    return 'Very High'

# Label every event with its popularity segment in a new 'popularity' column
df['popularity'] = df['rsvpCountInt'].map(segment_rsvp)

Nationalité (récupération de la description Wikidata de l'artiste)

In [None]:
import requests
import pandas as pd

def get_artist_info_wikidata(artist_name):
    """Fetch the English Wikidata description of the first entity matching a name.

    Two calls are made to the Wikidata API: `wbsearchentities` to find
    candidate entities, then `wbgetentities` to read the description of the
    first candidate. No disambiguation is performed, so the description may
    belong to an unrelated entity that shares the artist's name.

    Parameters
    ----------
    artist_name : str
        Name to search for. Missing or blank names are not sent to the API.

    Returns
    -------
    pd.Series
        One-element Series holding the description, or pd.NA when the name is
        missing, nothing is found, or the request/response is unusable.
    """
    api_url = 'https://www.wikidata.org/w/api.php'
    timeout_seconds = 10  # avoid hanging the whole run on one stalled request

    # A NaN name would otherwise be formatted and searched as the text "nan".
    if pd.isna(artist_name) or not str(artist_name).strip():
        return pd.Series([pd.NA])

    try:
        # Search Wikidata for the artist. Passing `params` lets requests
        # URL-encode the name (characters such as '&' or '#' would otherwise
        # corrupt the query string).
        search_response = requests.get(
            api_url,
            params={
                'action': 'wbsearchentities',
                'search': artist_name,
                'language': 'en',
                'limit': 5,
                'format': 'json',
            },
            timeout=timeout_seconds,
        )
        search_response.raise_for_status()  # surface HTTP error statuses

        # Parse the payload once; tolerate a payload without a 'search' key.
        hits = search_response.json().get('search') or []
        if not hits:
            return pd.Series([pd.NA])

        # Entity identifier of the first hit (e.g. Q123456)
        artist_id = hits[0]['id']

        # Request the labels and descriptions of that entity
        entity_response = requests.get(
            api_url,
            params={
                'action': 'wbgetentities',
                'ids': artist_id,
                'props': 'labels|descriptions',
                'languages': 'en',
                'format': 'json',
            },
            timeout=timeout_seconds,
        )
        entity_response.raise_for_status()

        artist_data = entity_response.json().get('entities', {}).get(artist_id, {})

        # English description when available, pd.NA otherwise
        description = (
            artist_data.get('descriptions', {}).get('en', {}).get('value', pd.NA)
        )
        return pd.Series([description])

    except (requests.exceptions.RequestException, ValueError, KeyError):
        # Best effort: network failures, HTTP errors, invalid JSON or an
        # unexpected payload shape all yield a missing value for this artist.
        return pd.Series([pd.NA])

# Look up each distinct artist once instead of once per row: every lookup
# costs two HTTP requests and an artist may appear on several events.
# Missing names are skipped and end up with a missing description.
artist_names = df['artistName'].dropna().unique()
description_by_artist = {
    name: get_artist_info_wikidata(name).iloc[0] for name in artist_names
}
df['description'] = df['artistName'].map(description_by_artist)

# Save the enriched DataFrame to CSV so the lookup does not have to be re-run
df.to_csv("data_enrichies.csv", index=False)

In [2]:
import pandas as pd

# Reload the enriched data. CSV stores timestamps as text, so the two date
# columns are parsed again to restore their datetime dtype.
df = pd.read_csv('data_enrichies.csv', parse_dates=['startsAt', 'endsAt'])

In [3]:
import re

# Descriptions containing these terms are replaced by pd.NA
# (presumably entries about a name rather than an artist - confirm).
_NAME_TERMS = ('given name', 'family name')

# Researcher entries identified by an ORCID iD. The last character of an
# ORCID iD is a checksum that can be 'X' instead of a digit, so the final
# group accepts three digits followed by a digit or 'X'. The optional
# trailing letter accepted by the previous pattern is kept.
_ORCID_PATTERN = re.compile(
    r'researcher \(ORCID \d{4}-\d{4}-\d{4}-\d{3}[\dXx][A-Za-z]?\)'
)


def _drop_unwanted_description(value):
    """Return pd.NA for name/ORCID-researcher descriptions, else the value unchanged."""
    if not isinstance(value, str):
        # Missing values (NaN / pd.NA) pass through untouched
        return value
    if any(term in value for term in _NAME_TERMS):
        return pd.NA
    if _ORCID_PATTERN.search(value):
        return pd.NA
    return value


# Clean the 'description' column in a single pass
df['description'] = df['description'].apply(_drop_unwanted_description)

In [4]:
# Display the DataFrame after cleaning the 'description' column
df

Unnamed: 0,artistImageSrc,properlySizedImageURL,callToActionRedirectUrl,fallbackImageUrl,artistName,venueName,title,locationText,eventUrl,artistUrl,rsvpCount,rsvpCountInt,startsAt,endsAt,event_duration,is_weekend,popularity,description
0,https://photos.bandsintown.com/thumb/13985339....,https://media.bandsintown.com/110x110/13985339...,https://www.bandsintown.com/e/105307658-emily-...,https://assets.prod.bandsintown.com/images/hom...,Emily Beck,John Jay Homestead Farm Market,Emily Beck at the John Jay Homestead,"Katonah, NY",https://www.bandsintown.com/e/105307658-emily-...,https://www.bandsintown.com/a/14545332-emily-b...,0,0,2024-10-26 11:00:00,2024-10-26 13:00:00,2.0,True,Low,
1,https://photos.bandsintown.com/thumb/12163906....,https://media.bandsintown.com/110x110/12163906...,https://www.bandsintown.com/e/105660087-john-f...,https://assets.prod.bandsintown.com/images/hom...,John Finch,St Mary's Winfield,St. Mary's Winfield High School Retreat,"Queens, NY",https://www.bandsintown.com/e/105660087-john-f...,https://www.bandsintown.com/a/2115895-john-fin...,0,0,2024-10-26 12:00:00,,,True,Low,English diplomat; (1626-1682)
2,https://photos.bandsintown.com/thumb/17578563....,https://media.bandsintown.com/110x110/17578563...,https://www.bandsintown.com/e/105917670-songbi...,https://assets.prod.bandsintown.com/images/hom...,Songbird Official Music,The Grove at Shrewsbury,The Grove at Shrewsbury West,"Shrewsbury, NJ",https://www.bandsintown.com/e/105917670-songbi...,https://www.bandsintown.com/a/15507503-songbir...,0,0,2024-10-26 12:00:00,2024-10-26 15:00:00,3.0,True,Low,
3,https://assets.prod.bandsintown.com/images/hom...,https://assets.prod.bandsintown.com/images/hom...,https://www.bandsintown.com/e/1032435913-stere...,https://assets.prod.bandsintown.com/images/hom...,StereoPhonic,John Golden Theatre,,"New York City, NY",https://www.bandsintown.com/e/1032435913-stere...,https://www.bandsintown.com/a/2449338-stereoph...,0,0,2024-10-26 13:00:00,,,True,Low,method of sound reproduction using two audio c...
4,https://assets.prod.bandsintown.com/images/hom...,https://assets.prod.bandsintown.com/images/hom...,https://www.bandsintown.com/e/1032519750-il-tr...,https://assets.prod.bandsintown.com/images/hom...,Il Trovatore,Metropolitan Opera,,"New York, NY",https://www.bandsintown.com/e/1032519750-il-tr...,https://www.bandsintown.com/a/29288-il-trovato...,0,0,2024-10-26 13:00:00,,,True,Low,opera by Giuseppe Verdi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7064,https://photos.bandsintown.com/thumb/17491803....,https://media.bandsintown.com/110x110/17491803...,https://www.bandsintown.com/e/1032332991-t.i.-...,https://assets.prod.bandsintown.com/images/hom...,T.I.,City Winery New York City,,"New York City, NY",https://www.bandsintown.com/e/1032332991-t.i.-...,https://www.bandsintown.com/a/5-t.i.?came_from...,31,31,2024-10-13 22:15:00,,,True,Very High,American rapper and actor
7065,https://photos.bandsintown.com/thumb/17350358....,https://media.bandsintown.com/110x110/17350358...,https://www.bandsintown.com/e/105635483-the-da...,https://assets.prod.bandsintown.com/images/hom...,The Dang-it Bobbys,Pete's Candy Store,,"Brooklyn, NY",https://www.bandsintown.com/e/105635483-the-da...,https://www.bandsintown.com/a/490968-the-dang-...,0,0,2024-10-13 22:30:00,2024-10-13 23:59:00,1.5,True,Low,
7066,https://photos.bandsintown.com/thumb/12169306....,https://media.bandsintown.com/110x110/12169306...,https://www.bandsintown.com/e/105801395-mark-a...,https://assets.prod.bandsintown.com/images/hom...,Mark Anthony,TAO Downtown Nightclub,MARK ANTHONY @ TAO DOWNTOWN NYC,"New York City, NY",https://www.bandsintown.com/e/105801395-mark-a...,https://www.bandsintown.com/a/263263-mark-anth...,0,0,2024-10-13 23:00:00,2024-10-13 23:30:00,0.5,True,Low,Roman politician and general (83 BC – 30 BC)
7067,https://photos.bandsintown.com/thumb/7170653.jpeg,https://media.bandsintown.com/110x110/7170653....,https://www.bandsintown.com/e/1033084776-miss-...,https://assets.prod.bandsintown.com/images/hom...,Miss Jennifer,Somewhere Nowhere NYC - Lounge & Rooftop Pool,,"New York, NY",https://www.bandsintown.com/e/1033084776-miss-...,https://www.bandsintown.com/a/7870982-miss-jen...,1,1,2024-10-13 23:00:00,,,True,Medium,recipient of a New Zealand Suffrage Medal 1993
